update AUS Prospectus data point configurations

This commit is contained in:
Blade He 2025-03-04 16:52:06 -06:00
parent f4b4d00f58
commit d00820c14d
9 changed files with 90 additions and 33 deletions

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]},
"performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost", "recoverable costs", "expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
"change_recoverable_expanses": {"english": ["change recoverable expenses","change expense recovery","change expense recoveries","change recoverable expense"]}
}

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": "share_level",
"management_fee": "share_level",
"performance_fee": "share_level",
"performance_fee_costs": "share_level",
"buy_spread": "share_level",
"sell_spread": "share_level",
"administration_fees": "share_level",
"interposed_vehicle_performance_fee_cost": "share_level",
"benchmark_name": "fund_level",
"minimum_initial_investment": "fund_level",
"indirect_costs": "share_level",
"recoverable_expenses": "share_level",
"indirect_costs": "share_level"
"change_recoverable_expanses": "share_level"
}

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": "management fee and costs",
"management_fee": "management fee",
"performance_fee": "performance fee",
"performance_fee_costs": "performance fee costs",
"buy_spread": "buy spread",
"sell_spread": "sell spread",
"administration_fees": "administration fee",
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
"benchmark_name": "benchmark name",
"minimum_initial_investment": "minimum initial investment",
"indirect_costs": "indirect cost",
"recoverable_expenses": "recoverable expenses",
"indirect_costs": "indirect cost"
"change_recoverable_expanses": "change recoverable expenses"
}

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
"performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable cost", "recoverable costs", "expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
"change_recoverable_expanses": {"english": ["change recoverable expenses","change expense recovery","change expense recoveries","change recoverable expense"]}
}

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": "float",
"management_fee": "float",
"performance_fee": "float",
"performance_fee_costs": "float",
"buy_spread": "float",
"sell_spread": "float",
"administration_fees": "float",
"interposed_vehicle_performance_fee_cost": "float",
"benchmark_name": "text",
"minimum_initial_investment": "integer",
"indirect_costs": "float",
"recoverable_expenses": "float",
"indirect_costs": "float"
"change_recoverable_expanses": "float"
}

View File

@ -25,15 +25,15 @@
"management_fee_and_costs",
"management_fee",
"performance_fee",
"performance_fee_costs",
"buy_spread",
"sell_spread",
"administration_fees",
"interposed_vehicle_performance_fee_cost",
"benchmark_name",
"minimum_initial_investment",
"recoverable_expenses",
"indirect_costs"
"indirect_costs",
"change_recoverable_expanses",
"recoverable_expenses"
]
}
}

View File

@ -116,8 +116,9 @@
"date_of_last_performance_fee_restructure": "Date of last performance fee restructure is belong to date, the value should be date format. e.g. 12 August 2022",
"high_water_mark_type": "High water mark type is belong to text, the value should be text.",
"minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00",
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.",
"recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.",
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100."
"change_recoverable_expanses": "Change recoverable expenses is belong to percentage number, the value should be less than 100."
},
"special_rule": {
"management_fee_and_costs": [
@ -145,13 +146,22 @@
"{\"data\": []}",
"\n",
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
"With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"B.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"---Example Start---",
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}",
"\n",
"B.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
"The management_fee is the value of \"Management fee (% pa)\".",
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
"---Example Start---",
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
"\n",
"C. If only \"Management fees and costs\" is found, please output the same value for both data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
@ -536,35 +546,36 @@
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
"recoverable_expenses_value": [0.12, 0.05, 0.06],
"indirect_costs_value": [0.12, 0.16, 0.02]
"indirect_costs_value": [0.12, 0.16, 0.02],
"recoverable_expenses_value": [0.01, 0.05, 0.06],
"change_recoverable_expanses_value": [0.01, 0.02, 0.03]
},
"dp_reported_name" : {
"total_annual_dollar_based_charges": "Total annual dollar based charges",
"management_fee_and_costs": "Management fee and costs",
"management_fee": "Management fee",
"performance_fee": "Performance fee",
"performance_fee_costs": "Performance fee costs",
"buy_spread": "Buy spread",
"sell_spread": "Sell spread",
"administration_fees": "Administration fee",
"interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost",
"benchmark_name": "Benchmark name",
"minimum_initial_investment": "Minimum initial investment",
"indirect_costs": "Indirect cost",
"recoverable_expenses": "Recoverable expenses",
"change_recoverable_expanses": "Change recoverable expenses",
"establishment_fee": "Establishment fee",
"contribution_fee": "Contribution fee",
"withdrawal_fee": "Withdrawal fee",
"switching_fee": "Switching fee",
"activity_fee": "Activity fee",
"exit_fee": "Exit fee",
"administration_fees": "Administration fee",
"interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost",
"exit_fee": "Exit fee",
"additional_hurdle": "Additional hurdle",
"benchmark_name": "Benchmark name",
"reference_rate": "Reference rate",
"crystallisation_frequency": "Crystallisation frequency",
"date_of_last_hwm_reset": "Date of last hwm reset",
"date_of_last_performance_fee_restructure": "Date of last performance fee restructure",
"high_water_mark_type": "High-water mark type",
"minimum_initial_investment": "Minimum initial investment",
"recoverable_expenses": "Recoverable expenses",
"indirect_costs": "Indirect cost"
"high_water_mark_type": "High-water mark type"
}
},
"end": [

10
main.py
View File

@ -1504,8 +1504,8 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"]
re_run_extract_data = True
re_run_mapping_data = True
re_run_extract_data = False
re_run_mapping_data = False
force_save_total_data = True
doc_source = "aus_prospectus"
# doc_source = "emea_ar"
@ -1517,15 +1517,15 @@ if __name__ == "__main__":
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_29_documents_sample.txt"
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
)
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/29_documents/aus_prospectus_29_documents_mapping.xlsx"
# special_doc_id_list: list = ["441280757"]
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# special_doc_id_list: list = ["384508026"]
# special_doc_id_list: list = ["401212184"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"

View File

@ -0,0 +1,46 @@
377377369
397107472
401212184
409723592
411062815
412778803
414751292
462770987
471206458
391080133
391080140
410899007
420339794
441280757
446324179
454036250
384508026
530101994
550769189
550522985
539266893
539241700
539261734
550533961
506913190
539266814
521606716
553449663
528208796
539266817
539266874
539266880
526200514
523516443
526200513
521606755
557526129
540028470
531373053
544886057
557362556
557362553
520663234
527969661
541356150
555377021