update AUS Prospectus data point configurations

This commit is contained in:
Blade He 2025-03-04 16:52:06 -06:00
parent f4b4d00f58
commit d00820c14d
9 changed files with 90 additions and 33 deletions

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]}, "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]}, "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]},
"performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]}, "benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost", "recoverable costs", "expense recoveries"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
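"change_recoverable_expanses": {"english": ["change recoverable expenses", "change recoverable expense", "change expense recovery", "change expense recoveries", "change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}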
} }

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": "share_level", "management_fee_and_costs": "share_level",
"management_fee": "share_level", "management_fee": "share_level",
"performance_fee": "share_level", "performance_fee": "share_level",
"performance_fee_costs": "share_level",
"buy_spread": "share_level", "buy_spread": "share_level",
"sell_spread": "share_level", "sell_spread": "share_level",
"administration_fees": "share_level", "administration_fees": "share_level",
"interposed_vehicle_performance_fee_cost": "share_level", "interposed_vehicle_performance_fee_cost": "share_level",
"benchmark_name": "fund_level", "benchmark_name": "fund_level",
"minimum_initial_investment": "fund_level", "minimum_initial_investment": "fund_level",
"indirect_costs": "share_level",
"recoverable_expenses": "share_level", "recoverable_expenses": "share_level",
"indirect_costs": "share_level" "change_recoverable_expanses": "share_level"
} }

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": "management fee and costs", "management_fee_and_costs": "management fee and costs",
"management_fee": "management fee", "management_fee": "management fee",
"performance_fee": "performance fee", "performance_fee": "performance fee",
"performance_fee_costs": "performance fee costs",
"buy_spread": "buy spread", "buy_spread": "buy spread",
"sell_spread": "sell spread", "sell_spread": "sell spread",
"administration_fees": "administration fee", "administration_fees": "administration fee",
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost", "interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
"benchmark_name": "benchmark name", "benchmark_name": "benchmark name",
"minimum_initial_investment": "minimum initial investment", "minimum_initial_investment": "minimum initial investment",
"indirect_costs": "indirect cost",
"recoverable_expenses": "recoverable expenses", "recoverable_expenses": "recoverable expenses",
"indirect_costs": "indirect cost" "change_recoverable_expanses": "change recoverable expanses"
} }

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]}, "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]}, "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
"performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]}, "benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable cost", "recoverable costs", "expense recoveries"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
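"change_recoverable_expanses": {"english": ["change recoverable expenses", "change recoverable expense", "change expense recovery", "change expense recoveries", "change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}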
} }

View File

@ -3,13 +3,13 @@
"management_fee_and_costs": "float", "management_fee_and_costs": "float",
"management_fee": "float", "management_fee": "float",
"performance_fee": "float", "performance_fee": "float",
"performance_fee_costs": "float",
"buy_spread": "float", "buy_spread": "float",
"sell_spread": "float", "sell_spread": "float",
"administration_fees": "float", "administration_fees": "float",
"interposed_vehicle_performance_fee_cost": "float", "interposed_vehicle_performance_fee_cost": "float",
"benchmark_name": "text", "benchmark_name": "text",
"minimum_initial_investment": "integer", "minimum_initial_investment": "integer",
"indirect_costs": "float",
"recoverable_expenses": "float", "recoverable_expenses": "float",
"indirect_costs": "float" "change_recoverable_expanses": "float"
} }

View File

@ -25,15 +25,15 @@
"management_fee_and_costs", "management_fee_and_costs",
"management_fee", "management_fee",
"performance_fee", "performance_fee",
"performance_fee_costs",
"buy_spread", "buy_spread",
"sell_spread", "sell_spread",
"administration_fees", "administration_fees",
"interposed_vehicle_performance_fee_cost", "interposed_vehicle_performance_fee_cost",
"benchmark_name", "benchmark_name",
"minimum_initial_investment", "minimum_initial_investment",
"recoverable_expenses", "indirect_costs",
"indirect_costs" "change_recoverable_expanses",
"recoverable_expenses"
] ]
} }
} }

View File

@ -116,8 +116,9 @@
"date_of_last_performance_fee_restructure": "Date of last performance fee restructure is belong to date, the value should be date format. e.g. 12 August 2022", "date_of_last_performance_fee_restructure": "Date of last performance fee restructure is belong to date, the value should be date format. e.g. 12 August 2022",
"high_water_mark_type": "High water mark type is belong to text, the value should be text.", "high_water_mark_type": "High water mark type is belong to text, the value should be text.",
"minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00", "minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00",
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.",
"recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.", "recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.",
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100." "change_recoverable_expanses": "Change recoverable expanses is belong to percentage number, the value should be less than 100."
}, },
"special_rule": { "special_rule": {
"management_fee_and_costs": [ "management_fee_and_costs": [
@ -145,13 +146,22 @@
"{\"data\": []}", "{\"data\": []}",
"\n", "\n",
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ", "B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
"With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", "B.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"---Example Start---", "---Example Start---",
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n", "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
"---Example End---", "---Example End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}",
"\n", "\n",
"B.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
"The management_fee is the value of \"Management fee (% pa)\".",
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
"---Example Start---",
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
"\n",
"C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---", "---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
@ -536,35 +546,36 @@
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"], "date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"], "date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"], "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
"recoverable_expenses_value": [0.12, 0.05, 0.06], "indirect_costs_value": [0.12, 0.16, 0.02],
"indirect_costs_value": [0.12, 0.16, 0.02] "recoverable_expenses_value": [0.01, 0.05, 0.06],
"change_recoverable_expanses_value": [0.01, 0.02, 0.03]
}, },
"dp_reported_name" : { "dp_reported_name" : {
"total_annual_dollar_based_charges": "Total annual dollar based charges", "total_annual_dollar_based_charges": "Total annual dollar based charges",
"management_fee_and_costs": "Management fee and costs", "management_fee_and_costs": "Management fee and costs",
"management_fee": "Management fee", "management_fee": "Management fee",
"performance_fee": "Performance fee", "performance_fee": "Performance fee",
"performance_fee_costs": "Performance fee costs",
"buy_spread": "Buy spread", "buy_spread": "Buy spread",
"sell_spread": "Sell spread", "sell_spread": "Sell spread",
"administration_fees": "Administration fee",
"interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost",
"benchmark_name": "Benchmark name",
"minimum_initial_investment": "Minimum initial investment",
"indirect_costs": "Indirect cost",
"recoverable_expenses": "Recoverable expenses",
"change_recoverable_expanses": "Change recoverable expanses",
"establishment_fee": "Establishment fee", "establishment_fee": "Establishment fee",
"contribution_fee": "Contribution fee", "contribution_fee": "Contribution fee",
"withdrawal_fee": "Withdrawal fee", "withdrawal_fee": "Withdrawal fee",
"switching_fee": "Switching fee", "switching_fee": "Switching fee",
"activity_fee": "Activity fee", "activity_fee": "Activity fee",
"exit_fee": "Exit fee", "exit_fee": "Exit fee",
"administration_fees": "Administration fee",
"interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost",
"additional_hurdle": "Additional hurdle", "additional_hurdle": "Additional hurdle",
"benchmark_name": "Benchmark name",
"reference_rate": "Reference rate", "reference_rate": "Reference rate",
"crystallisation_frequency": "Crystallisation frequency", "crystallisation_frequency": "Crystallisation frequency",
"date_of_last_hwm_reset": "Date of last hwm reset", "date_of_last_hwm_reset": "Date of last hwm reset",
"date_of_last_performance_fee_restructure": "Date of last performance fee restructure", "date_of_last_performance_fee_restructure": "Date of last performance fee restructure",
"high_water_mark_type": "High-water mark type", "high_water_mark_type": "High-water mark type"
"minimum_initial_investment": "Minimum initial investment",
"recoverable_expenses": "Recoverable expenses",
"indirect_costs": "Indirect cost"
} }
}, },
"end": [ "end": [

10
main.py
View File

@ -1504,8 +1504,8 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"] # special_doc_id_list = ["553242411"]
re_run_extract_data = True re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = False
force_save_total_data = True force_save_total_data = True
doc_source = "aus_prospectus" doc_source = "aus_prospectus"
# doc_source = "emea_ar" # doc_source = "emea_ar"
@ -1517,15 +1517,15 @@ if __name__ == "__main__":
# r"./sample_documents/aus_prospectus_17_documents_sample.txt" # r"./sample_documents/aus_prospectus_17_documents_sample.txt"
# ) # )
document_sample_file = ( document_sample_file = (
r"./sample_documents/aus_prospectus_29_documents_sample.txt" r"./sample_documents/aus_prospectus_46_documents_sample.txt"
) )
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/29_documents/aus_prospectus_29_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# special_doc_id_list: list = ["441280757"] # special_doc_id_list: list = ["384508026"]
# special_doc_id_list: list = ["401212184"] # special_doc_id_list: list = ["401212184"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"

View File

@ -0,0 +1,46 @@
377377369
397107472
401212184
409723592
411062815
412778803
414751292
462770987
471206458
391080133
391080140
410899007
420339794
441280757
446324179
454036250
384508026
530101994
550769189
550522985
539266893
539241700
539261734
550533961
506913190
539266814
521606716
553449663
528208796
539266817
539266874
539266880
526200514
523516443
526200513
521606755
557526129
540028470
531373053
544886057
557362556
557362553
520663234
527969661
541356150
555377021