diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index 32f0e73..9e5281d 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -3,13 +3,13 @@ "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]}, "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, - "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]}, - "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost", "recoverable costs", "expense recoveries"]}, - "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} + 
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, + "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, + "change_recoverable_expanses": {"english": ["change recoverable expenses","change expense recovery","change expense recoveries","change recoverable expense"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_level.json b/configuration/aus_prospectus/datapoint_level.json index f71e08d..036e792 100644 --- a/configuration/aus_prospectus/datapoint_level.json +++ b/configuration/aus_prospectus/datapoint_level.json @@ -3,13 +3,13 @@ "management_fee_and_costs": "share_level", "management_fee": "share_level", "performance_fee": "share_level", - "performance_fee_costs": "share_level", "buy_spread": "share_level", "sell_spread": "share_level", "administration_fees": "share_level", "interposed_vehicle_performance_fee_cost": "share_level", "benchmark_name": "fund_level", "minimum_initial_investment": "fund_level", + "indirect_costs": "share_level", "recoverable_expenses": "share_level", - "indirect_costs": "share_level" + "change_recoverable_expanses": "share_level" } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_name.json b/configuration/aus_prospectus/datapoint_name.json index 6fbfa2a..fd91a4a 100644 --- a/configuration/aus_prospectus/datapoint_name.json +++ b/configuration/aus_prospectus/datapoint_name.json @@ -3,13 +3,13 @@ "management_fee_and_costs": "management fee and costs", "management_fee": "management fee", "performance_fee": "performance fee", - "performance_fee_costs": "performance fee costs", "buy_spread": "buy spread", "sell_spread": "sell spread", "administration_fees": "administration fee", "interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost", "benchmark_name": "benchmark name", "minimum_initial_investment": "minimum 
initial investment", + "indirect_costs": "indirect cost", "recoverable_expenses": "recoverable expenses", - "indirect_costs": "indirect cost" + "change_recoverable_expanses": "change recoverable expanses" } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index bccfcc6..c0906c0 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ b/configuration/aus_prospectus/datapoint_reported_name.json @@ -3,13 +3,13 @@ "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]}, "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, - "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", 
"initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]}, - "recoverable_expenses": {"english": ["recoverable expenses", "recoverable cost", "recoverable costs", "expense recoveries"]}, - "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} + "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, + "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, + "change_recoverable_expanses": {"english": ["change recoverable expenses","change expense recovery","change expense recoveries","change recoverable expense"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_type.json b/configuration/aus_prospectus/datapoint_type.json index 4885434..d1ed4a1 100644 --- a/configuration/aus_prospectus/datapoint_type.json +++ b/configuration/aus_prospectus/datapoint_type.json @@ -3,13 +3,13 @@ "management_fee_and_costs": "float", "management_fee": "float", "performance_fee": "float", - "performance_fee_costs": "float", "buy_spread": "float", "sell_spread": "float", "administration_fees": "float", "interposed_vehicle_performance_fee_cost": "float", "benchmark_name": "text", "minimum_initial_investment": "integer", + "indirect_costs": "float", "recoverable_expenses": "float", - "indirect_costs": "float" + "change_recoverable_expanses": "float" } \ No newline at end of file diff --git a/configuration/aus_prospectus/domicile_datapoints.json b/configuration/aus_prospectus/domicile_datapoints.json index 04e48df..c4ff806 100644 --- a/configuration/aus_prospectus/domicile_datapoints.json +++ b/configuration/aus_prospectus/domicile_datapoints.json @@ -25,15 +25,15 @@ "management_fee_and_costs", "management_fee", "performance_fee", - "performance_fee_costs", "buy_spread", "sell_spread", "administration_fees", 
"interposed_vehicle_performance_fee_cost", "benchmark_name", "minimum_initial_investment", - "recoverable_expenses", - "indirect_costs" + "indirect_costs", + "change_recoverable_expanses", + "recoverable_expenses" ] } } \ No newline at end of file diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index bed2550..78de6d0 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -116,8 +116,9 @@ "date_of_last_performance_fee_restructure": "Date of last performance fee restructure is belong to date, the value should be date format. e.g. 12 August 2022", "high_water_mark_type": "High water mark type is belong to text, the value should be text.", "minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00", + "indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.", "recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.", - "indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100." + "change_recoverable_expanses": "Change recoverable expanses is belong to percentage number, the value should be less than 100." }, "special_rule": { "management_fee_and_costs": [ @@ -145,13 +146,22 @@ "{\"data\": []}", "\n", "B. 
If there are multiple Management fee and costs sub-columns, here is the rule: ", - "With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", + "B.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", "---Example Start---", "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n", "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}", "\n", + "B.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".", + "The management_fee is the value of \"Management fee (% pa)\".", + "The management_fee_and_costs is the value of \"Total management cost (% pa)\".", + "---Example Start---", + "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", + "---Example End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, 
\"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", + "\n", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", @@ -536,35 +546,36 @@ "date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"], "date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"], "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"], - "recoverable_expenses_value": [0.12, 0.05, 0.06], - "indirect_costs_value": [0.12, 0.16, 0.02] + "indirect_costs_value": [0.12, 0.16, 0.02], + "recoverable_expenses_value": [0.01, 0.05, 0.06], + "change_recoverable_expanses_value": [0.01, 0.02, 0.03] }, "dp_reported_name" : { "total_annual_dollar_based_charges": "Total annual dollar based charges", "management_fee_and_costs": "Management fee and costs", "management_fee": "Management fee", "performance_fee": "Performance fee", - "performance_fee_costs": "Performance fee costs", "buy_spread": "Buy spread", "sell_spread": "Sell spread", + "administration_fees": "Administration fee", + "interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost", + "benchmark_name": "Benchmark name", + "minimum_initial_investment": "Minimum initial investment", + "indirect_costs": "Indirect cost", + "recoverable_expenses": "Recoverable expenses", + "change_recoverable_expanses": "Change recoverable expanses", "establishment_fee": "Establishment fee", "contribution_fee": "Contribution fee", "withdrawal_fee": "Withdrawal fee", "switching_fee": "Switching fee", "activity_fee": "Activity fee", - "exit_fee": "Exit fee", - "administration_fees": 
"Administration fee", - "interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost", + "exit_fee": "Exit fee", "additional_hurdle": "Additional hurdle", - "benchmark_name": "Benchmark name", "reference_rate": "Reference rate", "crystallisation_frequency": "Crystallisation frequency", "date_of_last_hwm_reset": "Date of last hwm reset", "date_of_last_performance_fee_restructure": "Date of last performance fee restructure", - "high_water_mark_type": "High-water mark type", - "minimum_initial_investment": "Minimum initial investment", - "recoverable_expenses": "Recoverable expenses", - "indirect_costs": "Indirect cost" + "high_water_mark_type": "High-water mark type" } }, "end": [ diff --git a/main.py b/main.py index 834d5b4..1171af8 100644 --- a/main.py +++ b/main.py @@ -1504,8 +1504,8 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1517,15 +1517,15 @@ if __name__ == "__main__": # r"./sample_documents/aus_prospectus_17_documents_sample.txt" # ) document_sample_file = ( - r"./sample_documents/aus_prospectus_29_documents_sample.txt" + r"./sample_documents/aus_prospectus_46_documents_sample.txt" ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" - document_mapping_file = r"/data/aus_prospectus/basic_information/29_documents/aus_prospectus_29_documents_mapping.xlsx" - # special_doc_id_list: list 
= ["441280757"] + document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" + # special_doc_id_list: list = ["384508026"] # special_doc_id_list: list = ["401212184"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" diff --git a/sample_documents/aus_prospectus_46_documents_sample.txt b/sample_documents/aus_prospectus_46_documents_sample.txt new file mode 100644 index 0000000..b37d34f --- /dev/null +++ b/sample_documents/aus_prospectus_46_documents_sample.txt @@ -0,0 +1,46 @@ +377377369 +397107472 +401212184 +409723592 +411062815 +412778803 +414751292 +462770987 +471206458 +391080133 +391080140 +410899007 +420339794 +441280757 +446324179 +454036250 +384508026 +530101994 +550769189 +550522985 +539266893 +539241700 +539261734 +550533961 +506913190 +539266814 +521606716 +553449663 +528208796 +539266817 +539266874 +539266880 +526200514 +523516443 +526200513 +521606755 +557526129 +540028470 +531373053 +544886057 +557362556 +557362553 +520663234 +527969661 +541356150 +555377021 \ No newline at end of file