diff --git a/configuration/aus_prospectus/replace_table_header.json b/configuration/aus_prospectus/replace_table_header.json index 7dfc383..587d193 100644 --- a/configuration/aus_prospectus/replace_table_header.json +++ b/configuration/aus_prospectus/replace_table_header.json @@ -30,6 +30,12 @@ ["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"], "replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n", "comments": ["item 0: document 411062815, page 17"] + }, + { + "regex_all_list": + ["\\nFund\\s*name\\s*Management\\s*fee\\s*Indirect\\s*costs\\s*Recoverable\\s*expenses[\\s\\S]*?performance.*\\s*fee\\s*Estimated\\s*other\\s*indirect\\s*costs\\s*\\n"], + "replace_text": "\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \n", + "comments": ["item 0: document 391080133, page 21"] } ] } \ No newline at end of file diff --git a/core/data_extraction.py b/core/data_extraction.py index 80486e6..12faf86 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -614,7 +614,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [4, 5]: + # if page_num not in [20]: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index bb08215..66e3f3f 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -171,11 +171,21 @@ "C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".", "The management_fee is the value of \"Management fee (% pa)\".", "The management_fee_and_costs is the value of \"Total management cost (% pa)\".", - "---Example Start---", + "---Example 1 Start---", "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", - "---Example End---", + "---Example 1 End---", "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", + "---Example 2 Start---", + "\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \nPathways 30 \n1.16% pa \n0.02% pa \n0.05% pa \n0.05% pa \nPathways 70 \n1.30% pa \n0.01% pa \n0.06% pa \n0.04% pa \n", + "---Example 2 End---", + "The management_fee_and_costs is the value of \"Management fee\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", + "The management_fee is the value of \"Management fee\".", + "The performance_fee_costs is the value of \"Estimated performance-related fee\".", + "The indirect_costs is the value of \"Estimated other indirect costs\".", + "The recoverable_expenses is the value of \"Recoverable expenses\".", + "The output should be:", + "{\"data\": [{\"fund name\": \"Pathways 30\", \"share name\": \"Pathways 30\", \"management_fee_and_costs\": 1.23, \"management_fee\": 1.16, \"recoverable_expenses\": 0.02, \"performance_fee_costs\": 0.05, \"indirect_costs\": 0.05}, {\"fund name\": \"Pathways 70\", \"share name\": \"Pathways 70\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.3, \"recoverable_expenses\": 0.01, \"performance_fee_costs\": 0.06, \"indirect_costs\": 0.04}]}", "\n", "D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", diff --git a/main.py b/main.py index 22894f9..7993804 100644 --- a/main.py +++ b/main.py @@ -1526,8 +1526,8 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1560,8 +1560,8 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - special_doc_id_list = ["420339794", "441280757", "454036250", "471206458", "412778803"] - # special_doc_id_list = ["441280757"] + special_doc_id_list = ["391080133"] + # special_doc_id_list = ["391080133", ""] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = (