diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index 168a207..a7514cf 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -10,6 +10,6 @@ "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]}, - "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, + "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost", "recoverable costs", "expense recoveries"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index c6a06ec..020f252 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ b/configuration/aus_prospectus/datapoint_reported_name.json @@ -10,6 +10,6 @@ "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]}, - "recoverable_expenses": {"english": 
def remove_duplicate_data(self, data_list: list) -> list:
    """Remove data items that are repeated on later pages of a document.

    Some PDF documents report the same data on several pages. The first
    occurrence is usually the latest data and the following ones are older,
    so only the first occurrence is kept.

    Two items are duplicates when they resolve to the same raw name (see
    ``get_raw_name``) and carry exactly the same set of datapoint keys
    (every key except ``fund_name`` and ``share_name``). Items whose only
    datapoints are ``management_fee`` and/or ``management_fee_and_costs``
    are never removed and never cause a removal. Items that share a
    signature within the same page are all kept; only later pages are
    pruned.

    Pages are told apart by their position in ``data_list`` rather than by
    dict equality, so two pages with identical content are still
    de-duplicated against each other.

    Args:
        data_list: One dict per page. The items are read from
            ``page["extract_data"]["data"]``; a missing or ``None``
            ``extract_data`` / ``data`` is treated as "no items".

    Returns:
        The same ``data_list`` object. The per-page ``data`` lists are
        pruned in place.
    """
    name_keys = ("fund_name", "share_name")
    base_fee_keys = ("management_fee", "management_fee_and_costs")

    # Signatures (raw name, sorted datapoint keys) kept on earlier pages.
    seen_signatures = set()
    for data_dict in data_list:
        if not isinstance(data_dict, dict):
            continue
        extract_data = data_dict.get("extract_data") or {}
        page_items = extract_data.get("data") or []

        kept_items = []
        page_signatures = set()
        for data_item in page_items:
            if not isinstance(data_item, dict):
                # Unknown shape: leave it untouched rather than guess.
                kept_items.append(data_item)
                continue
            dp_keys = sorted(key for key in data_item if key not in name_keys)
            if all(dp_key in base_fee_keys for dp_key in dp_keys):
                # Fee-only (or name-only) items do not take part in de-duplication.
                kept_items.append(data_item)
                continue
            raw_name = self.get_raw_name(
                data_item.get("fund_name", ""), data_item.get("share_name", "")
            )
            signature = (raw_name, tuple(dp_keys))
            if signature in seen_signatures:
                # Already reported on an earlier page: drop the later copy.
                continue
            kept_items.append(data_item)
            page_signatures.add(signature)

        if len(kept_items) != len(page_items):
            # Mutate in place so every holder of this list sees the pruning.
            page_items[:] = kept_items
        # Register this page's signatures only after the page is filtered,
        # so that items within one page never remove each other.
        seen_signatures |= page_signatures
    return data_list


def get_raw_name(self, fund_name: str, share_name: str) -> str:
    """Combine a fund name and a share name into a single raw name.

    Rules, in order:
      1. Equal names -> that name.
      2. ``share_name`` already starts with ``fund_name`` -> ``share_name``.
      3. Otherwise -> ``"<fund_name> <share_name>"``.

    ``None`` is treated as an empty string. Note that an empty
    ``share_name`` combined with a non-empty ``fund_name`` falls into
    rule 3 and therefore yields the fund name followed by a space.
    """
    fund_name = fund_name or ""
    share_name = share_name or ""
    if fund_name == share_name:
        return fund_name
    if share_name.startswith(fund_name):
        return share_name
    return f"{fund_name} {share_name}"
b/instructions/aus_prospectus/data_extraction_prompts_config.json index 9ee75a3..80f14cf 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -120,6 +120,7 @@ }, "special_rule": { "management_fee_and_costs": [ + "Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.", "If there are multiple Management fee and costs reported names, here is the priority rule:", "A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", @@ -130,11 +131,19 @@ "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}", "\n", "A.2 The data value with gross and net, please ignore gross value, output the net value only.", - "---Example 2 Start---", + "---Example Start---", "Small Fund \nManagement fees \nand costs \n1.17% pa (gross)/2.51% pa (net) \n", - "---Example 2 End---", + "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"Small Fund\", \"share name\": \"Small Fund\", \"management_fee_and_costs\": 2.51, \"management_fee\": 2.51}]}", + "\n", + "If the context only mentions the gross value or gross asset value or GAV, please ignore the gross value as the management_fee_and_costs and management_fee value, just output empty.", + "---Example Start---", + "Fees and other costs (cont’d) \n\n08 \n\nType of fee or cost Amount 2 How and when paid \nMANAGEMENT COSTS \nThe fees and costs for \nmanaging your investment 1 \nEstimated at 0.75625% of gross \nasset value (GAV) per annum, \ncomprising: \nThe base management fee is payable from \nthe income and assets of the 
Fund to the \nInvestment Manager half-yearly in arrears \nBase Management Fee \n0.50% per annum of the Average \nGAV 3 \nAnd \nExpense Recovery Costs \n0.25625% (estimated) per annum \nof GAV in other fees, expenses \nand indirect costs.", + "---Example End---", + "The output should be:", + "{\"data\": []}", + "\n", "B. If there are multiple Management fee and costs sub-columns, here is the rule: ", "With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", "---Example Start---", @@ -154,6 +163,7 @@ "---Example 2 End---", "The output should be:", "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}", + "\n", "D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"", "Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n", "---Example 1 Start---", diff --git a/main.py b/main.py index 4506633..b7c03aa 100644 --- a/main.py +++ b/main.py @@ -1042,8 +1042,8 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = True calculate_metrics = False @@ -1377,7 +1377,7 @@ def merge_output_data_aus_prospectus( def get_aus_prospectus_document_category(): document_sample_file = ( - r"./sample_documents/aus_prospectus_17_documents_sample.txt" + 
def test_remove_duplicate_extract_data():
    """Manually exercise ``DataExtraction.remove_duplicate_data`` on one document.

    Builds an ``EMEA_AR_Parsing`` instance for a fixed aus_prospectus
    document id, creates a ``DataExtraction`` from its attributes, loads the
    ``<doc_id>.json`` file from the by_text/json output folder and runs the
    loaded list through ``remove_duplicate_data``.

    The de-duplicated list is only bound to a local name: nothing is
    asserted, returned or written back.
    """
    doc_id = "369105359"
    doc_source = "aus_prospectus"
    extract_way = "text"

    # Parsing object for the document; its attributes feed DataExtraction below.
    parsing = EMEA_AR_Parsing(
        doc_id,
        doc_source=doc_source,
        pdf_folder=r"/data/aus_prospectus/pdf/",
        output_pdf_text_folder=r"/data/aus_prospectus/output/pdf_text/",
        output_extract_data_folder=r"/data/aus_prospectus/output/extract_data/docs/",
        output_mapping_data_folder=r"/data/aus_prospectus/output/mapping_data/docs/",
        extract_way=extract_way,
        drilldown_folder=r"/data/aus_prospectus/output/drilldown/",
        compare_with_provider=False
    )
    extractor = DataExtraction(
        doc_source=parsing.doc_source,
        doc_id=parsing.doc_id,
        pdf_file=parsing.pdf_file,
        output_data_folder=parsing.output_extract_data_folder,
        page_text_dict=parsing.page_text_dict,
        datapoint_page_info=parsing.datapoint_page_info,
        datapoints=parsing.datapoints,
        document_mapping_info_df=parsing.document_mapping_info_df,
        extract_way=extract_way
    )

    # Previously saved extraction output for this document.
    json_path = os.path.join(
        r"/data/aus_prospectus/output/extract_data/docs/by_text/json/",
        f"{doc_id}.json"
    )
    with open(json_path, "r", encoding="utf-8") as json_file:
        loaded_pages = json.load(json_file)
    loaded_pages = extractor.remove_duplicate_data(loaded_pages)
r'./sample_documents/' + document_list_file: str = "aus_prospectus_29_documents_sample.txt" + pdf_folder: str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" + output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/" + output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/" # batch_initial_document(sample_document_list_folder=sample_document_list_folder, # document_list_file=document_list_file, # doc_source=doc_source, @@ -1453,6 +1496,8 @@ if __name__ == "__main__": # output_pdf_text_folder=output_pdf_text_folder, # output_extract_data_child_folder=output_extract_data_child_folder, # output_mapping_child_folder=output_mapping_child_folder) + + # get_aus_prospectus_document_category() # special_doc_id_list = ["553242411"] @@ -1486,7 +1531,7 @@ if __name__ == "__main__": # "555377021", # "555654388", # ] - # special_doc_id_list: list = ["411062815", "462770987", "420339794", "441280757"] + special_doc_id_list: list = ["412778803"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/sample_documents/aus_prospectus_29_documents_sample.txt b/sample_documents/aus_prospectus_29_documents_sample.txt new file mode 100644 index 0000000..cc9f525 --- /dev/null +++ b/sample_documents/aus_prospectus_29_documents_sample.txt @@ -0,0 +1,29 @@ +530101994 +550769189 +550522985 +539266893 +539241700 +539261734 +550533961 +506913190 +539266814 +521606716 +553449663 +528208796 +539266817 +521606755 +557526129 +540028470 +531373053 +544886057 +557362556 +557362553 +520663234 +527969661 +541356150 +555377021 +523516443 +539266874 +539266880 +526200514 +526200513 \ No newline at end of file