diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index 696a982..24e94ad 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -1,7 +1,7 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]}, - "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]}, + "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs", "Management costs"]}, + "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index 115122a..9928cab 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ b/configuration/aus_prospectus/datapoint_reported_name.json @@ -1,7 +1,7 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": 
def post_adjust_management_fee_costs(self, data_list: list):
    """Harmonise management-fee datapoints across all extracted items.

    Every item found under ``data_dict["extract_data"]["data"]`` is grouped
    by its ``(fund_name, share_name)`` pair. Within a group:

    * ``management_fee`` is collapsed to the lowest reported value;
    * ``management_fee_and_costs`` is collapsed to the highest reported value.

    A value of ``-1`` is treated as "not reported" and never takes part in
    the comparison. Items whose ``fund_name`` or ``share_name`` is the empty
    string are ignored completely. In the write-back pass every item that
    already carries the key (even with ``-1``) receives the group's value.

    Args:
        data_list: Extraction results; each element is a dict that may hold
            ``extract_data -> data -> [item, ...]``. Missing or ``None``
            containers are tolerated and simply contribute no items.

    Returns:
        The same ``data_list`` object, with its items updated in place.
    """
    # (datapoint key, keep_lowest): True keeps the minimum, False the maximum.
    rules = (
        ("management_fee", True),
        ("management_fee_and_costs", False),
    )

    def iter_named_items():
        # Yield ((fund_name, share_name), item) for every usable item.
        for data_dict in data_list:
            # `or` also covers keys that are present but set to None.
            extract_data = data_dict.get("extract_data") or {}
            for data_item in extract_data.get("data") or []:
                fund_name = data_item.get("fund_name", "")
                share_name = data_item.get("share_name", "")
                if fund_name == "" or share_name == "":
                    continue
                # NOTE(review): assumes both names are hashable (strings).
                yield (fund_name, share_name), data_item

    # Pass 1: find the winning value per datapoint and fund/share pair.
    chosen = {key: {} for key, _ in rules}
    for identity, data_item in iter_named_items():
        for key, keep_lowest in rules:
            value = data_item.get(key, -1)
            if value == -1:
                continue
            per_pair = chosen[key]
            if identity not in per_pair:
                per_pair[identity] = value
                continue
            current = per_pair[identity]
            # On a tie the newer value replaces the stored one.
            if (current >= value) if keep_lowest else (current <= value):
                per_pair[identity] = value

    # Pass 2: write the winners back onto every item that has the key.
    for identity, data_item in iter_named_items():
        for key, _ in rules:
            if key in data_item and identity in chosen[key]:
                data_item[key] = chosen[key][identity]

    return data_list
[{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]", + "D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"", + "Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n", + "---Example 1 Start---", + "Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.02 0.000.00 0.49 1.32\n", + "---Example 1 End---", + "For this case, the first \"Entry Fee Option\" value is 0.47, the first \"Estimated Other investment costs\" value is 0.02, the sum is 0.49, so the output should be:", + "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged)\", \"share name\": \"OnePath International Shares Index (Hedged)\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]", + "---Example 2 Start---", + "Management Fees and costs (A) \nOngoing Fee (% p.a.) 
‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n", + "---Example 2 End---", + "For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:", + "{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]" ], "buy_spread": [ "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", diff --git a/main.py b/main.py index 89dba42..8664504 100644 --- a/main.py +++ b/main.py @@ -1042,8 +1042,8 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True calculate_metrics = False @@ -1397,16 +1397,17 @@ if __name__ == "__main__": # document_sample_file = ( # r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" # ) - # document_sample_file = ( - # r"./sample_documents/aus_prospectus_17_documents_sample.txt" - # ) document_sample_file = ( - r"./sample_documents/aus_prospectus_52_documents_sample.txt" + r"./sample_documents/aus_prospectus_17_documents_sample.txt" ) + # document_sample_file = ( + # r"./sample_documents/aus_prospectus_52_documents_sample.txt" + # ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] # document_mapping_file = 
r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" - document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" + # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" + document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" # special_doc_id_list: list = [ # "539790009", # "542300403", @@ -1420,7 +1421,7 @@ if __name__ == "__main__": # "555377021", # "555654388", # ] - # special_doc_id_list: list = ["377377369"] + # special_doc_id_list: list = ["401212184"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = (