From a8810519f89b2869fbcd52baa9a3a9d0eda30589 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 4 Feb 2025 15:29:24 -0600 Subject: [PATCH] optimize instructions configuration optimize drilldown part logic --- .../aus_prospectus/datapoint_keyword.json | 2 +- core/data_extraction.py | 4 +- .../data_extraction_prompts_config.json | 40 +++++++------------ main.py | 21 ++++++---- .../aus_prospectus_17_documents_sample.txt | 17 ++++++++ utils/pdf_util.py | 2 +- 6 files changed, 50 insertions(+), 36 deletions(-) create mode 100644 sample_documents/aus_prospectus_17_documents_sample.txt diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index b0f7c60..f8773c8 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -1,6 +1,6 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]}, + "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]}, "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, diff --git a/core/data_extraction.py b/core/data_extraction.py index f607f95..ea5c7e7 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -969,7 +969,9 @@ class DataExtraction: if datapoint_name == "performance_fee": datapoint_name = "performance fees" else: - datapoint_name = datapoint_name.upper() + datapoint_name = self.datapoint_name_config.get(datapoint_name, "") + if len(datapoint_name) == 0: + datapoint_name = datapoint.upper() reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}" instructions.append(reported_name) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 0dfc49f..ccb7903 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -61,23 +61,8 @@ "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]", - "- 5. Reverse order of data columns from table text in PDF:", - "For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.", - "---Example 1 Start---", - "Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced", - "---Example 1 End---", - "For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts", - "The output should be: ", - "{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]", - "\n", - "---Example 2 Start---", - "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n", - "---Example 2 End---", - "For this case, Management fees and costs = Management fees + Indirect Fee.", - "The output should be:", - "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]", "- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:", - "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"." + "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"." ], "investment_level": { "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.", @@ -136,7 +121,7 @@ "special_rule": { "management_fee_and_costs": [ "If there are multiple Management fee and costs reported names, here is the priority rule:", - "- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", + "A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", "---Example Start---", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", "---Example End---", @@ -144,19 +129,24 @@ "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]", "\n", "If there are multiple Management fee and costs sub-columns, here is the rule:", - "- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", + "B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", "---Example Start---", - "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n", + "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n", "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]", "\n", - "- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".", - "---Example Start---", - "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n", - "---Example End---", + "C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".", + "---Example 1 Start---", + "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", + "---Example 1 End---", "The output should be:", - "{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]" + "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]", + "---Example 2 Start---", + "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n", + "---Example 2 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]" ], "buy_spread": [ "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", @@ -263,7 +253,7 @@ "date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"], "date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"], "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"], - "minimum_initial_investment_value": [0, 5, 12], + "minimum_initial_investment_value": [0, 5000, 10000], "recoverable_expenses_value": [0.12, 0.05, 0.06], "indirect_costs_value": [0.12, 0.16, 0.02] }, diff --git a/main.py b/main.py index 1d852e6..f6dc138 100644 --- a/main.py +++ b/main.py @@ -518,7 +518,8 @@ def batch_start_job( ) if ( - document_mapping_file is not None + doc_source == "aus_prospectus" + and document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file) ): @@ -1040,9 +1041,9 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = False - re_run_mapping_data = False - force_save_total_data = True + re_run_extract_data = True + re_run_mapping_data = True + force_save_total_data = False calculate_metrics = False extract_way = "text" @@ -1383,14 +1384,18 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - doc_source = "emea_ar" + doc_source = "aus_prospectus" if doc_source == "aus_prospectus": + # document_sample_file = ( + # r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" + # ) document_sample_file = ( - r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" + r"./sample_documents/aus_prospectus_17_documents_sample.txt" ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] - document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" + # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" + document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" # special_doc_id_list: list = [ # "539790009", # "542300403", @@ -1404,7 +1409,7 @@ if __name__ == "__main__": # "555377021", # "555654388", # ] - special_doc_id_list: list = ["539790009"] + special_doc_id_list: list = ["377377369"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/sample_documents/aus_prospectus_17_documents_sample.txt b/sample_documents/aus_prospectus_17_documents_sample.txt new file mode 100644 index 0000000..17ebb54 --- /dev/null +++ b/sample_documents/aus_prospectus_17_documents_sample.txt @@ -0,0 +1,17 @@ +377377369 +397107472 +401212184 +409723592 +411062815 +412778803 +414751292 +462770987 +471206458 +391080133 +391080140 +410899007 +420339794 +441280757 +446324179 +454036250 +384508026 \ No newline at end of file diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 008c71a..7587e2f 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -543,7 +543,7 @@ class PDFUtil: matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', '')) if len(matching_val_area) == 0: matching_val_area = page.search_for(text_block.replace('-\n', '')) - if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1: + if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3: new_matching_val_area = [] for area in matching_val_area: # get text by text_bbox