From 91c86bb9833b551af3ae071dd7f9387233ef9842 Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 8 Jan 2025 17:40:57 -0600 Subject: [PATCH] update AUS Prospectus relevant configuration --- configuration/datapoint_keyword.json | 6 ++-- configuration/datapoint_reported_name.json | 6 ++-- .../data_extraction_prompts_config.json | 31 ++++++++++++++++--- main.py | 4 +-- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json index 43d8c12..b0f7c60 100644 --- a/configuration/datapoint_keyword.json +++ b/configuration/datapoint_keyword.json @@ -1,7 +1,7 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": {"english": ["management fees and cost"]}, - "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1"]}, + "management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]}, + "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, @@ -23,5 +23,5 @@ "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, "minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, - 
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect costs"]} + "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} } \ No newline at end of file diff --git a/configuration/datapoint_reported_name.json b/configuration/datapoint_reported_name.json index 0f0acad..c119485 100644 --- a/configuration/datapoint_reported_name.json +++ b/configuration/datapoint_reported_name.json @@ -1,7 +1,7 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost"]}, - "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1"]}, + "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs"]}, + "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, @@ -23,5 +23,5 @@ "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, "minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, - "indirect_costs": {"english": ["indirect cost","indirect fees","indirect 
costs"]} + "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} } \ No newline at end of file diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index ac0df64..0dfc49f 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -17,8 +17,8 @@ "data_business_features": { "common": [ "General rules:", - "- The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.", - "- Fund name: ", + "- 1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.", + "- 2. Fund name: ", "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", "b. The sub-fund name may be as the first column or first row values in the table.", "b.1 fund name example:", @@ -34,7 +34,7 @@ "---- Example End ----", "Correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon", "\n", - "- Only extract the latest data from context:", + "- 3. Only extract the latest data from context:", "If with multiple data values in same row, please extract the latest.", "\n", "d. Some table format, the fund name is in the end of row, please extract the fund name from the end of row.", @@ -51,7 +51,7 @@ "---Example End---", "Correct fund name: MLC Horizon 2 Income Portfolio", "Correct share name: MLC Horizon 2 Income Portfolio", - "- Reported names:", + "- 4. Reported names:", "Only output the values which with significant reported names.", "- Multiple data columns with same reported name but different post-fix:", "If there are multiple reported names with different post-fix text, here is the priority rule:", @@ -60,7 +60,24 @@ "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. 
\n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", "---Example End---", "The output should be:", - "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]" + "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}", + "- 5. Reverse order of data columns from table text in PDF:", + "For this case: \n1. The column order is reversed. \n2. The fund name is at the end of the row, with the number values in front of the fund name.", + "---Example 1 Start---", + "Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced", + "---Example 1 End---", + "For this case, Management fees and costs = Management fees, both taken from the same reported name: Plus\nother\ninvestment\nfees and\ncosts", + "The output should be: ", + "{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]}", + "\n", + 
"---Example 2 Start---", + "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n", + "---Example 2 End---", + "For this case, Management fees and costs = Management fees + Indirect Fee.", + "The output should be:", + "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}", + "- 6. Please ignore the following words as fund names, i.e. never extract them as fund names:", + "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"." ], "investment_level": { "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.", @@ -140,6 +157,10 @@ "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]" + ], + "buy_spread": [ + "Please don't extract buy_spread or sell_spread data from the following reported names: ", + "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" ] } }, diff --git a/main.py b/main.py index 9bbaa04..841bb3d 100644 --- a/main.py +++ b/main.py @@ -887,7 +887,7 @@ def batch_run_documents(special_doc_id_list: list = None, ) re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True calculate_metrics = False extract_way = "text" @@ -1051,7 +1051,7 @@ if __name__ == "__main__": special_doc_id_list: list = ["539790009", "542300403", "542301117", - # "542306317", + "542306317", 
"547567013", "552505237", "552505278",