Optimize instructions configuration

Optimize drilldown part logic
This commit is contained in:
Blade He 2025-02-04 15:29:24 -06:00
parent f9ef4cec96
commit a8810519f8
6 changed files with 50 additions and 36 deletions

View File

@ -1,6 +1,6 @@
{
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
"management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]},
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
"performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},

View File

@ -969,7 +969,9 @@ class DataExtraction:
if datapoint_name == "performance_fee":
datapoint_name = "performance fees"
else:
datapoint_name = datapoint_name.upper()
datapoint_name = self.datapoint_name_config.get(datapoint_name, "")
if len(datapoint_name) == 0:
datapoint_name = datapoint.upper()
reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
instructions.append(reported_name)

View File

@ -61,23 +61,8 @@
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
"- 5. Reverse order of data columns from table text in PDF:",
"For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.",
"---Example 1 Start---",
"Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced",
"---Example 1 End---",
"For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts",
"The output should be: ",
"{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]",
"\n",
"---Example 2 Start---",
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
"---Example 2 End---",
"For this case, Management fees and costs = Management fees + Indirect Fee.",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
],
"investment_level": {
"total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
@ -136,7 +121,7 @@
"special_rule": {
"management_fee_and_costs": [
"If there are multiple Management fee and costs reported names, here is the priority rule:",
"- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
"A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
"---Example Start---",
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
"---Example End---",
@ -144,19 +129,24 @@
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
"\n",
"If there are multiple Management fee and costs sub-columns, here is the rule:",
"- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"---Example Start---",
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
"\n",
"- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".",
"---Example Start---",
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n",
"---Example End---",
"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
"---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]"
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
"---Example 2 Start---",
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
],
"buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
@ -263,7 +253,7 @@
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
"minimum_initial_investment_value": [0, 5, 12],
"minimum_initial_investment_value": [0, 5000, 10000],
"recoverable_expenses_value": [0.12, 0.05, 0.06],
"indirect_costs_value": [0.12, 0.16, 0.02]
},

21
main.py
View File

@ -518,7 +518,8 @@ def batch_start_job(
)
if (
document_mapping_file is not None
doc_source == "aus_prospectus"
and document_mapping_file is not None
and len(document_mapping_file) > 0
and os.path.exists(document_mapping_file)
):
@ -1040,9 +1041,9 @@ def batch_run_documents(
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
)
re_run_extract_data = False
re_run_mapping_data = False
force_save_total_data = True
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = False
calculate_metrics = False
extract_way = "text"
@ -1383,14 +1384,18 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"]
doc_source = "emea_ar"
doc_source = "aus_prospectus"
if doc_source == "aus_prospectus":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
)
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
# special_doc_id_list: list = [
# "539790009",
# "542300403",
@ -1404,7 +1409,7 @@ if __name__ == "__main__":
# "555377021",
# "555654388",
# ]
special_doc_id_list: list = ["539790009"]
special_doc_id_list: list = ["377377369"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

View File

@ -0,0 +1,17 @@
377377369
397107472
401212184
409723592
411062815
412778803
414751292
462770987
471206458
391080133
391080140
410899007
420339794
441280757
446324179
454036250
384508026

View File

@ -543,7 +543,7 @@ class PDFUtil:
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
if len(matching_val_area) == 0:
matching_val_area = page.search_for(text_block.replace('-\n', ''))
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3:
new_matching_val_area = []
for area in matching_val_area:
# get text by text_bbox