optimize instructions configuration
optimize drilldown part logic
This commit is contained in:
parent
f9ef4cec96
commit
a8810519f8
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
||||
"management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]},
|
||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
|
||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
|
||||
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
||||
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
|
||||
|
|
|
|||
|
|
@ -969,7 +969,9 @@ class DataExtraction:
|
|||
if datapoint_name == "performance_fee":
|
||||
datapoint_name = "performance fees"
|
||||
else:
|
||||
datapoint_name = datapoint_name.upper()
|
||||
datapoint_name = self.datapoint_name_config.get(datapoint_name, "")
|
||||
if len(datapoint_name) == 0:
|
||||
datapoint_name = datapoint.upper()
|
||||
reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
|
||||
|
||||
instructions.append(reported_name)
|
||||
|
|
|
|||
|
|
@ -61,21 +61,6 @@
|
|||
"---Example End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
|
||||
"- 5. Reverse order of data columns from table text in PDF:",
|
||||
"For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.",
|
||||
"---Example 1 Start---",
|
||||
"Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced",
|
||||
"---Example 1 End---",
|
||||
"For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts",
|
||||
"The output should be: ",
|
||||
"{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]",
|
||||
"\n",
|
||||
"---Example 2 Start---",
|
||||
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
|
||||
"---Example 2 End---",
|
||||
"For this case, Management fees and costs = Management fees + Indirect Fee.",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
|
||||
"- 6. Please ignore the following words as fund names; that is, never extract them as fund names. They are:",
|
||||
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
|
||||
],
|
||||
|
|
@ -136,7 +121,7 @@
|
|||
"special_rule": {
|
||||
"management_fee_and_costs": [
|
||||
"If there are multiple Management fee and costs reported names, here is the priority rule:",
|
||||
"- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
|
||||
"A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
|
||||
"---Example Start---",
|
||||
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
|
||||
"---Example End---",
|
||||
|
|
@ -144,19 +129,24 @@
|
|||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
|
||||
"\n",
|
||||
"If there are multiple Management fee and costs sub-columns, here is the rule:",
|
||||
"- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
||||
"B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
||||
"---Example Start---",
|
||||
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
|
||||
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
|
||||
"---Example End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}",
|
||||
"\n",
|
||||
"- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".",
|
||||
"---Example Start---",
|
||||
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n",
|
||||
"---Example End---",
|
||||
"C. If only \"Management fees and costs\" is found, please output the relevant value with the data point key \"management_fee_and_costs\" instead of \"management_fee\".",
|
||||
"---Example 1 Start---",
|
||||
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
|
||||
"---Example 1 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]"
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]}",
|
||||
"---Example 2 Start---",
|
||||
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
|
||||
"---Example 2 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]}"
|
||||
],
|
||||
"buy_spread": [
|
||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||
|
|
@ -263,7 +253,7 @@
|
|||
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
|
||||
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
|
||||
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
||||
"minimum_initial_investment_value": [0, 5, 12],
|
||||
"minimum_initial_investment_value": [0, 5000, 10000],
|
||||
"recoverable_expenses_value": [0.12, 0.05, 0.06],
|
||||
"indirect_costs_value": [0.12, 0.16, 0.02]
|
||||
},
|
||||
|
|
|
|||
21
main.py
21
main.py
|
|
@ -518,7 +518,8 @@ def batch_start_job(
|
|||
)
|
||||
|
||||
if (
|
||||
document_mapping_file is not None
|
||||
doc_source == "aus_prospectus"
|
||||
and document_mapping_file is not None
|
||||
and len(document_mapping_file) > 0
|
||||
and os.path.exists(document_mapping_file)
|
||||
):
|
||||
|
|
@ -1040,9 +1041,9 @@ def batch_run_documents(
|
|||
page_filter_ground_truth_file = (
|
||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||
)
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
force_save_total_data = True
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
calculate_metrics = False
|
||||
|
||||
extract_way = "text"
|
||||
|
|
@ -1383,14 +1384,18 @@ if __name__ == "__main__":
|
|||
|
||||
# special_doc_id_list = ["553242411"]
|
||||
|
||||
doc_source = "emea_ar"
|
||||
doc_source = "aus_prospectus"
|
||||
if doc_source == "aus_prospectus":
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
)
|
||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||
# special_doc_id_list: list = [
|
||||
# "539790009",
|
||||
# "542300403",
|
||||
|
|
@ -1404,7 +1409,7 @@ if __name__ == "__main__":
|
|||
# "555377021",
|
||||
# "555654388",
|
||||
# ]
|
||||
special_doc_id_list: list = ["539790009"]
|
||||
special_doc_id_list: list = ["377377369"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
|
|
@ -0,0 +1,17 @@
|
|||
377377369
|
||||
397107472
|
||||
401212184
|
||||
409723592
|
||||
411062815
|
||||
412778803
|
||||
414751292
|
||||
462770987
|
||||
471206458
|
||||
391080133
|
||||
391080140
|
||||
410899007
|
||||
420339794
|
||||
441280757
|
||||
446324179
|
||||
454036250
|
||||
384508026
|
||||
|
|
@ -543,7 +543,7 @@ class PDFUtil:
|
|||
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
|
||||
if len(matching_val_area) == 0:
|
||||
matching_val_area = page.search_for(text_block.replace('-\n', ''))
|
||||
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
|
||||
if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3:
|
||||
new_matching_val_area = []
|
||||
for area in matching_val_area:
|
||||
# get text by text_bbox
|
||||
|
|
|
|||
Loading…
Reference in New Issue