optimize instructions configuration
optimize drilldown part logic
This commit is contained in:
parent
f9ef4cec96
commit
a8810519f8
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
||||||
"management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]},
|
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
|
||||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
|
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
|
||||||
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
||||||
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
|
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
|
||||||
|
|
|
||||||
|
|
@ -969,7 +969,9 @@ class DataExtraction:
|
||||||
if datapoint_name == "performance_fee":
|
if datapoint_name == "performance_fee":
|
||||||
datapoint_name = "performance fees"
|
datapoint_name = "performance fees"
|
||||||
else:
|
else:
|
||||||
datapoint_name = datapoint_name.upper()
|
datapoint_name = self.datapoint_name_config.get(datapoint_name, "")
|
||||||
|
if len(datapoint_name) == 0:
|
||||||
|
datapoint_name = datapoint.upper()
|
||||||
reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
|
reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
|
||||||
|
|
||||||
instructions.append(reported_name)
|
instructions.append(reported_name)
|
||||||
|
|
|
||||||
|
|
@ -61,23 +61,8 @@
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
|
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
|
||||||
"- 5. Reverse order of data columns from table text in PDF:",
|
|
||||||
"For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.",
|
|
||||||
"---Example 1 Start---",
|
|
||||||
"Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced",
|
|
||||||
"---Example 1 End---",
|
|
||||||
"For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts",
|
|
||||||
"The output should be: ",
|
|
||||||
"{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]",
|
|
||||||
"\n",
|
|
||||||
"---Example 2 Start---",
|
|
||||||
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
|
|
||||||
"---Example 2 End---",
|
|
||||||
"For this case, Management fees and costs = Management fees + Indirect Fee.",
|
|
||||||
"The output should be:",
|
|
||||||
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
|
|
||||||
"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
|
"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
|
||||||
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
|
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
|
||||||
],
|
],
|
||||||
"investment_level": {
|
"investment_level": {
|
||||||
"total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
|
"total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
|
||||||
|
|
@ -136,7 +121,7 @@
|
||||||
"special_rule": {
|
"special_rule": {
|
||||||
"management_fee_and_costs": [
|
"management_fee_and_costs": [
|
||||||
"If there are multiple Management fee and costs reported names, here is the priority rule:",
|
"If there are multiple Management fee and costs reported names, here is the priority rule:",
|
||||||
"- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
|
"A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
|
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
|
|
@ -144,19 +129,24 @@
|
||||||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
|
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
|
||||||
"\n",
|
"\n",
|
||||||
"If there are multiple Management fee and costs sub-columns, here is the rule:",
|
"If there are multiple Management fee and costs sub-columns, here is the rule:",
|
||||||
"- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
"B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
|
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
|
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
|
||||||
"\n",
|
"\n",
|
||||||
"- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".",
|
"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
|
||||||
"---Example Start---",
|
"---Example 1 Start---",
|
||||||
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n",
|
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
|
||||||
"---Example End---",
|
"---Example 1 End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]"
|
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
|
||||||
|
"---Example 2 Start---",
|
||||||
|
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
|
||||||
|
"---Example 2 End---",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
|
||||||
],
|
],
|
||||||
"buy_spread": [
|
"buy_spread": [
|
||||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||||
|
|
@ -263,7 +253,7 @@
|
||||||
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
|
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
|
||||||
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
|
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
|
||||||
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
||||||
"minimum_initial_investment_value": [0, 5, 12],
|
"minimum_initial_investment_value": [0, 5000, 10000],
|
||||||
"recoverable_expenses_value": [0.12, 0.05, 0.06],
|
"recoverable_expenses_value": [0.12, 0.05, 0.06],
|
||||||
"indirect_costs_value": [0.12, 0.16, 0.02]
|
"indirect_costs_value": [0.12, 0.16, 0.02]
|
||||||
},
|
},
|
||||||
|
|
|
||||||
21
main.py
21
main.py
|
|
@ -518,7 +518,8 @@ def batch_start_job(
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
document_mapping_file is not None
|
doc_source == "aus_prospectus"
|
||||||
|
and document_mapping_file is not None
|
||||||
and len(document_mapping_file) > 0
|
and len(document_mapping_file) > 0
|
||||||
and os.path.exists(document_mapping_file)
|
and os.path.exists(document_mapping_file)
|
||||||
):
|
):
|
||||||
|
|
@ -1040,9 +1041,9 @@ def batch_run_documents(
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
)
|
)
|
||||||
re_run_extract_data = False
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_way = "text"
|
extract_way = "text"
|
||||||
|
|
@ -1383,14 +1384,18 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
|
|
||||||
doc_source = "emea_ar"
|
doc_source = "aus_prospectus"
|
||||||
if doc_source == "aus_prospectus":
|
if doc_source == "aus_prospectus":
|
||||||
|
# document_sample_file = (
|
||||||
|
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||||
|
# )
|
||||||
document_sample_file = (
|
document_sample_file = (
|
||||||
r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||||
)
|
)
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||||
document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||||
|
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||||
# special_doc_id_list: list = [
|
# special_doc_id_list: list = [
|
||||||
# "539790009",
|
# "539790009",
|
||||||
# "542300403",
|
# "542300403",
|
||||||
|
|
@ -1404,7 +1409,7 @@ if __name__ == "__main__":
|
||||||
# "555377021",
|
# "555377021",
|
||||||
# "555654388",
|
# "555654388",
|
||||||
# ]
|
# ]
|
||||||
special_doc_id_list: list = ["539790009"]
|
special_doc_id_list: list = ["377377369"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
377377369
|
||||||
|
397107472
|
||||||
|
401212184
|
||||||
|
409723592
|
||||||
|
411062815
|
||||||
|
412778803
|
||||||
|
414751292
|
||||||
|
462770987
|
||||||
|
471206458
|
||||||
|
391080133
|
||||||
|
391080140
|
||||||
|
410899007
|
||||||
|
420339794
|
||||||
|
441280757
|
||||||
|
446324179
|
||||||
|
454036250
|
||||||
|
384508026
|
||||||
|
|
@ -543,7 +543,7 @@ class PDFUtil:
|
||||||
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
|
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
|
||||||
if len(matching_val_area) == 0:
|
if len(matching_val_area) == 0:
|
||||||
matching_val_area = page.search_for(text_block.replace('-\n', ''))
|
matching_val_area = page.search_for(text_block.replace('-\n', ''))
|
||||||
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
|
if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3:
|
||||||
new_matching_val_area = []
|
new_matching_val_area = []
|
||||||
for area in matching_val_area:
|
for area in matching_val_area:
|
||||||
# get text by text_bbox
|
# get text by text_bbox
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue