optimized for phase 2 data

This commit is contained in:
Blade He 2025-02-18 18:52:26 -06:00
parent 353bc28599
commit 705933bbdd
5 changed files with 108 additions and 21 deletions

View File

@ -1,7 +1,7 @@
{
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs", "Management costs"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
"performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},

View File

@ -1,7 +1,7 @@
{
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
"performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},

View File

@ -205,6 +205,76 @@ class DataExtraction:
data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0)
"""
data_list = self.post_adjust_management_fee_costs(data_list)
data_list = self.supplement_minimum_initial_investment(data_list)
return data_list
def post_adjust_management_fee_costs(self, data_list: list):
    """
    Make management fee values consistent for each fund/share pair.

    The same (fund_name, share_name) pair can appear in several entries of
    ``data_list``. This method scans every item under
    ``data_dict["extract_data"]["data"]`` and then rewrites the items in
    place so that, for each pair:

    - ``management_fee`` becomes the smallest value seen for that pair;
    - ``management_fee_and_costs`` becomes the largest value seen for
      that pair.

    A value of -1 is treated as "no value" and never takes part in the
    min/max selection. An item that carries the key with -1 is still
    overwritten when another item of the same pair supplied a real value.
    Items with an empty ``fund_name`` or ``share_name`` are left untouched,
    and a key is never added to an item that did not already have it.

    Args:
        data_list: list of dicts, each optionally holding
            ``{"extract_data": {"data": [item, ...]}}``.

    Returns:
        The same ``data_list`` object, with its items updated in place.
    """
    # Marker this method treats as "value not available".
    missing = -1

    # (fund_name, share_name) -> selected value. A dict replaces the
    # repeated linear scans over a list of records.
    lowest_management_fee = {}
    highest_management_fee_costs = {}

    def iter_named_items():
        """Yield ((fund_name, share_name), item) for every usable item."""
        for data_dict in data_list:
            # ``or`` also covers keys that are present but set to None.
            extract_data = data_dict.get("extract_data") or {}
            for data_item in extract_data.get("data") or []:
                fund_name = data_item.get("fund_name", "")
                share_name = data_item.get("share_name", "")
                if fund_name == "" or share_name == "":
                    continue
                yield (fund_name, share_name), data_item

    # Pass 1: collect the value to keep for every fund/share pair.
    for pair, data_item in iter_named_items():
        management_fee = data_item.get("management_fee", missing)
        if management_fee != missing:
            # Keep the minimum; on ties the newer value replaces the old one.
            if (
                pair not in lowest_management_fee
                or lowest_management_fee[pair] >= management_fee
            ):
                lowest_management_fee[pair] = management_fee

        management_fee_costs = data_item.get("management_fee_and_costs", missing)
        if management_fee_costs != missing:
            # Keep the maximum; on ties the newer value replaces the old one.
            if (
                pair not in highest_management_fee_costs
                or highest_management_fee_costs[pair] <= management_fee_costs
            ):
                highest_management_fee_costs[pair] = management_fee_costs

    # Pass 2: write the selected values back, only onto items that already
    # expose the corresponding key.
    for pair, data_item in iter_named_items():
        if "management_fee" in data_item and pair in lowest_management_fee:
            data_item["management_fee"] = lowest_management_fee[pair]
        if (
            "management_fee_and_costs" in data_item
            and pair in highest_management_fee_costs
        ):
            data_item["management_fee_and_costs"] = highest_management_fee_costs[pair]

    return data_list
def supplement_minimum_initial_investment(self, data_list: list):
exist_minimum_initial_investment = False
minimum_initial_investment = -1
mii_fund_name = ""
@ -241,8 +311,6 @@ class DataExtraction:
new_mii_data_list.append(new_data_dict)
mii_dict["extract_data"]["data"] = new_mii_data_list
return data_list
def extract_data_by_text(self) -> dict:
"""
@ -318,12 +386,18 @@ class DataExtraction:
should_continue = True
else:
for next_datapoint in next_datapoints:
if next_datapoint not in page_datapoints:
should_continue = True
break
next_datapoints.extend(page_datapoints)
# remove duplicate datapoints
next_datapoints = list(set(next_datapoints))
if self.doc_source == "aus_prospectus":
if next_datapoint in page_datapoints:
should_continue = False
break
else:
if next_datapoint not in page_datapoints:
should_continue = True
break
if should_continue:
next_datapoints.extend(page_datapoints)
# remove duplicate datapoints
next_datapoints = list(set(next_datapoints))
if not should_continue:
break
if extract_way == "text":

View File

@ -146,7 +146,19 @@
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]"
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]",
"D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"",
"Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n",
"---Example 1 Start---",
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.02 0.000.00 0.49 1.32\n",
"---Example 1 End---",
"For this case, the first \"Entry Fee Option\" value is 0.47, the first \"Estimated Other investment costs\" value is 0.02, the sum is 0.49, so the output should be:",
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged)\", \"share name\": \"OnePath International Shares Index (Hedged)\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]",
"---Example 2 Start---",
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n",
"---Example 2 End---",
"For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:",
"{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]"
],
"buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",

17
main.py
View File

@ -1042,8 +1042,8 @@ def batch_run_documents(
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
)
re_run_extract_data = True
re_run_mapping_data = True
re_run_extract_data = False
re_run_mapping_data = False
force_save_total_data = True
calculate_metrics = False
@ -1397,16 +1397,17 @@ if __name__ == "__main__":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
# )
# document_sample_file = (
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_52_documents_sample.txt"
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
)
# document_sample_file = (
# r"./sample_documents/aus_prospectus_52_documents_sample.txt"
# )
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
# special_doc_id_list: list = [
# "539790009",
# "542300403",
@ -1420,7 +1421,7 @@ if __name__ == "__main__":
# "555377021",
# "555654388",
# ]
# special_doc_id_list: list = ["377377369"]
# special_doc_id_list: list = ["401212184"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (