optimized for phase 2 data
This commit is contained in:
parent
353bc28599
commit
705933bbdd
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
|
||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
|
||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs", "Management costs"]},
|
||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
|
||||
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
||||
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
|
||||
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs"]},
|
||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
|
||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs"]},
|
||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
|
||||
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
||||
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
|
||||
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
|
||||
|
|
|
|||
|
|
@ -205,6 +205,76 @@ class DataExtraction:
|
|||
data_dict["completion_token"] = result.get("completion_token", 0)
|
||||
data_dict["total_token"] = result.get("total_token", 0)
|
||||
"""
|
||||
data_list = self.post_adjust_management_fee_costs(data_list)
|
||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||
|
||||
return data_list
|
||||
|
||||
def post_adjust_management_fee_costs(self, data_list: list):
    """Make management fee values consistent for each fund/share pair.

    The same (fund_name, share_name) pair can appear in several items of
    ``data_list`` with different values. After this call, every item that
    carries the key and whose pair has at least one real value holds:

    * ``management_fee``: the smallest value collected for its pair;
    * ``management_fee_and_costs``: the largest value collected for its pair.

    A value of ``-1`` is skipped while collecting, and is overwritten when
    another item of the same pair supplies a different value. Items whose
    ``fund_name`` or ``share_name`` is empty are left untouched.

    Args:
        data_list: dicts shaped like
            ``{"extract_data": {"data": [item, ...]}}`` where each item is
            a dict. A missing or ``None`` ``extract_data`` / ``data`` is
            treated as "no items". Fund and share names are used as
            dictionary keys, so they must be hashable (assumed to be
            strings -- TODO confirm against the extraction output).

    Returns:
        The same ``data_list`` object; its items are updated in place.
    """
    # For each field: should `candidate` replace the value kept so far?
    # Ties replace the kept value, mirroring the ">=" / "<=" comparisons
    # this method has always used.
    replace_rules = (
        ("management_fee", lambda kept, candidate: kept >= candidate),
        ("management_fee_and_costs", lambda kept, candidate: kept <= candidate),
    )
    # field name -> {(fund_name, share_name): value to apply}
    selected = {field: {} for field, _ in replace_rules}

    def named_items():
        """Yield ((fund_name, share_name), item) for items with both names."""
        for data_dict in data_list:
            extract_data = data_dict.get("extract_data") or {}
            for data_item in extract_data.get("data") or []:
                fund_name = data_item.get("fund_name", "")
                share_name = data_item.get("share_name", "")
                if fund_name == "" or share_name == "":
                    continue
                yield (fund_name, share_name), data_item

    # Pass 1: collect the value to keep per pair (min fee, max fee-and-costs).
    for pair, data_item in named_items():
        for field, should_replace in replace_rules:
            value = data_item.get(field, -1)
            if value == -1:
                # Key missing or explicitly "no value": nothing to collect.
                continue
            by_pair = selected[field]
            if pair not in by_pair or should_replace(by_pair[pair], value):
                by_pair[pair] = value

    # Pass 2: write the collected value back to every item that already has
    # the key; items without the key never gain it.
    for pair, data_item in named_items():
        for field, _ in replace_rules:
            by_pair = selected[field]
            if field in data_item and pair in by_pair:
                data_item[field] = by_pair[pair]

    return data_list
|
||||
|
||||
|
||||
def supplement_minimum_initial_investment(self, data_list: list):
|
||||
exist_minimum_initial_investment = False
|
||||
minimum_initial_investment = -1
|
||||
mii_fund_name = ""
|
||||
|
|
@ -242,8 +312,6 @@ class DataExtraction:
|
|||
mii_dict["extract_data"]["data"] = new_mii_data_list
|
||||
return data_list
|
||||
|
||||
|
||||
|
||||
def extract_data_by_text(self) -> dict:
|
||||
"""
|
||||
keys are
|
||||
|
|
@ -318,12 +386,18 @@ class DataExtraction:
|
|||
should_continue = True
|
||||
else:
|
||||
for next_datapoint in next_datapoints:
|
||||
if next_datapoint not in page_datapoints:
|
||||
should_continue = True
|
||||
break
|
||||
next_datapoints.extend(page_datapoints)
|
||||
# remove duplicate datapoints
|
||||
next_datapoints = list(set(next_datapoints))
|
||||
if self.doc_source == "aus_prospectus":
|
||||
if next_datapoint in page_datapoints:
|
||||
should_continue = False
|
||||
break
|
||||
else:
|
||||
if next_datapoint not in page_datapoints:
|
||||
should_continue = True
|
||||
break
|
||||
if should_continue:
|
||||
next_datapoints.extend(page_datapoints)
|
||||
# remove duplicate datapoints
|
||||
next_datapoints = list(set(next_datapoints))
|
||||
if not should_continue:
|
||||
break
|
||||
if extract_way == "text":
|
||||
|
|
|
|||
|
|
@ -146,7 +146,19 @@
|
|||
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
|
||||
"---Example 2 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]"
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]",
|
||||
"D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"",
|
||||
"Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n",
|
||||
"---Example 1 Start---",
|
||||
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.02 0.000.00 0.49 1.32\n",
|
||||
"---Example 1 End---",
|
||||
"For this case, the first \"Entry Fee Option\" value is 0.47, the first \"Estimated Other investment costs\" value is 0.02, the sum is 0.49, so the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged)\", \"share name\": \"OnePath International Shares Index (Hedged)\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]",
|
||||
"---Example 2 Start---",
|
||||
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n",
|
||||
"---Example 2 End---",
|
||||
"For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]"
|
||||
],
|
||||
"buy_spread": [
|
||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||
|
|
|
|||
17
main.py
17
main.py
|
|
@ -1042,8 +1042,8 @@ def batch_run_documents(
|
|||
page_filter_ground_truth_file = (
|
||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||
)
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
|
|
@ -1397,16 +1397,17 @@ if __name__ == "__main__":
|
|||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
# )
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_52_documents_sample.txt"
|
||||
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
)
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_52_documents_sample.txt"
|
||||
# )
|
||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||
# special_doc_id_list: list = [
|
||||
# "539790009",
|
||||
# "542300403",
|
||||
|
|
@ -1420,7 +1421,7 @@ if __name__ == "__main__":
|
|||
# "555377021",
|
||||
# "555654388",
|
||||
# ]
|
||||
# special_doc_id_list: list = ["377377369"]
|
||||
# special_doc_id_list: list = ["401212184"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
Loading…
Reference in New Issue