Optimize extraction of management_fee_and_costs and management_fee

This commit is contained in:
Blade He 2025-03-07 18:38:36 -06:00
parent 2cd4f5f787
commit fa2dede454
6 changed files with 99 additions and 25 deletions

View File

@ -1,7 +1,7 @@
{ {
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]}, "management_fee_and_costs": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]}, "management_fee": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
"performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee": {"english": ["performance fee", "performance fees"]},
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},

View File

@ -1,14 +1,14 @@
{ {
"total_annual_dollar_based_charges": "total annual dollar based charges",
"management_fee_and_costs": "management fee and costs", "management_fee_and_costs": "management fee and costs",
"management_fee": "management fee", "management_fee": "management fee",
"administration_fees": "administration fee",
"performance_fee": "performance fee", "performance_fee": "performance fee",
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
"buy_spread": "buy spread", "buy_spread": "buy spread",
"sell_spread": "sell spread", "sell_spread": "sell spread",
"administration_fees": "administration fee", "total_annual_dollar_based_charges": "total annual dollar based charges",
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost", "minimum_initial_investment": "minimum initial investment",
"benchmark_name": "benchmark name", "benchmark_name": "benchmark name",
"minimum_initial_investment": "minimum initial investment",
"indirect_costs": "indirect cost", "indirect_costs": "indirect cost",
"recoverable_expenses": "recoverable expenses", "recoverable_expenses": "recoverable expenses",
"change_recoverable_expanses": "change recoverable expanses" "change_recoverable_expanses": "change recoverable expanses"

View File

@ -448,8 +448,13 @@ class DataExtraction:
""" """
management_fee_costs_list = [] management_fee_costs_list = []
management_fee_list = [] management_fee_list = []
complex_rule_keywords = "Recoverable expenses \nEstimated other indirect costs"
for data_dict in data_list: for data_dict in data_list:
extract_data = data_dict.get("extract_data", {}) extract_data = data_dict.get("extract_data", {})
exist_complex_rule_keywords = False
page_text = data_dict.get("page_text", "")
if complex_rule_keywords in page_text:
exist_complex_rule_keywords = True
data = extract_data.get("data", []) data = extract_data.get("data", [])
for data_item in data: for data_item in data:
keys = list(data_item.keys()) keys = list(data_item.keys())
@ -467,11 +472,17 @@ class DataExtraction:
if (mf_fund_name == fund_name and mf_share_name == share_name) or \ if (mf_fund_name == fund_name and mf_share_name == share_name) or \
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and (len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))): (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
mf_value = mf.get("management_fee", -1) if exist_complex_rule_keywords and \
if mf_value != -1 and mf_value >= management_fee: ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
mf["management_fee"] = management_fee mf["management_fee"] = management_fee
found = True found = True
break break
else:
mf_value = mf.get("management_fee", -1)
if mf_value != -1 and mf_value >= management_fee:
mf["management_fee"] = management_fee
found = True
break
if not found: if not found:
management_fee_list.append({"fund_name": fund_name, management_fee_list.append({"fund_name": fund_name,
"share_name": share_name, "share_name": share_name,
@ -486,11 +497,17 @@ class DataExtraction:
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \ if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and (len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))): (mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
mfc_value = mfc.get("management_fee_and_costs", -1) if exist_complex_rule_keywords and \
if mfc_value != -1 and mfc_value <= management_fee_costs: ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
mfc["management_fee_and_costs"] = management_fee_costs mfc["management_fee_and_costs"] = management_fee_costs
found = True found = True
break break
else:
mfc_value = mfc.get("management_fee_and_costs", -1)
if mfc_value != -1 and mfc_value <= management_fee_costs:
mfc["management_fee_and_costs"] = management_fee_costs
found = True
break
if not found: if not found:
management_fee_costs_list.append({"fund_name": fund_name, management_fee_costs_list.append({"fund_name": fund_name,
"share_name": share_name, "share_name": share_name,
@ -576,7 +593,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num != 21: # if page_num != 18:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue

View File

@ -191,7 +191,7 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}", "{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}",
"\n", "\n",
"F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00%, please ignore and output empty.", "F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.",
"---Example 1 Start---", "---Example 1 Start---",
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to Administration fee \nrebate section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown in the Management \nfees and costs section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n", "Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to Administration fee \nrebate section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown in the Management \nfees and costs section below. 
\ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
"---Example 1 End---", "---Example 1 End---",
@ -202,6 +202,11 @@
"---Example 2 End---", "---Example 2 End---",
"The relevant values: 0.07 and 1.00, are in the range, should ignore, so the output should be:", "The relevant values: 0.07 and 1.00, are in the range, should ignore, so the output should be:",
"{\"data\": []}", "{\"data\": []}",
"---Example 3 Start---",
"Management fees and costs \n0.671.17% p.a. (estimated) \nThe fees and costs for \nmanaging your investment \n",
"---Example 3 End---",
"The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:",
"{\"data\": []}",
"\n", "\n",
"G. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.", "G. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
"---Example 1 Start---", "---Example 1 Start---",
@ -224,7 +229,15 @@
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee is 0.09.", "So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee is 0.09.",
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee is 0.11.", "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee is 0.11.",
"So the output should be:", "So the output should be:",
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}" "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}",
"---Example 4 Start---",
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
"---Example 4 End---",
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
"So the output should be:",
"{\"data\": [{\"fund name\": \"CFS Real Return Class A\", \"share name\": \"CFS Real Return Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}"
], ],
"administration_fees":[ "administration_fees":[
"Administration fees and costs is share class level data.", "Administration fees and costs is share class level data.",
@ -395,7 +408,7 @@
] ]
}, },
{ {
"keywords": "Recoverable expenses \nEstimated other indirect costs", "keywords": ["Recoverable expenses \nEstimated other indirect costs"],
"prompts": ["Complex management fee and costs rule:", "prompts": ["Complex management fee and costs rule:",
"If the table with columns:", "If the table with columns:",
"\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"",

52
main.py
View File

@ -279,7 +279,39 @@ class EMEA_AR_Parsing:
) )
with open(json_file, "r", encoding="utf-8") as f: with open(json_file, "r", encoding="utf-8") as f:
doc_mapping_data = json.load(f) doc_mapping_data = json.load(f)
return doc_mapping_data if self.doc_source == "aus_prospectus":
output_data_folder_splits = output_data_json_folder.split("output")
if len(output_data_folder_splits) == 2:
merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/'
os.makedirs(merged_data_folder, exist_ok=True)
merged_data_json_folder = os.path.join(merged_data_folder, "json/")
os.makedirs(merged_data_json_folder, exist_ok=True)
merged_data_excel_folder = os.path.join(merged_data_folder, "excel/")
os.makedirs(merged_data_excel_folder, exist_ok=True)
merged_data_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json")
if os.path.exists(merged_data_file):
with open(merged_data_file, "r", encoding="utf-8") as f:
merged_data_list = json.load(f)
return merged_data_list
else:
data_mapping = DataMapping(
self.doc_id,
self.datapoints,
data_from_gpt,
self.document_mapping_info_df,
self.output_mapping_data_folder,
self.doc_source,
compare_with_provider=self.compare_with_provider
)
merged_data_list = data_mapping.merge_output_data_aus_prospectus(doc_mapping_data,
merged_data_json_folder,
merged_data_excel_folder)
return merged_data_list
else:
return doc_mapping_data
""" """
doc_id, doc_id,
datapoints: list, datapoints: list,
@ -1420,7 +1452,7 @@ def get_aus_prospectus_document_category():
def test_post_adjust_extract_data(): def test_post_adjust_extract_data():
doc_id = "454036250" doc_id = "539266814"
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (
@ -1459,7 +1491,8 @@ def test_post_adjust_extract_data():
with open(data_file_path, "r", encoding="utf-8") as f: with open(data_file_path, "r", encoding="utf-8") as f:
data_list = json.load(f) data_list = json.load(f)
# data_list = data_extraction.remove_duplicate_data(data_list) # data_list = data_extraction.remove_duplicate_data(data_list)
data_list = data_extraction.post_adjust_for_value_with_production_name(data_list) # data_list = data_extraction.post_adjust_for_value_with_production_name(data_list)
data_list = data_extraction.post_supplement_data(data_list)
if __name__ == "__main__": if __name__ == "__main__":
@ -1516,7 +1549,18 @@ if __name__ == "__main__":
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# special_doc_id_list: list = ["410899007", "539266880", "539266817", # special_doc_id_list: list = ["410899007", "539266880", "539266817",
# "539261734", "539266893"] # "539261734", "539266893"]
# special_doc_id_list: list = ["539266880"] # special_doc_id_list: list = ["530101994",
# "539241700",
# "539261734",
# "539266814",
# "539266817",
# "539266874",
# "539266880",
# "539266893",
# "544886057",
# "550769189",
# "553449663"]
# special_doc_id_list = ["539241700"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (

View File

@ -1090,7 +1090,7 @@ def replace_special_table_header(page_text: str):
# item 2: document 539266893 # item 2: document 539266893
"regex_all_list": "regex_all_list":
[r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n", [r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n",
r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?interposed\s*vehicles\s*\n", r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n",
r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"], r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
}, },