optimize for management_fee_and_costs and management_fee
This commit is contained in:
parent
2cd4f5f787
commit
fa2dede454
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]},
|
||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]},
|
||||
"management_fee_and_costs": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
|
||||
"management_fee": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
|
||||
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
||||
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
|
||||
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
|
||||
|
|
|
|||
|
|
@ -1,14 +1,14 @@
|
|||
{
|
||||
"total_annual_dollar_based_charges": "total annual dollar based charges",
|
||||
"management_fee_and_costs": "management fee and costs",
|
||||
"management_fee": "management fee",
|
||||
"administration_fees": "administration fee",
|
||||
"performance_fee": "performance fee",
|
||||
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
|
||||
"buy_spread": "buy spread",
|
||||
"sell_spread": "sell spread",
|
||||
"administration_fees": "administration fee",
|
||||
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
|
||||
"benchmark_name": "benchmark name",
|
||||
"total_annual_dollar_based_charges": "total annual dollar based charges",
|
||||
"minimum_initial_investment": "minimum initial investment",
|
||||
"benchmark_name": "benchmark name",
|
||||
"indirect_costs": "indirect cost",
|
||||
"recoverable_expenses": "recoverable expenses",
|
||||
"change_recoverable_expanses": "change recoverable expanses"
|
||||
|
|
|
|||
|
|
@ -448,8 +448,13 @@ class DataExtraction:
|
|||
"""
|
||||
management_fee_costs_list = []
|
||||
management_fee_list = []
|
||||
complex_rule_keywords = "Recoverable expenses \nEstimated other indirect costs"
|
||||
for data_dict in data_list:
|
||||
extract_data = data_dict.get("extract_data", {})
|
||||
exist_complex_rule_keywords = False
|
||||
page_text = data_dict.get("page_text", "")
|
||||
if complex_rule_keywords in page_text:
|
||||
exist_complex_rule_keywords = True
|
||||
data = extract_data.get("data", [])
|
||||
for data_item in data:
|
||||
keys = list(data_item.keys())
|
||||
|
|
@ -467,11 +472,17 @@ class DataExtraction:
|
|||
if (mf_fund_name == fund_name and mf_share_name == share_name) or \
|
||||
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
|
||||
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
|
||||
mf_value = mf.get("management_fee", -1)
|
||||
if mf_value != -1 and mf_value >= management_fee:
|
||||
mf["management_fee"] = management_fee
|
||||
found = True
|
||||
break
|
||||
if exist_complex_rule_keywords and \
|
||||
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
|
||||
# Write to `mf`, the management_fee record matched by the name check in this loop; `mfc` belongs to the management_fee_and_costs loop.
mf["management_fee"] = management_fee
|
||||
found = True
|
||||
break
|
||||
else:
|
||||
mf_value = mf.get("management_fee", -1)
|
||||
if mf_value != -1 and mf_value >= management_fee:
|
||||
mf["management_fee"] = management_fee
|
||||
found = True
|
||||
break
|
||||
if not found:
|
||||
management_fee_list.append({"fund_name": fund_name,
|
||||
"share_name": share_name,
|
||||
|
|
@ -486,11 +497,17 @@ class DataExtraction:
|
|||
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
|
||||
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
|
||||
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
|
||||
mfc_value = mfc.get("management_fee_and_costs", -1)
|
||||
if mfc_value != -1 and mfc_value <= management_fee_costs:
|
||||
mfc["management_fee_and_costs"] = management_fee_costs
|
||||
found = True
|
||||
break
|
||||
if exist_complex_rule_keywords and \
|
||||
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
|
||||
mfc["management_fee_and_costs"] = management_fee_costs
|
||||
found = True
|
||||
break
|
||||
else:
|
||||
mfc_value = mfc.get("management_fee_and_costs", -1)
|
||||
if mfc_value != -1 and mfc_value <= management_fee_costs:
|
||||
mfc["management_fee_and_costs"] = management_fee_costs
|
||||
found = True
|
||||
break
|
||||
if not found:
|
||||
management_fee_costs_list.append({"fund_name": fund_name,
|
||||
"share_name": share_name,
|
||||
|
|
@ -576,7 +593,7 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
# if page_num != 21:
|
||||
# if page_num != 18:
|
||||
# continue
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -191,7 +191,7 @@
|
|||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}",
|
||||
"\n",
|
||||
"F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00%, please ignore and output empty.",
|
||||
"F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.",
|
||||
"---Example 1 Start---",
|
||||
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
|
||||
"---Example 1 End---",
|
||||
|
|
@ -202,6 +202,11 @@
|
|||
"---Example 2 End---",
|
||||
"The relevant values: 0.07 and 1.00, are in the range, should ignore, so the output should be:",
|
||||
"{\"data\": []}",
|
||||
"---Example 3 Start---",
|
||||
"Management fees and costs \n0.67–1.17% p.a. (estimated) \nThe fees and costs for \nmanaging your investment \n",
|
||||
"---Example 3 End---",
|
||||
"The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:",
|
||||
"{\"data\": []}",
|
||||
"\n",
|
||||
"G. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
|
||||
"---Example 1 Start---",
|
||||
|
|
@ -224,7 +229,15 @@
|
|||
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee is 0.09.",
|
||||
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee is 0.11.",
|
||||
"So the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}"
|
||||
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}",
|
||||
"---Example 4 Start---",
|
||||
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
|
||||
"---Example 4 End---",
|
||||
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
|
||||
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
|
||||
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
|
||||
"So the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}"
|
||||
],
|
||||
"administration_fees":[
|
||||
"Administration fees and costs is share class level data.",
|
||||
|
|
@ -395,7 +408,7 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"keywords": "Recoverable expenses \nEstimated other indirect costs",
|
||||
"keywords": ["Recoverable expenses \nEstimated other indirect costs"],
|
||||
"prompts": ["Complex management fee and costs rule:",
|
||||
"If the table with columns:",
|
||||
"\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"",
|
||||
|
|
|
|||
52
main.py
52
main.py
|
|
@ -279,7 +279,39 @@ class EMEA_AR_Parsing:
|
|||
)
|
||||
with open(json_file, "r", encoding="utf-8") as f:
|
||||
doc_mapping_data = json.load(f)
|
||||
return doc_mapping_data
|
||||
if self.doc_source == "aus_prospectus":
|
||||
output_data_folder_splits = output_data_json_folder.split("output")
|
||||
if len(output_data_folder_splits) == 2:
|
||||
merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/'
|
||||
os.makedirs(merged_data_folder, exist_ok=True)
|
||||
|
||||
merged_data_json_folder = os.path.join(merged_data_folder, "json/")
|
||||
os.makedirs(merged_data_json_folder, exist_ok=True)
|
||||
|
||||
merged_data_excel_folder = os.path.join(merged_data_folder, "excel/")
|
||||
os.makedirs(merged_data_excel_folder, exist_ok=True)
|
||||
|
||||
merged_data_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json")
|
||||
if os.path.exists(merged_data_file):
|
||||
with open(merged_data_file, "r", encoding="utf-8") as f:
|
||||
merged_data_list = json.load(f)
|
||||
return merged_data_list
|
||||
else:
|
||||
data_mapping = DataMapping(
|
||||
self.doc_id,
|
||||
self.datapoints,
|
||||
data_from_gpt,
|
||||
self.document_mapping_info_df,
|
||||
self.output_mapping_data_folder,
|
||||
self.doc_source,
|
||||
compare_with_provider=self.compare_with_provider
|
||||
)
|
||||
merged_data_list = data_mapping.merge_output_data_aus_prospectus(doc_mapping_data,
|
||||
merged_data_json_folder,
|
||||
merged_data_excel_folder)
|
||||
return merged_data_list
|
||||
else:
|
||||
return doc_mapping_data
|
||||
"""
|
||||
doc_id,
|
||||
datapoints: list,
|
||||
|
|
@ -1420,7 +1452,7 @@ def get_aus_prospectus_document_category():
|
|||
|
||||
|
||||
def test_post_adjust_extract_data():
|
||||
doc_id = "454036250"
|
||||
doc_id = "539266814"
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
@ -1459,7 +1491,8 @@ def test_post_adjust_extract_data():
|
|||
with open(data_file_path, "r", encoding="utf-8") as f:
|
||||
data_list = json.load(f)
|
||||
# data_list = data_extraction.remove_duplicate_data(data_list)
|
||||
data_list = data_extraction.post_adjust_for_value_with_production_name(data_list)
|
||||
# data_list = data_extraction.post_adjust_for_value_with_production_name(data_list)
|
||||
data_list = data_extraction.post_supplement_data(data_list)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
@ -1516,7 +1549,18 @@ if __name__ == "__main__":
|
|||
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
|
||||
# special_doc_id_list: list = ["410899007", "539266880", "539266817",
|
||||
# "539261734", "539266893"]
|
||||
# special_doc_id_list: list = ["539266880"]
|
||||
# special_doc_id_list: list = ["530101994",
|
||||
# "539241700",
|
||||
# "539261734",
|
||||
# "539266814",
|
||||
# "539266817",
|
||||
# "539266874",
|
||||
# "539266880",
|
||||
# "539266893",
|
||||
# "544886057",
|
||||
# "550769189",
|
||||
# "553449663"]
|
||||
# special_doc_id_list = ["539241700"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
|
|
@ -1090,7 +1090,7 @@ def replace_special_table_header(page_text: str):
|
|||
# item 2: document 539266893
|
||||
"regex_all_list":
|
||||
[r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n",
|
||||
r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?interposed\s*vehicles\s*\n",
|
||||
r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n",
|
||||
r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"],
|
||||
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
|
||||
},
|
||||
|
|
|
|||
Loading…
Reference in New Issue