Merge branch 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi

This commit is contained in:
Ravi Maheshwari 2025-03-13 11:39:30 +05:30
commit 336fd9a24f
6 changed files with 275 additions and 80 deletions

View File

@ -559,7 +559,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
verify_data_sheet: str = "total_data",
verify_document_list_file: str = None,
is_for_all: bool = False
is_for_all: bool = False,
zero_equal_none: bool = False
):
print("Start to calculate metrics based on DB data file and extracted file...")
audit_data_df = pd.DataFrame()
@ -733,28 +734,30 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
# v_switching_fee = str(doc_verify_sec_row["switching_fee"])
# v_activity_fee = str(doc_verify_sec_row["activity_fee"])
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs")
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs"))
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee")
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee"))
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees")
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees"))
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment")
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment"))
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name")
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name"))
if is_for_all:
message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list)
message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list, zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs"))
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost,
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list)
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list,
zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost"))
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list)
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list, zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread"))
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list)
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list, zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread"))
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges,
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list)
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list,
zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges"))
# message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list)
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee"))
@ -892,7 +895,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
os.makedirs(output_folder, exist_ok=True)
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
if is_for_all:
verify_file_name = f"metrics_{verify_file_name}_all"
verify_file_name = f"{verify_file_name}_all"
metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx"
output_file = os.path.join(output_folder, metrics_file_name)
with pd.ExcelWriter(output_file) as writer:
@ -1280,60 +1283,85 @@ def generate_message(message: dict,
return message
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list,
                                  data_point: str = "", zero_equal_none: bool = False):
    """
    Compare one ground-truth value with one predicted value and record the outcome.

    gt_list and pred_list are appended to in place with 1/0 flags (presumably the
    label vectors the caller later feeds to its metric functions - confirm against
    the caller). A value counts as "present" when it is not None and its string
    form is not blank.

    - gt present: gt_list gets 1; pred_list gets 1 when is_equal() accepts the
      pair, otherwise 0. A mismatching prediction that is itself present is
      recorded a second time as (gt 0, pred 1).
    - gt absent, pred present: recorded as (gt 0, pred 1) with an error message,
      unless zero_equal_none is set, the data point is numeric and the prediction
      rounds to 0 at 4 decimals, in which case it is recorded as (gt 1, pred 1).
    - both absent: nothing is recorded.

    Returns a dict with the original "gt_value", "pred_value" and an "error" text
    (empty string when there is nothing to report).
    """
    message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
    gt_present = gt_value is not None and len(str(gt_value).strip()) > 0
    pred_present = pred_value is not None and len(str(pred_value).strip()) > 0

    if gt_present:
        gt_list.append(1)
        if is_equal(gt_value, pred_value, data_point, zero_equal_none=zero_equal_none):
            pred_list.append(1)
        else:
            pred_list.append(0)
            message["error"] = "pred_value is not equal to gt_value"
            if pred_present:
                # The wrong, non-empty prediction is additionally logged as an
                # extra (gt 0, pred 1) pair.
                pred_list.append(1)
                gt_list.append(0)
        return message

    if not pred_present:
        return message

    pred_is_zero = False
    if zero_equal_none and data_point != "benchmark_name":
        try:
            # Compare at 4 decimal places.
            pred_is_zero = round(float(pred_value), 4) == 0
        except (TypeError, ValueError, OverflowError):
            # A non-numeric prediction is still a non-empty prediction; it must be
            # recorded below instead of being silently dropped.
            pred_is_zero = False

    if pred_is_zero:
        gt_list.append(1)
        pred_list.append(1)
    else:
        gt_list.append(0)
        pred_list.append(1)
        message["error"] = "gt_value is empty, but pred_value is not empty"
    return message
def is_equal(gt_value, pred_value, data_point: str = ""):
if gt_value is not None and len(str(gt_value).strip()) > 0 and \
pred_value is not None and len(str(pred_value).strip()) > 0:
if gt_value == "0.0":
gt_value = "0"
if pred_value == "0.0":
pred_value = "0"
if data_point not in ["benchmark_name"]:
try:
gt_num = float(gt_value)
# round to 2 decimal places
gt_value = round(gt_num, 4)
except Exception as e:
pass
try:
pred_value = float(pred_value)
pred_value = round(pred_value, 4)
except Exception as e:
pass
if gt_value == pred_value:
return True
if data_point == "benchmark_name":
gt_value = clean_text(gt_value)
pred_value = clean_text(pred_value)
if gt_value == pred_value or gt_value in pred_value or pred_value in gt_value:
return True
similarity = Similarity()
jacard_score = similarity.jaccard_similarity(gt_value.lower().split(), pred_value.lower().split())
if jacard_score > 0.8:
def is_equal(gt_value, pred_value, data_point: str = "", zero_equal_none: bool = False):
    """
    Decide whether a predicted value matches the ground-truth value.

    A value is "present" when it is not None and its string form is not blank.

    - gt absent: always False.
    - gt present, pred absent: True only when zero_equal_none is set, the data
      point is not "benchmark_name" and gt rounds to 0 at 4 decimals.
    - both present, numeric data point: each side is converted to float and
      rounded to 4 decimals when possible (left untouched otherwise), then the
      two are compared with ==.
    - both present, "benchmark_name": exact match, substring containment in
      either direction, or a token-level Jaccard score above 0.8 after
      clean_text(). clean_text and Similarity are defined elsewhere in the file.
    """
    def _has_value(value):
        return value is not None and len(str(value).strip()) > 0

    def _as_rounded_number(value):
        # Best effort: values that are not numeric are compared as they are.
        try:
            return round(float(value), 4)
        except (TypeError, ValueError, OverflowError):
            return value

    if not _has_value(gt_value):
        return False

    is_benchmark = data_point == "benchmark_name"

    if not _has_value(pred_value):
        if is_benchmark or not zero_equal_none:
            return False
        # An explicit ground-truth zero is accepted as equal to "no prediction".
        return _as_rounded_number(gt_value) == 0

    if gt_value == "0.0":
        gt_value = "0"
    if pred_value == "0.0":
        pred_value = "0"

    if not is_benchmark:
        gt_value = _as_rounded_number(gt_value)
        pred_value = _as_rounded_number(pred_value)

    if gt_value == pred_value:
        return True

    if is_benchmark:
        gt_value = clean_text(gt_value)
        pred_value = clean_text(pred_value)
        if gt_value == pred_value or gt_value in pred_value or pred_value in gt_value:
            return True
        similarity = Similarity()
        jacard_score = similarity.jaccard_similarity(gt_value.lower().split(), pred_value.lower().split())
        if jacard_score > 0.8:
            return True
    return False
@ -1404,6 +1432,7 @@ if __name__ == "__main__":
verify_document_list_file_list = [None,
"./sample_documents/aus_prospectus_29_documents_sample.txt",
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
zero_equal_none = False
is_for_all = True
for verify_document_list_file in verify_document_list_file_list:
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
@ -1411,7 +1440,8 @@ if __name__ == "__main__":
verify_file_path=verify_file_path,
verify_data_sheet=verify_data_sheet,
verify_document_list_file = verify_document_list_file,
is_for_all=is_for_all)
is_for_all=is_for_all,
zero_equal_none=zero_equal_none)
# for verify_document_list_file in verify_document_list_file_list:
# calculate_metrics_by_provider(audit_file_path=audit_file_path,

View File

@ -21,7 +21,7 @@
"regex_all_list":
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \n",
"comments": ["item 0: document 401212184, page 17",
"item 1: document 401212184, page 18 - 20"]
},
@ -30,6 +30,18 @@
["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"],
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n",
"comments": ["item 0: document 411062815, page 17"]
},
{
"regex_all_list":
["\\nFund\\s*name\\s*Management\\s*fee\\s*Indirect\\s*costs\\s*Recoverable\\s*expenses[\\s\\S]*?performance.*\\s*fee\\s*Estimated\\s*other\\s*indirect\\s*costs\\s*\\n"],
"replace_text": "\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \n",
"comments": ["item 0: document 391080133, page 21"]
},
{
"regex_all_list":
["The\\s*investment\\s*fees\\s*and\\s*costs[\\s\\S]*?Performance\\s*fee\\s*Plus\\s*other\\s*investment\\s*fees\\s*and\\s*costs\\s*Equals\\s*investment\\s*fees\\s*and\\s*costs\\s*Transaction\\s*costs[\\s\\S]*?Buy\\-sell\\s*spreads\\s*Transaction[\\s\\S]*?Entry[\\s\\S]*pa\\s*\\n"],
"replace_text": "Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross)\n",
"comments": ["item 0: document 420339794, page 74"]
}
]
}

View File

@ -262,6 +262,8 @@ class DataExtraction:
data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0)
"""
data_list = self.supplement_ttr_pension(data_list)
data_list = self.align_fund_share_name(data_list)
data_list = self.supplement_minimum_initial_investment(data_list)
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
data_list = self.remove_duplicate_data(data_list)
@ -271,6 +273,114 @@ class DataExtraction:
data_list = self.check_administration_fees(data_list)
return data_list
def align_fund_share_name(self, data_list: list):
    """
    Replace shortened fund names with the longer form used elsewhere in the data.

    A fund name is replaced when another extracted fund name is strictly longer,
    ends with it, and the leading part that the longer name adds (stripped of
    surrounding whitespace) is found in self.document_production. The first such
    candidate, in first-seen order, wins. When the item's share name was equal to
    its old fund name, the share name is updated as well.

    Items are modified in place and the same data_list object is returned.
    Nothing is done when self.document_production is None or empty.
    """
    if not self.document_production:
        return data_list

    # Distinct non-empty fund names in first-seen order; the set keeps the
    # membership test O(1) while the list preserves the order.
    fund_name_list = []
    seen_fund_names = set()
    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            fund_name = data_item.get("fund_name", "") or ""
            if fund_name and fund_name not in seen_fund_names:
                seen_fund_names.add(fund_name)
                fund_name_list.append(fund_name)

    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            fund_name = data_item.get("fund_name", "") or ""
            if not fund_name:
                continue
            share_name = data_item.get("share_name", "")
            for c_fund_name in fund_name_list:
                if len(c_fund_name) <= len(fund_name) or not c_fund_name.endswith(fund_name):
                    continue
                # Cut off only the matched suffix; str.replace would also delete
                # occurrences of the short name inside the prefix.
                prefix = c_fund_name[:-len(fund_name)].strip()
                # A blank prefix would match any string via the `in` test.
                if prefix and prefix in self.document_production:
                    data_item["fund_name"] = c_fund_name
                    if share_name == fund_name:
                        data_item["share_name"] = c_fund_name
                    break
    return data_list
def supplement_ttr_pension(self, data_list: list):
    """
    Expand suffix-less funds into their "TTR" and "Pension" variants.

    First pass: collect every fund name whose last whitespace-separated token is
    "TTR" or "Pension". Nothing is changed unless both kinds were seen.

    Second pass: for each item whose fund name equals one of those names with the
    trailing suffix removed, append a copy named "<fund> TTR" and/or
    "<fund> Pension" that carries all of the item's other keys, then remove the
    suffix-less item. A share name equal to the fund name is renamed together
    with it; any other share name is copied unchanged.

    The lists stored under extract_data["data"] are modified in place and the
    same data_list object is returned.
    """
    suffixes = ("TTR", "Pension")
    base_names = {suffix: set() for suffix in suffixes}
    suffix_seen = {suffix: False for suffix in suffixes}

    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            fund_name = data_item.get("fund_name", "") or ""
            tokens = fund_name.split()
            # A whitespace-only name yields no tokens; indexing [-1] would raise.
            if not tokens or tokens[-1] not in suffixes:
                continue
            suffix = tokens[-1]
            suffix_seen[suffix] = True
            # Drop only the trailing suffix, never an occurrence inside the name.
            base_name = fund_name.strip()[:-len(suffix)].rstrip()
            if base_name:
                base_names[suffix].add(base_name)

    if not all(suffix_seen.values()):
        return data_list

    for data_dict in data_list:
        data = data_dict.get("extract_data", {}).get("data", [])
        new_item_list = []
        remove_item_list = []
        for data_item in data:
            fund_name = data_item.get("fund_name", "") or ""
            share_name = data_item.get("share_name", "")
            tokens = fund_name.split()
            if not tokens or tokens[-1] in suffixes:
                continue
            for suffix in suffixes:
                if fund_name not in base_names[suffix]:
                    continue
                new_fund_name = f"{fund_name} {suffix}"
                # Derive the share name from the ORIGINAL one for every copy, so
                # the TTR rename cannot leak into the Pension copy.
                new_share_name = new_fund_name if share_name == fund_name else share_name
                new_item = {"fund_name": new_fund_name, "share_name": new_share_name}
                for key, value in data_item.items():
                    if key not in ("fund_name", "share_name"):
                        new_item[key] = value
                new_item_list.append(new_item)
                if data_item not in remove_item_list:
                    remove_item_list.append(data_item)
        # Mutate only after the scan so the list is not changed while iterating.
        for remove_item in remove_item_list:
            if remove_item in data:
                data.remove(remove_item)
        data.extend(new_item_list)
    return data_list
def check_administration_fees(self, data_list: list):
"""
If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list
@ -614,7 +724,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num not in [4, 5]:
# if page_num not in [74]:
# continue
if page_num in handled_page_num_list:
continue

View File

@ -171,11 +171,21 @@
"C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
"The management_fee is the value of \"Management fee (% pa)\".",
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
"---Example Start---",
"---Example 1 Start---",
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
"---Example End---",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
"---Example 2 Start---",
"\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \nPathways 30 \n1.16% pa \n0.02% pa \n0.05% pa \n0.05% pa \nPathways 70 \n1.30% pa \n0.01% pa \n0.06% pa \n0.04% pa \n",
"---Example 2 End---",
"The management_fee_and_costs is the value of \"Management fee\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".",
"The management_fee is the value of \"Management fee\".",
"The performance_fee_costs is the value of \"Estimated performance-related fee\".",
"The indirect_costs is the value of \"Estimated other indirect costs\".",
"The recoverable_expenses is the value of \"Recoverable expenses\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"Pathways 30\", \"share name\": \"Pathways 30\", \"management_fee_and_costs\": 1.23, \"management_fee\": 1.16, \"recoverable_expenses\": 0.02, \"performance_fee_costs\": 0.05, \"indirect_costs\": 0.05}, {\"fund name\": \"Pathways 70\", \"share name\": \"Pathways 70\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.3, \"recoverable_expenses\": 0.01, \"performance_fee_costs\": 0.06, \"indirect_costs\": 0.04}]}",
"\n",
"D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---",
@ -230,29 +240,23 @@
"\n",
"H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
"---Example 1 Start---",
"Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n",
"---Example 1 End---",
"The column: \"Equals investment fees and costs\" is the sum of \"Performance fee\" and \"Plus other investment fees and costs\", we should ignore the \"Performance fee\" value, just output the \"Plus other investment fees and costs\" value.",
"The \"Plus other investment fees and costs\" could be the values for both of \"management fee\" and \"management fee and costs\", so the output should be:",
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
"---Example 2 Start---",
"MANAGEMENT COSTS AND TRANSACTION COSTS \n\nOption name Management costs \nEstimated \nperformance \nfee (pa) 1 \nTotal management\ncosts (including\nestimated performance\nfee) pa\nTransaction costs \nper transaction (%) \nMULTI-MANAGER MULTI-SECTOR (These investment options are located in the Investment Options Menu on pages 18 to 19.) \nFirstChoice Wholesale Defensive 0.85% 0.85% 0.15\nFirstChoice Wholesale Conservative 0.90% 0.02%1 0.92% 1 0.15 \n",
"---Example 2 End---",
"---Example 1 End---",
"The column: \"Total management costs (including estimated performance fee) pa\" is the sum of \"Management costs\" and \"Estimated performance fee (pa)\", we should ignore the \"Estimated performance fee (pa)\" value, just output the \"Management costs\" value.",
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:",
"{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}",
"---Example 3 Start---",
"---Example 2 Start---",
"Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n",
"---Example 3 End---",
"---Example 2 End---",
"The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.",
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\".",
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.",
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.",
"So the output should be:",
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}",
"---Example 4 Start---",
"---Example 3 Start---",
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
"---Example 4 End---",
"---Example 3 End---",
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
@ -350,17 +354,33 @@
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
],
"buy_spread": [
"A. Exclude reported name",
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)",
"B. Simple case with simple table structure:",
"---Example 1 Start---",
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
],
"performance_fee_costs": [
"Performance fees is share class level data.",
"If the performance fees is with the range, please ignore and output empty.",
"A. If the performance fees is with the range, please ignore and output empty.",
"---Example 1 Start---",
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
"---Example 1 End---",
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
"{\"data\": []}"
"{\"data\": []}",
"B. If with pure performance fee in table, please extract relevant values",
"---Example Start---",
"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
"---Example End---",
"a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.",
"b. This example mentioned share classes, please output according to share class.",
"The output should be",
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}"
],
"minimum_initial_investment": [
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
@ -458,15 +478,15 @@
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
"---Example 1 Start---",
"Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
"\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
"---Example 1 End---",
"The data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
"The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.",
"Please pay attention below information",
"Assume the column sequence number is from 1.",
"Assume the numeric column sequence number is from 1.",
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.",
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ",
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"Therefore, the output should be:",
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}"
]
@ -485,10 +505,11 @@
"it means each investment name is with only one fund name, it is for TTR.",
"For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".",
"---Example 1 Start---",
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares, 0.07% p.a. for Cash \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
"---Example 1 End---",
"Please read the context carefully, especially for \"Retirement and TTR income streams\" part, output all of fund names and relevant values",
"The output should be:",
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, , {\"fund name\": \"Cash Pension\", \"share name\": \"Cash Pension\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Cash TTR\", \"share name\": \"Cash TTR\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
]
},
{
@ -532,6 +553,24 @@
"The output should be: ",
"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
]
},
{
"keywords":["Plus other investment fees and costs \nEquals investment fees and costs"],
"prompts": [
"Complex management fee and costs rule:",
"If the table with columns:",
"\"Performance fee\", \"Plus other investment fees and costs\", \"Equals investment fees and costs\", \"Transaction costs(net)\", \"Buy-sell spreads\", \"Transaction costs(gross)\".",
"Both of the management_fee and management_fee_costs are \"Plus other investment fees and costs\".",
"The performance_fee_costs is \"Performance fee\".",
"The buy_spread and sell_spread are \"Buy-sell spreads\".",
"---Example Start---",
"Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross) \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.04 \n0.10 / 0.10 \n0.09 \n",
"---Example End---",
"Please ignore the 3rd column: \"Equals investment fees and costs\" values!!",
"Please read context carefully, don't miss any data row!!",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
]
}
]
}

View File

@ -1526,8 +1526,8 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"]
re_run_extract_data = False
re_run_mapping_data = False
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = True
doc_source = "aus_prospectus"
# doc_source = "emea_ar"
@ -1560,8 +1560,8 @@ if __name__ == "__main__":
# "544886057",
# "550769189",
# "553449663"]
special_doc_id_list = ["420339794", "441280757", "454036250", "471206458", "412778803"]
# special_doc_id_list = ["441280757"]
# special_doc_id_list = ["414751292"]
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

View File

@ -1096,6 +1096,10 @@ def replace_special_table_header(replace_table_header_config: list, page_text: s
break
if updated_text:
break
# split numbers like 1.320.00 to be 1.32 0.00 by regex
if re.search(r'(\d)\.(\d{2})(\d)\.(\d{2})', page_text):
page_text = re.sub(r'(\d)\.(\d{2})(\d)\.(\d{2})', r'\1.\2 \3.\4', page_text)
return page_text