optimized for management_fee_and_costs and administration_fees
This commit is contained in:
parent
fa2dede454
commit
4ee762963e
|
|
@ -713,6 +713,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
||||||
continue
|
continue
|
||||||
doc_verify_sec_row = doc_verify_sec_data.iloc[0]
|
doc_verify_sec_row = doc_verify_sec_data.iloc[0]
|
||||||
raw_fund_name = doc_verify_sec_row["raw_fund_name"]
|
raw_fund_name = doc_verify_sec_row["raw_fund_name"]
|
||||||
|
raw_share_name = doc_verify_sec_row["raw_share_name"]
|
||||||
v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"])
|
v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"])
|
||||||
v_management_fee = str(doc_verify_sec_row["management_fee"])
|
v_management_fee = str(doc_verify_sec_row["management_fee"])
|
||||||
v_administration_fees = str(doc_verify_sec_row["administration_fees"])
|
v_administration_fees = str(doc_verify_sec_row["administration_fees"])
|
||||||
|
|
@ -733,28 +734,28 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
||||||
# v_activity_fee = str(doc_verify_sec_row["activity_fee"])
|
# v_activity_fee = str(doc_verify_sec_row["activity_fee"])
|
||||||
|
|
||||||
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs")
|
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee_and_costs"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs"))
|
||||||
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee")
|
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee"))
|
||||||
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees")
|
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "administration_fees"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees"))
|
||||||
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment")
|
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment"))
|
||||||
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name")
|
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name"))
|
||||||
if is_for_all:
|
if is_for_all:
|
||||||
message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list)
|
message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee"))
|
||||||
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost,
|
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost,
|
||||||
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list)
|
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost"))
|
||||||
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list)
|
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread"))
|
||||||
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list)
|
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread"))
|
||||||
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges,
|
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges,
|
||||||
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list)
|
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges"))
|
||||||
# message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list)
|
# message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list)
|
||||||
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee"))
|
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee"))
|
||||||
# message = get_gt_pred_by_compare_values(switching_fee, v_switching_fee, gt_switching_fee_list, pred_switching_fee_list)
|
# message = get_gt_pred_by_compare_values(switching_fee, v_switching_fee, gt_switching_fee_list, pred_switching_fee_list)
|
||||||
|
|
@ -763,9 +764,10 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
||||||
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "activity_fee"))
|
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "activity_fee"))
|
||||||
|
|
||||||
message_data_df = pd.DataFrame(message_list)
|
message_data_df = pd.DataFrame(message_list)
|
||||||
message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', 'data_point', 'gt_value', 'pred_value', 'error']]
|
message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name',
|
||||||
|
'raw_share_name', 'data_point', 'gt_value', 'pred_value', 'error']]
|
||||||
# order by doc_id, raw_fund_name, data_point
|
# order by doc_id, raw_share_name, data_point
|
||||||
message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_fund_name', 'data_point'])
|
message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_share_name', 'data_point'])
|
||||||
message_data_df.reset_index(drop=True, inplace=True)
|
message_data_df.reset_index(drop=True, inplace=True)
|
||||||
|
|
||||||
# calculate metrics
|
# calculate metrics
|
||||||
|
|
@ -1036,6 +1038,7 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/
|
||||||
continue
|
continue
|
||||||
doc_verify_sec_row = doc_verify_sec_data.iloc[0]
|
doc_verify_sec_row = doc_verify_sec_data.iloc[0]
|
||||||
raw_fund_name = doc_verify_sec_row["raw_fund_name"]
|
raw_fund_name = doc_verify_sec_row["raw_fund_name"]
|
||||||
|
raw_share_name = doc_verify_sec_row["raw_share_name"]
|
||||||
v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"])
|
v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"])
|
||||||
v_management_fee = str(doc_verify_sec_row["management_fee"])
|
v_management_fee = str(doc_verify_sec_row["management_fee"])
|
||||||
v_administration_fees = str(doc_verify_sec_row["administration_fees"])
|
v_administration_fees = str(doc_verify_sec_row["administration_fees"])
|
||||||
|
|
@ -1053,67 +1056,68 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/
|
||||||
provider_gt_pred_data[provider_id]["gt_management_fee_and_costs_list"],
|
provider_gt_pred_data[provider_id]["gt_management_fee_and_costs_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_management_fee_and_costs_list"],
|
provider_gt_pred_data[provider_id]["pred_management_fee_and_costs_list"],
|
||||||
data_point="management_fee_and_costs")
|
data_point="management_fee_and_costs")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee_and_costs"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs"))
|
||||||
message = get_gt_pred_by_compare_values(management_fee,
|
message = get_gt_pred_by_compare_values(management_fee,
|
||||||
v_management_fee,
|
v_management_fee,
|
||||||
provider_gt_pred_data[provider_id]["gt_management_fee_list"],
|
provider_gt_pred_data[provider_id]["gt_management_fee_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_management_fee_list"],
|
provider_gt_pred_data[provider_id]["pred_management_fee_list"],
|
||||||
data_point="management_fee")
|
data_point="management_fee")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee"))
|
||||||
message = get_gt_pred_by_compare_values(administration_fees,
|
message = get_gt_pred_by_compare_values(administration_fees,
|
||||||
v_administration_fees,
|
v_administration_fees,
|
||||||
provider_gt_pred_data[provider_id]["gt_administration_fees_list"],
|
provider_gt_pred_data[provider_id]["gt_administration_fees_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_administration_fees_list"],
|
provider_gt_pred_data[provider_id]["pred_administration_fees_list"],
|
||||||
data_point="administration_fees")
|
data_point="administration_fees")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "administration_fees"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees"))
|
||||||
message = get_gt_pred_by_compare_values(minimum_initial_investment,
|
message = get_gt_pred_by_compare_values(minimum_initial_investment,
|
||||||
v_minimum_initial_investment,
|
v_minimum_initial_investment,
|
||||||
provider_gt_pred_data[provider_id]["gt_minimum_initial_investment_list"],
|
provider_gt_pred_data[provider_id]["gt_minimum_initial_investment_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_minimum_initial_investment_list"],
|
provider_gt_pred_data[provider_id]["pred_minimum_initial_investment_list"],
|
||||||
data_point="minimum_initial_investment")
|
data_point="minimum_initial_investment")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment"))
|
||||||
message = get_gt_pred_by_compare_values(benchmark_name,
|
message = get_gt_pred_by_compare_values(benchmark_name,
|
||||||
v_benchmark_name,
|
v_benchmark_name,
|
||||||
provider_gt_pred_data[provider_id]["gt_benchmark_name_list"],
|
provider_gt_pred_data[provider_id]["gt_benchmark_name_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_benchmark_name_list"],
|
provider_gt_pred_data[provider_id]["pred_benchmark_name_list"],
|
||||||
data_point="benchmark_name")
|
data_point="benchmark_name")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name"))
|
||||||
if is_for_all:
|
if is_for_all:
|
||||||
message = get_gt_pred_by_compare_values(performance_fee,
|
message = get_gt_pred_by_compare_values(performance_fee,
|
||||||
v_performance_fee,
|
v_performance_fee,
|
||||||
provider_gt_pred_data[provider_id]["gt_performance_fee_list"],
|
provider_gt_pred_data[provider_id]["gt_performance_fee_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_performance_fee_list"],
|
provider_gt_pred_data[provider_id]["pred_performance_fee_list"],
|
||||||
data_point="performance_fee")
|
data_point="performance_fee")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee"))
|
||||||
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost,
|
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost,
|
||||||
v_interposed_vehicle_performance_fee_cost,
|
v_interposed_vehicle_performance_fee_cost,
|
||||||
provider_gt_pred_data[provider_id]["gt_interposed_vehicle_performance_fee_cost_list"],
|
provider_gt_pred_data[provider_id]["gt_interposed_vehicle_performance_fee_cost_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_interposed_vehicle_performance_fee_cost_list"],
|
provider_gt_pred_data[provider_id]["pred_interposed_vehicle_performance_fee_cost_list"],
|
||||||
data_point="interposed_vehicle_performance_fee_cost")
|
data_point="interposed_vehicle_performance_fee_cost")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost"))
|
||||||
message = get_gt_pred_by_compare_values(buy_spread,
|
message = get_gt_pred_by_compare_values(buy_spread,
|
||||||
v_buy_spread,
|
v_buy_spread,
|
||||||
provider_gt_pred_data[provider_id]["gt_buy_spread_list"],
|
provider_gt_pred_data[provider_id]["gt_buy_spread_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_buy_spread_list"],
|
provider_gt_pred_data[provider_id]["pred_buy_spread_list"],
|
||||||
data_point="buy_spread")
|
data_point="buy_spread")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread"))
|
||||||
message = get_gt_pred_by_compare_values(sell_spread,
|
message = get_gt_pred_by_compare_values(sell_spread,
|
||||||
v_sell_spread,
|
v_sell_spread,
|
||||||
provider_gt_pred_data[provider_id]["gt_sell_spread_list"],
|
provider_gt_pred_data[provider_id]["gt_sell_spread_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_sell_spread_list"],
|
provider_gt_pred_data[provider_id]["pred_sell_spread_list"],
|
||||||
data_point="sell_spread")
|
data_point="sell_spread")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread"))
|
||||||
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges,
|
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges,
|
||||||
v_total_annual_dollar_based_charges,
|
v_total_annual_dollar_based_charges,
|
||||||
provider_gt_pred_data[provider_id]["gt_total_annual_dollar_based_charges_list"],
|
provider_gt_pred_data[provider_id]["gt_total_annual_dollar_based_charges_list"],
|
||||||
provider_gt_pred_data[provider_id]["pred_total_annual_dollar_based_charges_list"],
|
provider_gt_pred_data[provider_id]["pred_total_annual_dollar_based_charges_list"],
|
||||||
data_point="total_annual_dollar_based_charges")
|
data_point="total_annual_dollar_based_charges")
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges"))
|
||||||
|
|
||||||
message_data_df = pd.DataFrame(message_list)
|
message_data_df = pd.DataFrame(message_list)
|
||||||
message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', 'data_point', 'gt_value', 'pred_value', 'error']]
|
message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name',
|
||||||
|
'raw_share_name', 'data_point', 'gt_value', 'pred_value', 'error']]
|
||||||
# order by doc_id, raw_fund_name, data_point
|
# order by doc_id, raw_share_name, data_point
|
||||||
message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_fund_name', 'data_point'])
|
message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_share_name', 'data_point'])
|
||||||
message_data_df.reset_index(drop=True, inplace=True)
|
message_data_df.reset_index(drop=True, inplace=True)
|
||||||
|
|
||||||
# calculate metrics
|
# calculate metrics
|
||||||
|
|
@ -1261,10 +1265,17 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def generate_message(message: dict, doc_id: str, sec_id: str, fund_legal_name: str, raw_fund_name: str, datapoint: str):
|
def generate_message(message: dict,
|
||||||
|
doc_id: str,
|
||||||
|
sec_id: str,
|
||||||
|
fund_legal_name: str,
|
||||||
|
raw_fund_name: str,
|
||||||
|
raw_share_name: str,
|
||||||
|
datapoint: str):
|
||||||
message["data_point"] = datapoint
|
message["data_point"] = datapoint
|
||||||
message["fund_legal_name"] = fund_legal_name
|
message["fund_legal_name"] = fund_legal_name
|
||||||
message["raw_fund_name"] = raw_fund_name
|
message["raw_fund_name"] = raw_fund_name
|
||||||
|
message["raw_share_name"] = raw_share_name
|
||||||
message["sec_id"] = sec_id
|
message["sec_id"] = sec_id
|
||||||
message["doc_id"] = str(doc_id)
|
message["doc_id"] = str(doc_id)
|
||||||
return message
|
return message
|
||||||
|
|
@ -1334,27 +1345,29 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
|
audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
|
||||||
audit_data_sheet: str = "Sheet1"
|
audit_data_sheet: str = "Sheet1"
|
||||||
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250306171226.xlsx"
|
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250308202351.xlsx"
|
||||||
verify_data_sheet: str = "total_data"
|
verify_data_sheet: str = "total_mapping_data"
|
||||||
# verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt"
|
# verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt"
|
||||||
verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"]
|
verify_document_list_file_list = [None,
|
||||||
|
"./sample_documents/aus_prospectus_29_documents_sample.txt",
|
||||||
|
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
|
||||||
is_for_all = False
|
is_for_all = False
|
||||||
# for verify_document_list_file in verify_document_list_file_list:
|
|
||||||
# calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
|
|
||||||
# audit_data_sheet=audit_data_sheet,
|
|
||||||
# verify_file_path=verify_file_path,
|
|
||||||
# verify_data_sheet=verify_data_sheet,
|
|
||||||
# verify_document_list_file = verify_document_list_file,
|
|
||||||
# is_for_all=is_for_all)
|
|
||||||
|
|
||||||
for verify_document_list_file in verify_document_list_file_list:
|
for verify_document_list_file in verify_document_list_file_list:
|
||||||
calculate_metrics_by_provider(audit_file_path=audit_file_path,
|
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
|
||||||
audit_data_sheet=audit_data_sheet,
|
audit_data_sheet=audit_data_sheet,
|
||||||
verify_file_path=verify_file_path,
|
verify_file_path=verify_file_path,
|
||||||
verify_data_sheet=verify_data_sheet,
|
verify_data_sheet=verify_data_sheet,
|
||||||
verify_document_list_file = verify_document_list_file,
|
verify_document_list_file = verify_document_list_file,
|
||||||
is_for_all=is_for_all)
|
is_for_all=is_for_all)
|
||||||
|
|
||||||
|
# for verify_document_list_file in verify_document_list_file_list:
|
||||||
|
# calculate_metrics_by_provider(audit_file_path=audit_file_path,
|
||||||
|
# audit_data_sheet=audit_data_sheet,
|
||||||
|
# verify_file_path=verify_file_path,
|
||||||
|
# verify_data_sheet=verify_data_sheet,
|
||||||
|
# verify_document_list_file = verify_document_list_file,
|
||||||
|
# is_for_all=is_for_all)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# set_mapping_to_17_documents_data()
|
# set_mapping_to_17_documents_data()
|
||||||
|
|
|
||||||
|
|
@ -302,6 +302,8 @@ class DataExtraction:
|
||||||
raw_name_dict.pop(raw_name_as_production_name)
|
raw_name_dict.pop(raw_name_as_production_name)
|
||||||
|
|
||||||
for data_dict in data_list:
|
for data_dict in data_list:
|
||||||
|
# if data_dict.get("page_index", -1) > 9:
|
||||||
|
# break
|
||||||
extract_data = data_dict.get("extract_data", {})
|
extract_data = data_dict.get("extract_data", {})
|
||||||
data = extract_data.get("data", [])
|
data = extract_data.get("data", [])
|
||||||
remove_item_list = []
|
remove_item_list = []
|
||||||
|
|
@ -312,7 +314,10 @@ class DataExtraction:
|
||||||
share_name = data_item.get("share_name", "")
|
share_name = data_item.get("share_name", "")
|
||||||
raw_name = self.get_raw_name(fund_name, share_name)
|
raw_name = self.get_raw_name(fund_name, share_name)
|
||||||
if raw_name.lower() in self.document_production.lower():
|
if raw_name.lower() in self.document_production.lower():
|
||||||
dp_keys = [key for key in keys if key not in ["fund_name", "share_name"]]
|
dp_keys = [key for key in keys if key not in ["fund_name",
|
||||||
|
"share_name",
|
||||||
|
"management_fee_and_costs",
|
||||||
|
"management_fee"]]
|
||||||
for dp_key in dp_keys:
|
for dp_key in dp_keys:
|
||||||
if dp_key not in datapoint_list_with_production_name:
|
if dp_key not in datapoint_list_with_production_name:
|
||||||
datapoint_list_with_production_name.append(dp_key)
|
datapoint_list_with_production_name.append(dp_key)
|
||||||
|
|
@ -593,7 +598,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num != 18:
|
# if page_num != 25:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -145,15 +145,32 @@
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": []}",
|
"{\"data\": []}",
|
||||||
"\n",
|
"\n",
|
||||||
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
|
"B. The table title is with Ongoing annual fees and costs.",
|
||||||
"B.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
"B.1 Management fees and costs should not include transaction costs and performance fees.",
|
||||||
|
"---Example Start",
|
||||||
|
"Ongoing annual\nfees and costs\nC Class and E Class -P Class - Performance \nStandard Fee Option Fee Option \n36 \n(E Class is closed to \nnew investors) \nPlatinum International Fund 1.56% p.a. 1.46% p.a. \nOngoing annual fees and costs include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees (for P Class – Performance Fee Option \nonly). Please see page 36 for further information.",
|
||||||
|
"---Example End",
|
||||||
|
"The values 1.56 and 1.46 include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees, should ignore them.",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": []}",
|
||||||
|
"B.2 If with pure management fees and costs in table, please extract relevant values",
|
||||||
|
"---Example Start---",
|
||||||
|
"Fees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nManagement fees and costs \nEstimated management fees and costs \nper annum are: \nPlatinum International Fund 1.41% 1.16%\nPlatinum Global Fund (Long Only) 1.35% 1.10%\n",
|
||||||
|
"---Example End---",
|
||||||
|
"a. For this example, there is pure \"Management fees and costs\", please extract relevant values.",
|
||||||
|
"b. This example mentioned share classes, please output according to share class.",
|
||||||
|
"The output should be",
|
||||||
|
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.16, \"management_fee\": 1.16}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.1, \"management_fee\": 1.1}]}",
|
||||||
|
"\n",
|
||||||
|
"C. If there are multiple Management fee and costs sub-columns, here is the rule: ",
|
||||||
|
"C.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
|
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}",
|
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"B.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
|
"C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
|
||||||
"The management_fee is the value of \"Management fee (% pa)\".",
|
"The management_fee is the value of \"Management fee (% pa)\".",
|
||||||
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
|
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
|
|
@ -162,7 +179,7 @@
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
|
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
"D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
|
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
|
|
@ -173,15 +190,20 @@
|
||||||
"---Example 2 End---",
|
"---Example 2 End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
|
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
|
||||||
|
"---Example 3 Start---",
|
||||||
|
"Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.20–0.50 5 \n\n",
|
||||||
|
"---Example 3 End---",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"D. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
"E. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Underlying FundManagement fee component \nVanguard High Growth Index Fund1.50% p.a. of the NAV of the Underlying Fund\n",
|
"Underlying FundManagement fee component \nVanguard High Growth Index Fund1.50% p.a. of the NAV of the Underlying Fund\n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}",
|
"{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"E. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
"F. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n",
|
"\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
|
|
@ -191,7 +213,7 @@
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}",
|
"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.",
|
"G. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
|
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
|
|
@ -208,7 +230,7 @@
|
||||||
"The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:",
|
"The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:",
|
||||||
"{\"data\": []}",
|
"{\"data\": []}",
|
||||||
"\n",
|
"\n",
|
||||||
"G. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
|
"H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n",
|
"Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
|
|
@ -237,7 +259,35 @@
|
||||||
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
|
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
|
||||||
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
|
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
|
||||||
"So the output should be:",
|
"So the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}"
|
"{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}",
|
||||||
|
"\n",
|
||||||
|
"I. Some tables are very complex, with many data point columns; please extract only the relevant values.",
|
||||||
|
"---Example 1 Start---",
|
||||||
|
"Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) \nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n",
|
||||||
|
"---Example 1 End---",
|
||||||
|
"For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ",
|
||||||
|
"\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ",
|
||||||
|
"\"Performance fee (p.a.)\" as performance_fee, ",
|
||||||
|
"\"Buy/sell spread (%)\" as buy_spread and sell_spread.",
|
||||||
|
"If one row has 5 decimal numbers, ",
|
||||||
|
"the 2nd decimal number is the administration_fees, ",
|
||||||
|
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
|
||||||
|
"the 4th decimal number is the performance_fee, ",
|
||||||
|
"the 5th decimal number is the buy_spread and sell_spread.",
|
||||||
|
"If one row has 4 decimal numbers, ",
|
||||||
|
"the 2nd decimal number is the administration_fees, ",
|
||||||
|
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
|
||||||
|
"the 4th decimal number is the buy_spread and sell_spread.",
|
||||||
|
"Please always ignore the 1st decimal number; we do not need the total sum values.",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}",
|
||||||
|
"J. If exist **\"Maximum management fee\"** in context, please ignore relevant values.",
|
||||||
|
"---Example Start---",
|
||||||
|
"Fund name \nMaximum \nmanagement \nfee (p.a.) \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%",
|
||||||
|
"---Example End---",
|
||||||
|
"The values in the example are **Maximum management fee** values, so all of them should be ignored.",
|
||||||
|
"The Output should be:",
|
||||||
|
"{\"data\": []}"
|
||||||
],
|
],
|
||||||
"administration_fees":[
|
"administration_fees":[
|
||||||
"Administration fees and costs is share class level data.",
|
"Administration fees and costs is share class level data.",
|
||||||
|
|
@ -253,6 +303,11 @@
|
||||||
"----Example 2 End----",
|
"----Example 2 End----",
|
||||||
"The administration fee is $1.00 per week plus 0.17% pa, so the output should be:",
|
"The administration fee is $1.00 per week plus 0.17% pa, so the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}",
|
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}",
|
||||||
|
"---Example 3 Start---",
|
||||||
|
"\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. \n",
|
||||||
|
"---Example 3 End---",
|
||||||
|
"The administration fee is $1.30 per week plus 0.50% p.a., so the output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.50}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"Complex cases:",
|
"Complex cases:",
|
||||||
"A. Need to add multiple numbers together.",
|
"A. Need to add multiple numbers together.",
|
||||||
|
|
@ -342,44 +397,43 @@
|
||||||
{
|
{
|
||||||
"management_fee_and_costs": [
|
"management_fee_and_costs": [
|
||||||
{
|
{
|
||||||
"keywords": ["Estimated investment \ncosts \nAdministration \nfees"],
|
"keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"],
|
||||||
"prompts": ["Complex management fee and costs rule:",
|
"prompts": ["Complex management fee and costs rule:",
|
||||||
"If the table with columns:",
|
"If the table with columns:",
|
||||||
"\"Administration fees (% pa)\", \"Investment fees (% pa)\" and \"Estimated other investment costs (% pa)\"",
|
"\"Administration fees\", \"Investment fees\" ,\"Estimated other investment costs\" and \"Estimated performance fees\"",
|
||||||
"The administration_fees is \"Administration fees (% pa)\"",
|
"The administration_fees is \"Administration fees\"",
|
||||||
"The management_fee is \"Investment fees (% pa)\".",
|
"The management_fee is \"Investment fees\".",
|
||||||
"The management_fee_and_costs is \"Investment fees (% pa)\" + \"Estimated other investment costs (% pa)\".",
|
"The management_fee_and_costs is \"Investment fees\" + \"Estimated other investment costs\".",
|
||||||
|
"The performance_fee is \"Estimated performance fees\"",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Investment \noption \nAdministration fees and \nestimated administration costs \nInvestment fees and estimated \ninvestment costs \nEstimated investment \ncosts \nAdministration \nfees \n(% pa) \nInvestment \nfees \n(% pa) \n2 \nEstimated \ntotal \nongoing \nEstimated \nadministration \ncosts \n(% pa) \n1 \nEstimated \nperformance \nfees \n(% pa) \n3 \nEstimated \ntransaction \ncosts \n(% pa) \n5 \nEstimated \nother \ninvestment \ncosts \n(% pa) \n4 \nannual \nfees and \ncosts \n(% pa) \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nProperty and infrastructure \nLazard Global \nListed \nInfrastructure \n0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n",
|
"\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nInternational shares \nPerpetual Global \nInnovation Share \n0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"For this case, although the table header is with disorder issue during PDF contents extraction issue.",
|
"The data points numbers order in data row (for example: 0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n) is correct as initial table structure.",
|
||||||
"But the data points numbers order in data row (for example: 0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n) is correct as initial table structure.",
|
|
||||||
"Please pay attention below information",
|
"Please pay attention below information",
|
||||||
"Assume the column sequence number is from 1.",
|
"Assume the column sequence number is from 1.",
|
||||||
"\"Administration fees (% pa)\" values are as the column 1 numbers, \"Investment fees (% pa)\" values are as the column 3 numbers, \"Estimated other investment costs (% pa)\" values are as the column 5 numbers.",
|
"\"Administration fees\" values are as the column 1 numbers, \"Investment fees\" values are as the column 3 numbers, \"Estimated other investment costs\" values are as the column 5 numbers, \"Estimated performance fees\" values are as the column 4 numbers.",
|
||||||
"For fund: Lazard Global Listed Infrastructure, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.8, the management_fee_and_costs should be 0.88 = 0.8(the column 3 number) + 0.08 (the column 5 number)",
|
"For fund: Perpetual Global Innovation Share, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.99, the management_fee_and_costs should be 1 = 0.99(the column 3 number) + 0.01 (the column 5 number), the performance_fee should be 2.3 (the column 4 number)",
|
||||||
"Therefore, the output should be:",
|
"Therefore, the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0, \"administration_fees\": 0.25}]}, {\"fund name\": \"Lazard Global Listed Infrastructure\", \"share name\": \"Lazard Global Listed Infrastructure\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.08, \"administration_fees\": 0.25}"
|
"{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"administration_fees\": 0.25}, {\"fund name\": \"Perpetual Global Innovation Share\", \"share name\": \"Perpetual Global Innovation Share\", \"management_fee_and_costs\": 1, \"management_fee\": 0.99, \"administration_fees\": 0.25, \"performance_fee\": 2.3}]}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"keywords": ["Entry Fee \nNil Entry"],
|
"keywords": ["Entry Fee option \nNil Entry option"],
|
||||||
"prompts": ["Complex management fee and costs rule:",
|
"prompts": ["Complex management fee and costs rule:",
|
||||||
"If the table with columns:",
|
"If the table with columns:",
|
||||||
"\"Entry Fee option\", \"Nil Entry Free option\", \"Estimated other investment costs\", \"Estimated Performance fees (B)\"",
|
"\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"",
|
||||||
"The performance_fee is \"Estimated Performance fees (B)\"",
|
"The performance_fee is \"Estimated Performance fees\"",
|
||||||
"The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. if fund name is \"MultiSeries 30\", the Entry Fee fund name is \"MultiSeries 30 Entry Fee\"",
|
"The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. if fund name is \"MultiSeries 30\", the Entry Fee fund name is \"MultiSeries 30 Entry Fee\"",
|
||||||
"The fund name's tail is \"Nil Entry\" for \"Nil Entry Free option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".",
|
"The fund name's tail is \"Nil Entry\" for \"Nil Entry option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".",
|
||||||
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
||||||
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry Free option\" + \"Estimated other investment costs\".",
|
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
|
"Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"For this case, although the table header is with disorder issue during PDF contents extraction issue.",
|
"The data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
|
||||||
"But the data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
|
|
||||||
"Please pay attention below information",
|
"Please pay attention below information",
|
||||||
"Assume the column sequence number is from 1.",
|
"Assume the column sequence number is from 1.",
|
||||||
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry Free option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees (B)\" values are as the column 4 numbers.",
|
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.",
|
||||||
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
|
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
|
||||||
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
||||||
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
||||||
|
|
|
||||||
6
main.py
6
main.py
|
|
@ -1526,8 +1526,8 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
|
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = False
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
doc_source = "aus_prospectus"
|
doc_source = "aus_prospectus"
|
||||||
# doc_source = "emea_ar"
|
# doc_source = "emea_ar"
|
||||||
|
|
@ -1560,7 +1560,7 @@ if __name__ == "__main__":
|
||||||
# "544886057",
|
# "544886057",
|
||||||
# "550769189",
|
# "550769189",
|
||||||
# "553449663"]
|
# "553449663"]
|
||||||
# special_doc_id_list = ["539241700"]
|
# special_doc_id_list = ["506913190"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
|
|
@ -1099,6 +1099,18 @@ def replace_special_table_header(page_text: str):
|
||||||
"regex_all_list":
|
"regex_all_list":
|
||||||
[r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"],
|
[r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"],
|
||||||
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
|
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# item 0: document 401212184, page 17 - 20
|
||||||
|
"regex_all_list":
|
||||||
|
[r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Entry\s*Fee[\s\S]*?Nil\s*Entry[\s\S]*?Other\s*investment\s*costs[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Nil\s*Entry\s*Fee\s*.*\n",
|
||||||
|
r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Estimated\s*Other[\s\S]*?Entry\s*Fee\s*Nil\s*Entry[\s\S]*?Nil\s*Entry[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Fee\s*option.*\n"],
|
||||||
|
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
[r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"],
|
||||||
|
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
updated_text = False
|
updated_text = False
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue