1. optimize performance_fee_costs prompts

2. support calculating metrics with zero optionally treated as equal to an empty value
This commit is contained in:
Blade He 2025-03-12 23:45:52 -05:00
parent c2c0b33015
commit fb5dda2170
3 changed files with 36 additions and 22 deletions

View File

@ -559,7 +559,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx", verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
verify_data_sheet: str = "total_data", verify_data_sheet: str = "total_data",
verify_document_list_file: str = None, verify_document_list_file: str = None,
is_for_all: bool = False is_for_all: bool = False,
zero_equal_none: bool = False
): ):
print("Start to calculate metrics based on DB data file and extracted file...") print("Start to calculate metrics based on DB data file and extracted file...")
audit_data_df = pd.DataFrame() audit_data_df = pd.DataFrame()
@ -733,28 +734,30 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
# v_switching_fee = str(doc_verify_sec_row["switching_fee"]) # v_switching_fee = str(doc_verify_sec_row["switching_fee"])
# v_activity_fee = str(doc_verify_sec_row["activity_fee"]) # v_activity_fee = str(doc_verify_sec_row["activity_fee"])
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs") message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs"))
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee") message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee"))
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees") message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees"))
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment") message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment"))
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name", zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name"))
if is_for_all: if is_for_all:
message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list) message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list, zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs"))
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost,
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list,
zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost"))
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list, zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread"))
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list, zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread"))
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges,
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list,
zero_equal_none=zero_equal_none)
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges")) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges"))
# message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list) # message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list)
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee")) # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee"))
@ -892,7 +895,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
os.makedirs(output_folder, exist_ok=True) os.makedirs(output_folder, exist_ok=True)
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
if is_for_all: if is_for_all:
verify_file_name = f"metrics_{verify_file_name}_all" verify_file_name = f"{verify_file_name}_all"
metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx" metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx"
output_file = os.path.join(output_folder, metrics_file_name) output_file = os.path.join(output_folder, metrics_file_name)
with pd.ExcelWriter(output_file) as writer: with pd.ExcelWriter(output_file) as writer:
@ -1280,11 +1283,11 @@ def generate_message(message: dict,
return message return message
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""): def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = "", zero_equal_none: bool = False):
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""} message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
if gt_value is not None and len(str(gt_value).strip()) > 0: if gt_value is not None and len(str(gt_value).strip()) > 0:
gt_list.append(1) gt_list.append(1)
gt_equal_pred = is_equal(gt_value, pred_value, data_point) gt_equal_pred = is_equal(gt_value, pred_value, data_point, zero_equal_none=zero_equal_none)
if gt_equal_pred: if gt_equal_pred:
pred_list.append(1) pred_list.append(1)
else: else:
@ -1300,7 +1303,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
pred_num = float(pred_value) pred_num = float(pred_value)
# round to 2 decimal places # round to 2 decimal places
pred_value = round(pred_num, 4) pred_value = round(pred_num, 4)
if pred_value == 0: if zero_equal_none and pred_value == 0:
gt_list.append(1) gt_list.append(1)
pred_list.append(1) pred_list.append(1)
else: else:
@ -1319,7 +1322,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
return message return message
def is_equal(gt_value, pred_value, data_point: str = ""): def is_equal(gt_value, pred_value, data_point: str = "", zero_equal_none: bool = False):
if gt_value is not None and len(str(gt_value).strip()) > 0: if gt_value is not None and len(str(gt_value).strip()) > 0:
if pred_value is not None and len(str(pred_value).strip()) > 0: if pred_value is not None and len(str(pred_value).strip()) > 0:
if gt_value == "0.0": if gt_value == "0.0":
@ -1357,7 +1360,7 @@ def is_equal(gt_value, pred_value, data_point: str = ""):
gt_value = round(gt_num, 4) gt_value = round(gt_num, 4)
except Exception as e: except Exception as e:
pass pass
if gt_value == 0: if zero_equal_none and gt_value == 0:
return True return True
return False return False
@ -1429,6 +1432,7 @@ if __name__ == "__main__":
verify_document_list_file_list = [None, verify_document_list_file_list = [None,
"./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_29_documents_sample.txt",
"./sample_documents/aus_prospectus_17_documents_sample.txt"] "./sample_documents/aus_prospectus_17_documents_sample.txt"]
zero_equal_none = False
is_for_all = True is_for_all = True
for verify_document_list_file in verify_document_list_file_list: for verify_document_list_file in verify_document_list_file_list:
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
@ -1436,7 +1440,8 @@ if __name__ == "__main__":
verify_file_path=verify_file_path, verify_file_path=verify_file_path,
verify_data_sheet=verify_data_sheet, verify_data_sheet=verify_data_sheet,
verify_document_list_file = verify_document_list_file, verify_document_list_file = verify_document_list_file,
is_for_all=is_for_all) is_for_all=is_for_all,
zero_equal_none=zero_equal_none)
# for verify_document_list_file in verify_document_list_file_list: # for verify_document_list_file in verify_document_list_file_list:
# calculate_metrics_by_provider(audit_file_path=audit_file_path, # calculate_metrics_by_provider(audit_file_path=audit_file_path,

View File

@ -366,12 +366,21 @@
], ],
"performance_fee_costs": [ "performance_fee_costs": [
"Performance fees is share class level data.", "Performance fees is share class level data.",
"If the performance fees is with the range, please ignore and output empty.", "A. If the performance fees is with the range, please ignore and output empty.",
"---Example 1 Start---", "---Example 1 Start---",
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.", "Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
"---Example 1 End---", "---Example 1 End---",
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:", "The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
"{\"data\": []}" "{\"data\": []}",
"B If with pure performance fee in table, please extract relevant values",
"---Example Start---",
"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
"---Example End---",
"a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.",
"b. This example mentioned share classes, please output according to share class.",
"The output should be",
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}"
], ],
"minimum_initial_investment": [ "minimum_initial_investment": [
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", "Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",

View File

@ -1560,8 +1560,8 @@ if __name__ == "__main__":
# "544886057", # "544886057",
# "550769189", # "550769189",
# "553449663"] # "553449663"]
special_doc_id_list = ["454036250"] # special_doc_id_list = ["414751292"]
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250"] # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (