1. optimize performance_fee_costs prompts
2. support calculating metrics with zero optionally treated as equal to empty (new zero_equal_none flag)
This commit is contained in:
parent
c2c0b33015
commit
fb5dda2170
|
|
@ -559,7 +559,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
||||||
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
|
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
|
||||||
verify_data_sheet: str = "total_data",
|
verify_data_sheet: str = "total_data",
|
||||||
verify_document_list_file: str = None,
|
verify_document_list_file: str = None,
|
||||||
is_for_all: bool = False
|
is_for_all: bool = False,
|
||||||
|
zero_equal_none: bool = False
|
||||||
):
|
):
|
||||||
print("Start to calculate metrics based on DB data file and extracted file...")
|
print("Start to calculate metrics based on DB data file and extracted file...")
|
||||||
audit_data_df = pd.DataFrame()
|
audit_data_df = pd.DataFrame()
|
||||||
|
|
@ -733,28 +734,30 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
||||||
# v_switching_fee = str(doc_verify_sec_row["switching_fee"])
|
# v_switching_fee = str(doc_verify_sec_row["switching_fee"])
|
||||||
# v_activity_fee = str(doc_verify_sec_row["activity_fee"])
|
# v_activity_fee = str(doc_verify_sec_row["activity_fee"])
|
||||||
|
|
||||||
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs")
|
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs", zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs"))
|
||||||
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee")
|
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee", zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee"))
|
||||||
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees")
|
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees", zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees"))
|
||||||
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment")
|
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment", zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment"))
|
||||||
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name")
|
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name", zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name"))
|
||||||
if is_for_all:
|
if is_for_all:
|
||||||
message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list)
|
message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list, zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs"))
|
||||||
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost,
|
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost,
|
||||||
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list)
|
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list,
|
||||||
|
zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost"))
|
||||||
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list)
|
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list, zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread"))
|
||||||
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list)
|
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list, zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread"))
|
||||||
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges,
|
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges,
|
||||||
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list)
|
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list,
|
||||||
|
zero_equal_none=zero_equal_none)
|
||||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges"))
|
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges"))
|
||||||
# message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list)
|
# message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list)
|
||||||
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee"))
|
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee"))
|
||||||
|
|
@ -892,7 +895,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
||||||
os.makedirs(output_folder, exist_ok=True)
|
os.makedirs(output_folder, exist_ok=True)
|
||||||
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
|
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
|
||||||
if is_for_all:
|
if is_for_all:
|
||||||
verify_file_name = f"metrics_{verify_file_name}_all"
|
verify_file_name = f"{verify_file_name}_all"
|
||||||
metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx"
|
metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx"
|
||||||
output_file = os.path.join(output_folder, metrics_file_name)
|
output_file = os.path.join(output_folder, metrics_file_name)
|
||||||
with pd.ExcelWriter(output_file) as writer:
|
with pd.ExcelWriter(output_file) as writer:
|
||||||
|
|
@ -1280,11 +1283,11 @@ def generate_message(message: dict,
|
||||||
return message
|
return message
|
||||||
|
|
||||||
|
|
||||||
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""):
|
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = "", zero_equal_none: bool = False):
|
||||||
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
|
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
|
||||||
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
||||||
gt_list.append(1)
|
gt_list.append(1)
|
||||||
gt_equal_pred = is_equal(gt_value, pred_value, data_point)
|
gt_equal_pred = is_equal(gt_value, pred_value, data_point, zero_equal_none=zero_equal_none)
|
||||||
if gt_equal_pred:
|
if gt_equal_pred:
|
||||||
pred_list.append(1)
|
pred_list.append(1)
|
||||||
else:
|
else:
|
||||||
|
|
@ -1300,7 +1303,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
||||||
pred_num = float(pred_value)
|
pred_num = float(pred_value)
|
||||||
# round to 4 decimal places
|
# round to 4 decimal places
|
||||||
pred_value = round(pred_num, 4)
|
pred_value = round(pred_num, 4)
|
||||||
if pred_value == 0:
|
if zero_equal_none and pred_value == 0:
|
||||||
gt_list.append(1)
|
gt_list.append(1)
|
||||||
pred_list.append(1)
|
pred_list.append(1)
|
||||||
else:
|
else:
|
||||||
|
|
@ -1319,7 +1322,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
||||||
return message
|
return message
|
||||||
|
|
||||||
|
|
||||||
def is_equal(gt_value, pred_value, data_point: str = ""):
|
def is_equal(gt_value, pred_value, data_point: str = "", zero_equal_none: bool = False):
|
||||||
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
||||||
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||||
if gt_value == "0.0":
|
if gt_value == "0.0":
|
||||||
|
|
@ -1357,7 +1360,7 @@ def is_equal(gt_value, pred_value, data_point: str = ""):
|
||||||
gt_value = round(gt_num, 4)
|
gt_value = round(gt_num, 4)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
if gt_value == 0:
|
if zero_equal_none and gt_value == 0:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
@ -1429,6 +1432,7 @@ if __name__ == "__main__":
|
||||||
verify_document_list_file_list = [None,
|
verify_document_list_file_list = [None,
|
||||||
"./sample_documents/aus_prospectus_29_documents_sample.txt",
|
"./sample_documents/aus_prospectus_29_documents_sample.txt",
|
||||||
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
|
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
|
||||||
|
zero_equal_none = False
|
||||||
is_for_all = True
|
is_for_all = True
|
||||||
for verify_document_list_file in verify_document_list_file_list:
|
for verify_document_list_file in verify_document_list_file_list:
|
||||||
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
|
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
|
||||||
|
|
@ -1436,7 +1440,8 @@ if __name__ == "__main__":
|
||||||
verify_file_path=verify_file_path,
|
verify_file_path=verify_file_path,
|
||||||
verify_data_sheet=verify_data_sheet,
|
verify_data_sheet=verify_data_sheet,
|
||||||
verify_document_list_file = verify_document_list_file,
|
verify_document_list_file = verify_document_list_file,
|
||||||
is_for_all=is_for_all)
|
is_for_all=is_for_all,
|
||||||
|
zero_equal_none=zero_equal_none)
|
||||||
|
|
||||||
# for verify_document_list_file in verify_document_list_file_list:
|
# for verify_document_list_file in verify_document_list_file_list:
|
||||||
# calculate_metrics_by_provider(audit_file_path=audit_file_path,
|
# calculate_metrics_by_provider(audit_file_path=audit_file_path,
|
||||||
|
|
|
||||||
|
|
@ -366,12 +366,21 @@
|
||||||
],
|
],
|
||||||
"performance_fee_costs": [
|
"performance_fee_costs": [
|
||||||
"Performance fees is share class level data.",
|
"Performance fees is share class level data.",
|
||||||
"If the performance fees is with the range, please ignore and output empty.",
|
"A. If the performance fees is with the range, please ignore and output empty.",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
|
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
|
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
|
||||||
"{\"data\": []}"
|
"{\"data\": []}",
|
||||||
|
"B If with pure performance fee in table, please extract relevant values",
|
||||||
|
"---Example Start---",
|
||||||
|
"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
|
||||||
|
"---Example End---",
|
||||||
|
"a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.",
|
||||||
|
"b. This example mentioned share classes, please output according to share class.",
|
||||||
|
"The output should be",
|
||||||
|
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}"
|
||||||
|
|
||||||
],
|
],
|
||||||
"minimum_initial_investment": [
|
"minimum_initial_investment": [
|
||||||
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
|
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
|
||||||
|
|
|
||||||
4
main.py
4
main.py
|
|
@ -1560,8 +1560,8 @@ if __name__ == "__main__":
|
||||||
# "544886057",
|
# "544886057",
|
||||||
# "550769189",
|
# "550769189",
|
||||||
# "553449663"]
|
# "553449663"]
|
||||||
special_doc_id_list = ["454036250"]
|
# special_doc_id_list = ["414751292"]
|
||||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250"]
|
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue