1. Optimize the performance_fee_costs prompts
2. Support treating zero as equal to empty when calculating metrics
This commit is contained in:
parent
c2c0b33015
commit
fb5dda2170
|
|
@ -559,7 +559,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
|||
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
|
||||
verify_data_sheet: str = "total_data",
|
||||
verify_document_list_file: str = None,
|
||||
is_for_all: bool = False
|
||||
is_for_all: bool = False,
|
||||
zero_equal_none: bool = False
|
||||
):
|
||||
print("Start to calculate metrics based on DB data file and extracted file...")
|
||||
audit_data_df = pd.DataFrame()
|
||||
|
|
@ -733,28 +734,30 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
|||
# v_switching_fee = str(doc_verify_sec_row["switching_fee"])
|
||||
# v_activity_fee = str(doc_verify_sec_row["activity_fee"])
|
||||
|
||||
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs")
|
||||
message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs", zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs"))
|
||||
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee")
|
||||
message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee", zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee"))
|
||||
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees")
|
||||
message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees", zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees"))
|
||||
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment")
|
||||
message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment", zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment"))
|
||||
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name")
|
||||
message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name", zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name"))
|
||||
if is_for_all:
|
||||
message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list)
|
||||
message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list, zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs"))
|
||||
message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost,
|
||||
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list)
|
||||
gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list,
|
||||
zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost"))
|
||||
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list)
|
||||
message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list, zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread"))
|
||||
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list)
|
||||
message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list, zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread"))
|
||||
message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges,
|
||||
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list)
|
||||
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list,
|
||||
zero_equal_none=zero_equal_none)
|
||||
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges"))
|
||||
# message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list)
|
||||
# message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee"))
|
||||
|
|
@ -892,7 +895,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
|||
os.makedirs(output_folder, exist_ok=True)
|
||||
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
|
||||
if is_for_all:
|
||||
verify_file_name = f"metrics_{verify_file_name}_all"
|
||||
verify_file_name = f"{verify_file_name}_all"
|
||||
metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx"
|
||||
output_file = os.path.join(output_folder, metrics_file_name)
|
||||
with pd.ExcelWriter(output_file) as writer:
|
||||
|
|
@ -1280,11 +1283,11 @@ def generate_message(message: dict,
|
|||
return message
|
||||
|
||||
|
||||
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""):
|
||||
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = "", zero_equal_none: bool = False):
|
||||
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
|
||||
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
||||
gt_list.append(1)
|
||||
gt_equal_pred = is_equal(gt_value, pred_value, data_point)
|
||||
gt_equal_pred = is_equal(gt_value, pred_value, data_point, zero_equal_none=zero_equal_none)
|
||||
if gt_equal_pred:
|
||||
pred_list.append(1)
|
||||
else:
|
||||
|
|
@ -1300,7 +1303,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
|||
pred_num = float(pred_value)
|
||||
# round to 4 decimal places
|
||||
pred_value = round(pred_num, 4)
|
||||
if pred_value == 0:
|
||||
if zero_equal_none and pred_value == 0:
|
||||
gt_list.append(1)
|
||||
pred_list.append(1)
|
||||
else:
|
||||
|
|
@ -1319,7 +1322,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
|||
return message
|
||||
|
||||
|
||||
def is_equal(gt_value, pred_value, data_point: str = ""):
|
||||
def is_equal(gt_value, pred_value, data_point: str = "", zero_equal_none: bool = False):
|
||||
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
||||
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||
if gt_value == "0.0":
|
||||
|
|
@ -1357,7 +1360,7 @@ def is_equal(gt_value, pred_value, data_point: str = ""):
|
|||
gt_value = round(gt_num, 4)
|
||||
except Exception as e:
|
||||
pass
|
||||
if gt_value == 0:
|
||||
if zero_equal_none and gt_value == 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
|
@ -1429,6 +1432,7 @@ if __name__ == "__main__":
|
|||
verify_document_list_file_list = [None,
|
||||
"./sample_documents/aus_prospectus_29_documents_sample.txt",
|
||||
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
|
||||
zero_equal_none = False
|
||||
is_for_all = True
|
||||
for verify_document_list_file in verify_document_list_file_list:
|
||||
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
|
||||
|
|
@ -1436,7 +1440,8 @@ if __name__ == "__main__":
|
|||
verify_file_path=verify_file_path,
|
||||
verify_data_sheet=verify_data_sheet,
|
||||
verify_document_list_file = verify_document_list_file,
|
||||
is_for_all=is_for_all)
|
||||
is_for_all=is_for_all,
|
||||
zero_equal_none=zero_equal_none)
|
||||
|
||||
# for verify_document_list_file in verify_document_list_file_list:
|
||||
# calculate_metrics_by_provider(audit_file_path=audit_file_path,
|
||||
|
|
|
|||
|
|
@ -366,12 +366,21 @@
|
|||
],
|
||||
"performance_fee_costs": [
|
||||
"Performance fees is share class level data.",
|
||||
"If the performance fees is with the range, please ignore and output empty.",
|
||||
"A. If the performance fees is with the range, please ignore and output empty.",
|
||||
"---Example 1 Start---",
|
||||
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
|
||||
"---Example 1 End---",
|
||||
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
|
||||
"{\"data\": []}"
|
||||
"{\"data\": []}",
|
||||
"B If with pure performance fee in table, please extract relevant values",
|
||||
"---Example Start---",
|
||||
"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
|
||||
"---Example End---",
|
||||
"a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.",
|
||||
"b. This example mentioned share classes, please output according to share class.",
|
||||
"The output should be",
|
||||
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}"
|
||||
|
||||
],
|
||||
"minimum_initial_investment": [
|
||||
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
|
||||
|
|
|
|||
4
main.py
4
main.py
|
|
@ -1560,8 +1560,8 @@ if __name__ == "__main__":
|
|||
# "544886057",
|
||||
# "550769189",
|
||||
# "553449663"]
|
||||
special_doc_id_list = ["454036250"]
|
||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250"]
|
||||
# special_doc_id_list = ["414751292"]
|
||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
Loading…
Reference in New Issue