diff --git a/calc_metrics.py b/calc_metrics.py index 66fc740..6d4384b 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -559,7 +559,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx", verify_data_sheet: str = "total_data", verify_document_list_file: str = None, - is_for_all: bool = False + is_for_all: bool = False, + zero_equal_none: bool = False ): print("Start to calculate metrics based on DB data file and extracted file...") audit_data_df = pd.DataFrame() @@ -733,28 +734,30 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # v_switching_fee = str(doc_verify_sec_row["switching_fee"]) # v_activity_fee = str(doc_verify_sec_row["activity_fee"]) - message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs") + message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs", zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs")) - message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee") + message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee", zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee")) - message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees") + message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees", zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees")) - message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment") + message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment", zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment")) - message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") + message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name", zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name")) if is_for_all: - message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list) + message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list, zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs")) message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, - gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list, + zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost")) - message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) + message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list, zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread")) - message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) + message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list, zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread")) message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, - gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list, + zero_equal_none=zero_equal_none) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges")) # message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list) # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee")) @@ -892,7 +895,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros os.makedirs(output_folder, exist_ok=True) verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") if is_for_all: - verify_file_name = f"metrics_{verify_file_name}_all" + verify_file_name = f"{verify_file_name}_all" metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx" output_file = os.path.join(output_folder, metrics_file_name) with pd.ExcelWriter(output_file) as writer: @@ -1280,11 +1283,11 @@ def generate_message(message: dict, return message -def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""): +def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = "", zero_equal_none: bool = False): message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""} if gt_value is not None and len(str(gt_value).strip()) > 0: gt_list.append(1) - gt_equal_pred = is_equal(gt_value, pred_value, data_point) + gt_equal_pred = is_equal(gt_value, pred_value, data_point, zero_equal_none=zero_equal_none) if gt_equal_pred: pred_list.append(1) else: @@ -1300,7 +1303,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data pred_num = float(pred_value) # round to 2 decimal places pred_value = round(pred_num, 4) - if pred_value == 0: + if zero_equal_none and pred_value == 0: gt_list.append(1) pred_list.append(1) else: @@ -1319,7 +1322,7 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data return message -def is_equal(gt_value, pred_value, data_point: str = ""): +def is_equal(gt_value, pred_value, data_point: str = "", zero_equal_none: bool = False): if gt_value is not None and len(str(gt_value).strip()) > 0: if pred_value is not None and len(str(pred_value).strip()) > 0: if gt_value == "0.0": @@ -1357,7 +1360,7 @@ def is_equal(gt_value, pred_value, data_point: str = ""): gt_value = round(gt_num, 4) except Exception as e: pass - if gt_value == 0: + if zero_equal_none and gt_value == 0: return True return False @@ -1429,6 +1432,7 @@ if __name__ == "__main__": verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"] + zero_equal_none = False is_for_all = True for verify_document_list_file in verify_document_list_file_list: calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, @@ -1436,7 +1440,8 @@ if __name__ == "__main__": verify_file_path=verify_file_path, verify_data_sheet=verify_data_sheet, verify_document_list_file = verify_document_list_file, - is_for_all=is_for_all) + is_for_all=is_for_all, + zero_equal_none=zero_equal_none) # for verify_document_list_file in verify_document_list_file_list: # calculate_metrics_by_provider(audit_file_path=audit_file_path, diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index e831d17..ec0b62b 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -366,12 +366,21 @@ ], "performance_fee_costs": [ "Performance fees is share class level data.", - "If the performance fees is with the range, please ignore and output empty.", + "A. If the performance fees is with the range, please ignore and output empty.", "---Example 1 Start---", "Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.", "---Example 1 End---", "The relevant values: 0.00 and 2.18, are in the range, so the output should be:", - "{\"data\": []}" + "{\"data\": []}", + "B If with pure performance fee in table, please extract relevant values", + "---Example Start---", + "\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n", + "---Example End---", + "a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.", + "b. This example mentioned share classes, please output according to share class.", + "The output should be", + "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}" + ], "minimum_initial_investment": [ "Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", diff --git a/main.py b/main.py index f1343e3..2df8420 100644 --- a/main.py +++ b/main.py @@ -1560,8 +1560,8 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - special_doc_id_list = ["454036250"] - # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250"] + # special_doc_id_list = ["414751292"] + # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = (