diff --git a/calc_metrics.py b/calc_metrics.py index 243c000..4936327 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -559,7 +559,9 @@ def calculate_metrics_based_audit_file(is_strict: bool = False): def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx", audit_data_sheet: str = "Sheet1", verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx", - verify_data_sheet: str = "total_data" + verify_data_sheet: str = "total_data", + verify_document_list_file: str = None, + is_for_all: bool = False ): print("Start to calculate metrics based on DB data file and extracted file...") audit_data_df = pd.DataFrame() @@ -648,16 +650,17 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros pred_minimum_initial_investment_list = [] gt_benchmark_name_list = [] pred_benchmark_name_list = [] - gt_performance_fee_list = [] - pred_performance_fee_list = [] - gt_interposed_vehicle_performance_fee_cost_list = [] - pred_interposed_vehicle_performance_fee_cost_list = [] - gt_buy_spread_list = [] - pred_buy_spread_list = [] - gt_sell_spread_list = [] - pred_sell_spread_list = [] - gt_total_annual_dollar_based_charges_list = [] - pred_total_annual_dollar_based_charges_list = [] + if is_for_all: + gt_performance_fee_list = [] + pred_performance_fee_list = [] + gt_interposed_vehicle_performance_fee_cost_list = [] + pred_interposed_vehicle_performance_fee_cost_list = [] + gt_buy_spread_list = [] + pred_buy_spread_list = [] + gt_sell_spread_list = [] + pred_sell_spread_list = [] + gt_total_annual_dollar_based_charges_list = [] + pred_total_annual_dollar_based_charges_list = [] # gt_performance_fee_costs_list = [] # pred_performance_fee_costs_list = [] @@ -672,6 +675,12 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # 
gt_activity_fee_list = [] # pred_activity_fee_list = [] + if verify_document_list_file is not None: + with open(verify_document_list_file, "r", encoding="utf-8") as f: + verify_document_list = f.readlines() + verify_document_list = [int(doc_id.strip()) for doc_id in verify_document_list] + if len(verify_document_list) > 0: + verify_data_df = verify_data_df[verify_data_df["doc_id"].isin(verify_document_list)] document_id_list = verify_data_df["doc_id"].unique().tolist() print(f"Total document count: {len(document_id_list)}") @@ -693,11 +702,12 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros administration_fees = str(row["administration_fees"]) minimum_initial_investment = str(row["minimum_initial_investment"]) benchmark_name = str(row["benchmark_name"]) - performance_fee = str(row["performance_fee"]) - interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) - buy_spread = str(row["buy_spread"]) - sell_spread = str(row["sell_spread"]) - total_annual_dollar_based_charges = str(row["total_annual_dollar_based_charges"]) + if is_for_all: + performance_fee = str(row["performance_fee"]) + interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) + buy_spread = str(row["buy_spread"]) + sell_spread = str(row["sell_spread"]) + total_annual_dollar_based_charges = str(row["total_annual_dollar_based_charges"]) # get the first row which sec_id in doc_verify_data is same as sec_id doc_verify_sec_data = doc_verify_data[doc_verify_data["sec_id"] == sec_id] @@ -710,11 +720,12 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros v_administration_fees = str(doc_verify_sec_row["administration_fees"]) v_minimum_initial_investment = str(doc_verify_sec_row["minimum_initial_investment"]) v_benchmark_name = str(doc_verify_sec_row["benchmark_name"]) - v_performance_fee = str(doc_verify_sec_row["performance_fee"]) - 
v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) - v_buy_spread = str(doc_verify_sec_row["buy_spread"]) - v_sell_spread = str(doc_verify_sec_row["sell_spread"]) - v_total_annual_dollar_based_charges = str(doc_verify_sec_row["total_annual_dollar_based_charges"]) + if is_for_all: + v_performance_fee = str(doc_verify_sec_row["performance_fee"]) + v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) + v_buy_spread = str(doc_verify_sec_row["buy_spread"]) + v_sell_spread = str(doc_verify_sec_row["sell_spread"]) + v_total_annual_dollar_based_charges = str(doc_verify_sec_row["total_annual_dollar_based_charges"]) # v_performance_fee_costs = str(doc_verify_sec_row["performance_fee_costs"]) # v_buy_spread = str(doc_verify_sec_row["buy_spread"]) @@ -733,18 +744,19 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment")) message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name")) - message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) - message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, - gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) - message = 
get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) - message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) - message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, - gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) + if is_for_all: + message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) + message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, + gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) + message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) + message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) + message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, + 
gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) # message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list) # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee")) # message = get_gt_pred_by_compare_values(switching_fee, v_switching_fee, gt_switching_fee_list, pred_switching_fee_list) @@ -790,35 +802,36 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros accuracy_benchmark_name = accuracy_score(gt_benchmark_name_list, pred_benchmark_name_list) support_benchmark_name = sum(gt_benchmark_name_list) - precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) - recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) - f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) - accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) - support_performance_fee = sum(gt_performance_fee_list) - - precision_interposed_vehicle_performance_fee_cost = precision_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - recall_interposed_vehicle_performance_fee_cost = recall_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - f1_interposed_vehicle_performance_fee_cost = f1_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - accuracy_interposed_vehicle_performance_fee_cost = accuracy_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - support_interposed_vehicle_performance_fee_cost = 
sum(gt_interposed_vehicle_performance_fee_cost_list) - - precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) - recall_buy_spread = recall_score(gt_buy_spread_list, pred_buy_spread_list) - f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) - accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) - support_buy_spread = sum(gt_buy_spread_list) - - precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) - recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) - f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) - accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) - support_buy_spread = sum(gt_sell_spread_list) - - precision_total_annual_dollar_based_charges = precision_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - recall_total_annual_dollar_based_charges = recall_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - f1_total_annual_dollar_based_charges = f1_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - accuracy_total_annual_dollar_based_charges = accuracy_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - support_total_annual_dollar_based_charges = sum(gt_total_annual_dollar_based_charges_list) + if is_for_all: + precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) + recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) + f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) + accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) + support_performance_fee = sum(gt_performance_fee_list) + + precision_interposed_vehicle_performance_fee_cost = 
precision_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + recall_interposed_vehicle_performance_fee_cost = recall_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + f1_interposed_vehicle_performance_fee_cost = f1_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + accuracy_interposed_vehicle_performance_fee_cost = accuracy_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + support_interposed_vehicle_performance_fee_cost = sum(gt_interposed_vehicle_performance_fee_cost_list) + + precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) + recall_buy_spread = recall_score(gt_buy_spread_list, pred_buy_spread_list) + f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) + accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) + support_buy_spread = sum(gt_buy_spread_list) + + precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) + recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) + f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) + accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) + support_buy_spread = sum(gt_sell_spread_list) + + precision_total_annual_dollar_based_charges = precision_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + recall_total_annual_dollar_based_charges = recall_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + f1_total_annual_dollar_based_charges = f1_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + accuracy_total_annual_dollar_based_charges = accuracy_score(gt_total_annual_dollar_based_charges_list, 
pred_total_annual_dollar_based_charges_list) + support_total_annual_dollar_based_charges = sum(gt_total_annual_dollar_based_charges_list) # precision_withdrawal_fee = precision_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) # recall_withdrawal_fee = recall_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) @@ -837,25 +850,32 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # f1_activity_fee = f1_score(gt_activity_fee_list, pred_activity_fee_list) # accuracy_activity_fee = accuracy_score(gt_activity_fee_list, pred_activity_fee_list) # support_activity_fee = sum(gt_activity_fee_list) - - metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, - {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, - {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, - {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, - {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, - {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, 
"support": support_performance_fee}, - {"item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, - "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": support_interposed_vehicle_performance_fee_cost}, - {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, - {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, - {"item": "total_annual_dollar_based_charges", "precision": precision_total_annual_dollar_based_charges, "recall": recall_total_annual_dollar_based_charges, - "f1": f1_total_annual_dollar_based_charges, "accuracy": accuracy_total_annual_dollar_based_charges, "support": support_total_annual_dollar_based_charges} - # {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, - # {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, - # {"item": "withdrawal_fee", "precision": precision_withdrawal_fee, "recall": recall_withdrawal_fee, "f1": f1_withdrawal_fee, "accuracy": accuracy_withdrawal_fee, "support": support_withdrawal_fee}, - # {"item": "switching_fee", "precision": precision_switching_fee, "recall": recall_switching_fee, "f1": f1_switching_fee, "accuracy": accuracy_switching_fee, "support": support_switching_fee}, - # {"item": "activity_fee", "precision": precision_activity_fee, "recall": recall_activity_fee, "f1": f1_activity_fee, "accuracy": accuracy_activity_fee, "support": support_activity_fee} 
- ] + if is_for_all: + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, + {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, + {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, + {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, + "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": support_interposed_vehicle_performance_fee_cost}, + {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + {"item": "sell_spread", "precision": precision_sell_spread, 
"recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + {"item": "total_annual_dollar_based_charges", "precision": precision_total_annual_dollar_based_charges, "recall": recall_total_annual_dollar_based_charges, + "f1": f1_total_annual_dollar_based_charges, "accuracy": accuracy_total_annual_dollar_based_charges, "support": support_total_annual_dollar_based_charges} + # {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + # {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + # {"item": "withdrawal_fee", "precision": precision_withdrawal_fee, "recall": recall_withdrawal_fee, "f1": f1_withdrawal_fee, "accuracy": accuracy_withdrawal_fee, "support": support_withdrawal_fee}, + # {"item": "switching_fee", "precision": precision_switching_fee, "recall": recall_switching_fee, "f1": f1_switching_fee, "accuracy": accuracy_switching_fee, "support": support_switching_fee}, + # {"item": "activity_fee", "precision": precision_activity_fee, "recall": recall_activity_fee, "f1": f1_activity_fee, "accuracy": accuracy_activity_fee, "support": support_activity_fee} + ] + else: + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": 
f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, + {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, + {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name} + ] metrics_data_df = pd.DataFrame(metrics_data) averate_precision = metrics_data_df["precision"].mean() average_recall = metrics_data_df["recall"].mean() @@ -871,7 +891,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros output_folder = r"/data/aus_prospectus/output/metrics_data/" os.makedirs(output_folder, exist_ok=True) verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") - metrics_file_name = f"metrics_{verify_file_name}_4_dps_not_strict.xlsx" + metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx" output_file = os.path.join(output_folder, metrics_file_name) with pd.ExcelWriter(output_file) as writer: metrics_data_df.to_excel(writer, index=False, sheet_name="metrics_data") @@ -1148,12 +1168,14 @@ if __name__ == "__main__": audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250305160321_ravi.xlsx" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250305170202.xlsx" verify_data_sheet: str = "total_data" + verify_document_list_file: str = 
"./sample_documents/aus_prospectus_17_documents_sample.txt" calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, audit_data_sheet=audit_data_sheet, verify_file_path=verify_file_path, - verify_data_sheet=verify_data_sheet) + verify_data_sheet=verify_data_sheet, + verify_document_list_file = verify_document_list_file) # set_mapping_to_17_documents_data() # set_mapping_to_ravi_data() diff --git a/core/data_extraction.py b/core/data_extraction.py index 3d8269c..ab880f2 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -575,7 +575,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 40: + # if page_num != 21: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 78de6d0..04b2fff 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -162,6 +162,26 @@ "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", + "B.3 With \"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", sum the values from these 3 columns.", + "---Example Start---", + "Fund \nManagement \nfee 1 \n(% pa) \nIndirect costs1\n(% pa)\nEstimated performance fees2\n(% pa)\nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses 3 \nEstimated \nother indirect \ncosts \nPerformance \nfees charged \nto the Fund \nby underlying \nmanagers \nPerformance \nfees charged \nby 
interposed \nvehicles \nipac Life \nChoices \nActive 50 \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \nipac Life \nChoices \nActive 70 \n0.79 \n0.01 \n0.08 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", + "---Example End---", + "For this case: ", + "a. The table has a two-level (nested) header.", + "b. The fund name is before the data row, e.g. ipac Life Choices Active 50", + "c. The data points numbers order in data row, for example: \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \n is correct as initial table structure.", + "The 1st number: 0.70 is the management_fee,", + "the 2nd number: 0.02 is the recoverable_expenses,", + "the 3rd number: 0.09 is the indirect_costs,", + "the 4th number: 0.00 is the performance_fee,", + "the 5th number: 0.05 is the interposed_vehicle_performance_fee_cost, ", + "the 6th number: 0.14 is the transaction costs, please ignore this number.", + "the 7th number: 0.10 is the buy_spread, ", + "the 8th number: 0.10 is the sell_spread.", + "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.70 + 0.02 + 0.09= 0.81", + "The output should be:", + "{\"data\": [{\"fund name\": \"ipac Life Choices Active 50\", \"share name\": \"ipac Life Choices Active 50\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.7, \"recoverable_expenses\": 0.02, \"indirect_costs\": 0.09, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"ipac Life Choices Active 70\", \"share name\": \"ipac Life Choices Active 70\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.79, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0.08, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", + "\n", "C. 
If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", @@ -391,12 +411,12 @@ "b. The fund name is after the data row, e.g. MyNorth Australian Fixed Interest Index", "c. The data points numbers order in data row, for example: \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 is correct as initial table structure.", "The 1st number: 0.20 is the management_fee, the 2nd number and the 3th number: 0.01 0.00 are the indirect costs, ", - "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ", + "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", "the 6th number: 0.00 is the transaction costs, ", "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", "The management_fee_and_costs is management_fee + indirect costs = 0.20 + 0.01 + 0.00= 0.21", "The output should be: ", - "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", + "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", 
\"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", "\n", "---Example 2 Start---", "Indirect costs \ni \nEstimated performance fees \nii \nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nMyNorth Index \nModerately \nDefensive \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 \nMyNorth Index \nBalanced \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.09/0.09 \n", @@ -407,7 +427,7 @@ "c. The data points numbers order in data row, for example: \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 is correct as initial table structure.", "The 1st number: 0.55 is the management_fee, the 2nd number and the 3th number: 0.00 0.00 are the indirect costs, ", "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ", - "the 6th number: 0.01 is the transaction costs, ", + "the 6th number: 0.01 is the transaction costs, please ignore this number.", "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", "The management_fee_and_costs is management_fee + indirect costs = 0.55 + 0.00 + 0.00= 0.55", "The output should be: ", @@ -433,6 +453,70 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, 
\"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}" ] + }, + { + "keywords": ["Option name \nIndirect costs"], + "prompts": ["Complex management fee and costs rule:", + "If the table with columns:", + "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Performance fees charged to the option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", + "The management_fee is \"Management fee (% pa)\".", + "The management_fee_and_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", + "The indirect_costs is \"Estimated other indirect costs\"", + "The recoverable_expenses is \"Recoverable expenses\"", + "The performance_fee is \"Performance fees charged to the option by underlying managers\".", + "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", + "The buy_spread and sell_spread are \"Buy/sell spreads\".", + "---Example 1 Start---", + "Option name \nIndirect costs \n(i)\nEstimated performance fees \n(ii)\nManagement \nfee \n(% pa) \n(i) \n(% pa) \n(% pa) \nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads \n(%) \n(iv) \nRecoverable \nexpenses \n(iii) \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe option \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nGenerations 
Defensive \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \nGenerations Moderately \nDefensive \n1.00 \n0.08 \n0.10 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", + "---Example 1 End---", + "For this case: ", + "a. The table header is out of order because of the PDF content extraction.", + "b. The fund name is before the data row, e.g. Generations Defensive", + "c. The data points numbers order in data row, for example: \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \n is correct as initial table structure.", + "The 1st number: 0.90 is the management_fee,", + "the 2nd number: 0.26 is the recoverable_expenses,", + "the 3rd number: 0.12 is the indirect_costs,", + "the 4th number: 0.00 is the performance_fee,", + "the 5th number: 0.06 is the interposed_vehicle_performance_fee_cost, ", + "the 6th number: 0.17 is the transaction costs, please ignore this number.", + "the 7th number: 0.09 is the buy_spread, ", + "the 8th number: 0.08 is the sell_spread.", + "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.90 + 0.26 + 0.12= 1.28", + "The output should be: ", + "{\"data\": [{\"fund name\": \"Generations Defensive\", \"share name\": \"Generations Defensive\", \"management_fee_and_costs\": 1.28, \"management_fee\": 0.9, \"recoverable_expenses\": 0.26, \"indirect_costs\": 0.12, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.06, \"buy_spread\": 0.09, \"sell_spread\": 0.08}, {\"fund name\": \"Generations Moderately Defensive\", \"share name\": \"Generations Moderately Defensive\", \"management_fee_and_costs\": 1.18, \"management_fee\": 1, \"recoverable_expenses\": 0.08, \"indirect_costs\": 0.1,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" + ] + }, + { + "keywords": ["Management \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa)"], + "prompts": ["Complex management fee and costs rule:", + "If the table with columns:", + "\"Management 
fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Performance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", + "The management_fee is \"Management fee (% pa)\".", + "The management_fee_and_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", + "The indirect_costs is \"Estimated other indirect costs\"", + "The recoverable_expenses is \"Recoverable expenses\"", + "The performance_fee is \"Performance fees charged to the Investment Option by underlying managers\".", + "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", + "The buy_spread and sell_spread are \"Buy/sell spreads\".", + "---Example 1 Start---", + "Investment Option \nManagement \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa) \nEstimated performance fees (ii) \n(% pa) \nTransaction \ncosts (% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses (iii) \nEstimated \nother \nindirect costs \nPerformance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \nPerformance fees \ncharged by \ninterposed \nvehicles \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n", + "---Example 1 End---", + "For this case: ", + "a. The table header has a secondary-level header.", + "b. The fund name is before the data row, e.g. North Active Defensive", + "c. 
The data points numbers order in data row, for example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is correct as initial table structure.", + "The 1st number: 0.62 is the management_fee,", + "the 2nd number: 0.18 is the recoverable_expenses,", + "the 3rd number: 0.05 is the indirect_costs,", + "the 4th number: 0.00 is the performance_fee,", + "the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", + "the 6th number: 0.14 is the transaction costs, please ignore this number.", + "the 7th number: 0.08 is the buy_spread, ", + "the 8th number: 0.08 is the sell_spread.", + "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.62 + 0.18 + 0.05= 0.85", + "The output should be: ", + "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"North Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, 
\"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}" + ] } ] } diff --git a/main.py b/main.py index 1171af8..e28c426 100644 --- a/main.py +++ b/main.py @@ -1504,9 +1504,9 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = False - re_run_mapping_data = False - force_save_total_data = True + re_run_extract_data = True + re_run_mapping_data = True + force_save_total_data = False doc_source = "aus_prospectus" # doc_source = "emea_ar" if doc_source == "aus_prospectus": @@ -1525,7 +1525,7 @@ if __name__ == "__main__": # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list: list = ["384508026"] + # special_doc_id_list: list = ["539261734"] # special_doc_id_list: list = ["401212184"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"