import os
from time import sleep
import pandas as pd
from glob import glob
from tqdm import tqdm
import numpy as np
from datetime import datetime
import re
import json
import traceback
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import requests
import fitz
from copy import deepcopy
from utils.similarity import Similarity
from core.auz_nz.hybrid_solution_script import final_function_to_match


def calc_metrics(ground_truth_file: str, prediction_file: str):
    """
    Compare a ground-truth Excel file against a prediction Excel file and
    print precision / recall / F1 / accuracy for the ``auum`` and ``tor``
    columns.

    Rows are matched by exact ``fund_name`` equality.  Pass 1 walks the
    ground truth and records one positive sample per row (recall side);
    pass 2 walks the predictions and records a false-positive sample for
    every predicted value that has no matching ground-truth value
    (precision side).  True positives are recorded only once, in pass 1.

    :param ground_truth_file: path to the ground-truth ``.xlsx`` file.
    :param prediction_file: path to the prediction ``.xlsx`` file.
    :raises FileNotFoundError: if either input file does not exist.
    """
    if not os.path.exists(ground_truth_file):
        raise FileNotFoundError(f"File not found: {ground_truth_file}")
    if not os.path.exists(prediction_file):
        raise FileNotFoundError(f"File not found: {prediction_file}")

    ground_truth_df = pd.read_excel(ground_truth_file)
    prediction_df = pd.read_excel(prediction_file)

    gt_auum_list = []
    pred_auum_list = []
    gt_tor_list = []
    pred_tor_list = []

    # Pass 1 (recall): every ground-truth row is a positive sample; the
    # prediction counts as correct only when the fund-name-matched row
    # carries the same value.  Only the first name match is considered.
    for _, gt_row in ground_truth_df.iterrows():
        gt_fund_name = gt_row["fund_name"]
        gt_auum = gt_row["auum"]
        gt_tor = gt_row["tor"]
        find_auum_flag = False
        find_tor_flag = False
        for _, pred_row in prediction_df.iterrows():
            if gt_fund_name == pred_row["fund_name"]:
                if gt_auum == pred_row["auum"]:
                    find_auum_flag = True
                if gt_tor == pred_row["tor"]:
                    find_tor_flag = True
                break
        gt_auum_list.append(1)
        pred_auum_list.append(1 if find_auum_flag else 0)
        gt_tor_list.append(1)
        pred_tor_list.append(1 if find_tor_flag else 0)

    # Pass 2 (precision): a predicted value with no matching ground-truth
    # value is a false positive (gt=0, pred=1).  Matches add nothing here
    # because they were already counted in pass 1.
    for _, pred_row in prediction_df.iterrows():
        pred_fund_name = pred_row["fund_name"]
        pred_auum = pred_row["auum"]
        pred_tor = pred_row["tor"]
        find_auum_flag = False
        find_tor_flag = False
        for _, gt_row in ground_truth_df.iterrows():
            if pred_fund_name == gt_row["fund_name"]:
                if pred_auum == gt_row["auum"]:
                    find_auum_flag = True
                if pred_tor == gt_row["tor"]:
                    find_tor_flag = True
                break
        if not find_auum_flag:
            gt_auum_list.append(0)
            pred_auum_list.append(1)
        if not find_tor_flag:
            gt_tor_list.append(0)
            pred_tor_list.append(1)

    precision_auum = precision_score(gt_auum_list, pred_auum_list)
    recall_auum = recall_score(gt_auum_list, pred_auum_list)
    f1_auum = f1_score(gt_auum_list, pred_auum_list)
    accuracy_auum = accuracy_score(gt_auum_list, pred_auum_list)

    precision_tor = precision_score(gt_tor_list, pred_tor_list)
    recall_tor = recall_score(gt_tor_list, pred_tor_list)
    f1_tor = f1_score(gt_tor_list, pred_tor_list)
    accuracy_tor = accuracy_score(gt_tor_list, pred_tor_list)

    print(f"AUUM Support: {sum(gt_auum_list)}")
    print(f"F1 AUUM: {f1_auum}")
    print(f"Precision AUUM: {precision_auum}")
    print(f"Recall AUUM: {recall_auum}")
    print(f"Accuracy AUUM: {accuracy_auum}\n")

    print(f"TOR Support: {sum(gt_tor_list)}")
    print(f"F1 TOR: {f1_tor}")
    print(f"Precision TOR: {precision_tor}")
    print(f"Recall TOR: {recall_tor}")
    print(f"Accuracy TOR: {accuracy_tor}")
def transform_pdf_2_image(folder: str = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/",
                          pdf_file: str = r"Pay_Date_2025-02-14.pdf",
                          dpi: int = 300):
    """
    Render every page of a PDF to a PNG image next to the source file.

    Output files are named ``<pdf name>_<page index>.png`` inside *folder*.

    :param folder: directory containing the PDF (and receiving the PNGs).
    :param pdf_file: PDF file name inside *folder*.
    :param dpi: rasterization resolution passed to PyMuPDF.
    """
    # NOTE: fitz (PyMuPDF) is already imported at module level; the
    # previous function-local import was redundant and has been removed.
    pdf_path = os.path.join(folder, pdf_file)
    pdf_doc = fitz.open(pdf_path)

    pdf_file_pure_name = pdf_file.replace(".pdf", "")
    for page_num in range(pdf_doc.page_count):
        page = pdf_doc.load_page(page_num)
        image = page.get_pixmap(dpi=dpi)
        image_path = os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png")
        image.save(image_path)


def invoke_api_demo(doc_id: str = "407881493"):
    """
    POST a single document id to the local extraction API and persist the
    JSON response under ``/data/emea_ar/output/extract_data_by_api/``.

    :param doc_id: document identifier sent in the request body.
    """
    headers = {"connection": "keep-alive", "content-type": "application/json"}
    data = {
        "doc_id": doc_id,
    }
    print(f"Start to invoke API for document: {doc_id}")
    # url = 'https://internal-ts00006-stg-dcms-gpt-765982576.us-east-1.elb.amazonaws.com/automation/api/model/us_ar'
    url = "http://127.0.0.1:8080/automation/api/model/emea_ar"
    try:
        # timeout added so a dead endpoint cannot hang the batch forever
        response = requests.post(url, json=data, headers=headers, timeout=600)
        print("API response status code: {0}".format(response.status_code))
        json_data = json.loads(response.text)
        print(json_data)
        data_folder = r"/data/emea_ar/output/extract_data_by_api/"
        os.makedirs(data_folder, exist_ok=True)
        json_file = os.path.join(data_folder, f"{doc_id}.json")
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=4)
    except Exception as e:
        # best-effort batch helper: log and continue with the next document
        print("Meet exception: {0}".format(e))


def batch_run_documents():
    """Invoke the extraction API for a fixed batch of document ids."""
    document_id_list = [
        "292989214",
        "316237292",
        "321733631",
        "323390570",
        "327956364",
        "333207452",
        "334718372",
        "344636875",
        "362246081",
        "366179419",
        "380945052",
        "382366116",
        "387202452",
        "389171486",
        "391456740",
        "391736837",
        "394778487",
        "401684600",
        "402113224",
        "402181770",
        "402397014",
        "405803396",
        "445102363",
        "445256897",
        "448265376",
        "449555622",
        "449623976",
        "458291624",
        "458359181",
        "463081566",
        "469138353",
        "471641628",
        "476492237",
        "478585901",
        "478586066",
        "479042264",
        "479793787",
        "481475385",
        "483617247",
        "486378555",
        "486383912",
        "492121213",
        "497497599",
        "502693599",
        "502821436",
        "503194284",
        "506559375",
        "507967525",
        "508854243",
        "509845549",
        "520879048",
        "529925114",
    ]
    for doc_id in document_id_list:
        invoke_api_demo(doc_id)


def remove_ter_ogc_performance_fee_annotation():
    """
    Strip ``ter`` / ``ogc`` / ``performance_fee`` annotation entries from
    every JSON file under the API output folder, rewriting each file
    in place.
    """
    data_folder = r"/data/emea_ar/output/extract_data_by_api/"
    os.makedirs(data_folder, exist_ok=True)
    # get all of json files from the folder
    json_files = glob(os.path.join(data_folder, "*.json"))
    remove_dp_list = ["ter", "ogc", "performance_fee"]
    for json_file in json_files:
        with open(json_file, "r", encoding="utf-8") as f:
            json_data = json.load(f)
        # filter in one pass instead of repeated list.remove() (O(n^2));
        # removes every annotation whose data_point is in the block list
        json_data["annotation_data"] = [
            annotation_data
            for annotation_data in json_data["annotation_data"]
            if annotation_data["data_point"] not in remove_dp_list
        ]
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=4)


def output_part_of_pages(pdf_file: str, page_list: list, output_folder: str):
    """
    Output part of pages from a pdf file to a new pdf file.

    :param pdf_file: str, the path of the pdf file.
    :param page_list: list, the 0-based page numbers to copy, in order.
    :param output_folder: str, the output folder; falls back to
        ``./data/emea_ar/output/pdf_part/`` when empty or None.
    """
    pdf_doc = fitz.open(pdf_file)
    pdf_file_pure_name = os.path.basename(pdf_file).replace(".pdf", "")
    new_pdf = fitz.open()
    print(f"output pages: {page_list} for {pdf_file_pure_name}")
    for page_index in page_list:
        new_pdf.insert_pdf(pdf_doc, from_page=page_index, to_page=page_index)
    if output_folder is None or len(output_folder) == 0:
        output_folder = r"./data/emea_ar/output/pdf_part/"
    os.makedirs(output_folder, exist_ok=True)
    new_pdf.save(os.path.join(output_folder, f"{pdf_file_pure_name}_part.pdf"))
+ """ + pdf_doc = fitz.open(pdf_file) + pdf_file_pure_name = os.path.basename(pdf_file).replace(".pdf", "") + new_pdf = fitz.open() + print(f"output pages: {page_list} for {pdf_file_pure_name}") + for page_index in page_list: + new_pdf.insert_pdf(pdf_doc, from_page=page_index, to_page=page_index) + if output_folder is None or len(output_folder) == 0: + output_folder = r"./data/emea_ar/output/pdf_part/" + os.makedirs(output_folder, exist_ok=True) + new_pdf.save(os.path.join(output_folder, f"{pdf_file_pure_name}_part.pdf")) + + +def calculate_metrics_based_audit_file(is_strict: bool = False): + print("Start to calculate metrics based on audit file and verify file...") + audit_file_path = ( + r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx" + ) + audit_data_sheets = ["Mayank - revised ", "Prathamesh - Revised"] + audit_fields = [ + "doc_id", + "fund_name", + "management_fee_and_costs", + "management_fee", + "performance_fee", + "performance_fee_costs", + "buy_spread", + "sell_spread", + "minimum_initial_investment", + "recoverable_expenses", + "indirect_costs" + ] + audit_data_list = [] + for audit_data_sheet in audit_data_sheets: + sub_audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet) + sub_audit_data_df = sub_audit_data_df[audit_fields] + audit_data_list.append(sub_audit_data_df) + audit_data_df = pd.concat(audit_data_list, ignore_index=True) + audit_data_df = audit_data_df.drop_duplicates() + audit_data_df.fillna("", inplace=True) + audit_data_df.reset_index(drop=True, inplace=True) + + verify_file_path = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250205134704.xlsx" + verify_data_sheet = "total_data" + verify_fields = [ + "DocumentId", + "raw_fund_name", + "management_fee_and_costs", + "management_fee", + "performance_fee", + "performance_fee_costs", + "buy_spread", + "sell_spread", + "minimum_initial_investment", + "recoverable_expenses", + 
"indirect_costs" + ] + verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet) + verify_data_df = verify_data_df[verify_fields] + verify_data_df = verify_data_df.drop_duplicates() + verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id", "raw_fund_name": "fund_name"}) + verify_data_df.fillna("", inplace=True) + verify_data_df.reset_index(drop=True, inplace=True) + + if len(audit_data_df) == 0 or len(verify_data_df) == 0: + print("No data to calculate metrics.") + return + + # Calculate metrics + gt_management_fee_and_costs_list = [] + pred_management_fee_and_costs_list = [] + gt_management_fee_list = [] + pred_management_fee_list = [] + gt_performance_fee_list = [] + pred_performance_fee_list = [] + gt_performance_fee_costs_list = [] + pred_performance_fee_costs_list = [] + gt_buy_spread_list = [] + pred_buy_spread_list = [] + gt_sell_spread_list = [] + pred_sell_spread_list = [] + gt_minimum_initial_investment_list = [] + pred_minimum_initial_investment_list = [] + gt_recoverable_expenses_list = [] + pred_recoverable_expenses_list = [] + gt_indirect_costs_list = [] + pred_indirect_costs_list = [] + + document_id_list = audit_data_df["doc_id"].unique().tolist() + + print(f"Total document count: {len(document_id_list)}") + print("Construct ground truth and prediction data...") + similarity = Similarity() + for document_id in document_id_list: + doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id] + doc_verify_data = verify_data_df[verify_data_df["doc_id"] == document_id] + for index, row in doc_audit_data.iterrows(): + fund_name = row["fund_name"] + fund_name_split = fund_name.lower().split() + management_fee_and_costs = str(row["management_fee_and_costs"]) + management_fee = str(row["management_fee"]) + performance_fee = str(row["performance_fee"]) + performance_fee_costs = str(row["performance_fee_costs"]) + buy_spread = str(row["buy_spread"]) + sell_spread = str(row["sell_spread"]) + 
minimum_initial_investment = str(row["minimum_initial_investment"]) + recoverable_expenses = str(row["recoverable_expenses"]) + indirect_costs = str(row["indirect_costs"]) + find_flag = False + for idx, r in doc_verify_data.iterrows(): + v_fund_name = r["fund_name"] + if fund_name == v_fund_name: + find_flag = True + else: + v_fund_name_split = v_fund_name.lower().split() + name_similarity = similarity.jaccard_similarity(fund_name_split, v_fund_name_split) + if name_similarity > 0.8: + find_flag = True + if find_flag: + v_management_fee_and_costs = str(r["management_fee_and_costs"]) + v_management_fee = str(r["management_fee"]) + v_performance_fee = str(r["performance_fee"]) + v_performance_fee_costs = str(r["performance_fee_costs"]) + v_buy_spread = str(r["buy_spread"]) + v_sell_spread = str(r["sell_spread"]) + v_minimum_initial_investment = str(r["minimum_initial_investment"]) + v_recoverable_expenses = str(r["recoverable_expenses"]) + v_indirect_costs = str(r["indirect_costs"]) + + get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list) + get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) + get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list) + get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) + get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) + get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + get_gt_pred_by_compare_values(recoverable_expenses, v_recoverable_expenses, 
gt_recoverable_expenses_list, pred_recoverable_expenses_list) + get_gt_pred_by_compare_values(indirect_costs, v_indirect_costs, gt_indirect_costs_list, pred_indirect_costs_list) + break + if not find_flag: + if management_fee_and_costs is not None and len(management_fee_and_costs) > 0: + gt_management_fee_and_costs_list.append(1) + pred_management_fee_and_costs_list.append(0) + if management_fee is not None and len(management_fee) > 0: + gt_management_fee_list.append(1) + pred_management_fee_list.append(0) + if performance_fee is not None and len(performance_fee) > 0: + gt_performance_fee_list.append(1) + pred_performance_fee_list.append(0) + if performance_fee_costs is not None and len(performance_fee_costs) > 0: + gt_performance_fee_costs_list.append(1) + pred_performance_fee_costs_list.append(0) + if buy_spread is not None and len(buy_spread) > 0: + gt_buy_spread_list.append(1) + pred_buy_spread_list.append(0) + if sell_spread is not None and len(sell_spread) > 0: + gt_sell_spread_list.append(1) + pred_sell_spread_list.append(0) + if minimum_initial_investment is not None and len(minimum_initial_investment) > 0: + gt_minimum_initial_investment_list.append(1) + pred_minimum_initial_investment_list.append(0) + if recoverable_expenses is not None and len(recoverable_expenses) > 0: + gt_recoverable_expenses_list.append(1) + pred_recoverable_expenses_list.append(0) + if indirect_costs is not None and len(indirect_costs) > 0: + gt_indirect_costs_list.append(1) + pred_indirect_costs_list.append(0) + + if is_strict: + for idx, r in doc_verify_data.iterrows(): + v_fund_name = r["fund_name"] + find_flag = False + for index, row in doc_audit_data.iterrows(): + fund_name = row["fund_name"] + if fund_name == v_fund_name: + find_flag = True + else: + v_fund_name_split = v_fund_name.lower().split() + fund_name_split = fund_name.lower().split() + name_similarity = similarity.jaccard_similarity(fund_name_split, v_fund_name_split) + if name_similarity > 0.8: + find_flag = True + 
if find_flag: + break + if not find_flag: + v_management_fee_and_costs = str(r["management_fee_and_costs"]) + v_management_fee = str(r["management_fee"]) + v_performance_fee = str(r["performance_fee"]) + v_performance_fee_costs = str(r["performance_fee_costs"]) + v_buy_spread = str(r["buy_spread"]) + v_sell_spread = str(r["sell_spread"]) + v_minimum_initial_investment = str(r["minimum_initial_investment"]) + v_recoverable_expenses = str(r["recoverable_expenses"]) + v_indirect_costs = str(r["indirect_costs"]) + + if v_management_fee_and_costs is not None and len(v_management_fee_and_costs) > 0: + gt_management_fee_and_costs_list.append(0) + pred_management_fee_and_costs_list.append(1) + if v_management_fee is not None and len(v_management_fee) > 0: + gt_management_fee_list.append(0) + pred_management_fee_list.append(1) + if v_performance_fee is not None and len(v_performance_fee) > 0: + gt_performance_fee_list.append(0) + pred_performance_fee_list.append(1) + if v_performance_fee_costs is not None and len(v_performance_fee_costs) > 0: + gt_performance_fee_costs_list.append(0) + pred_performance_fee_costs_list.append(1) + if v_buy_spread is not None and len(v_buy_spread) > 0: + gt_buy_spread_list.append(0) + pred_buy_spread_list.append(1) + if v_sell_spread is not None and len(v_sell_spread) > 0: + gt_sell_spread_list.append(0) + pred_sell_spread_list.append(1) + if v_minimum_initial_investment is not None and len(v_minimum_initial_investment) > 0: + gt_minimum_initial_investment_list.append(0) + pred_minimum_initial_investment_list.append(1) + if v_recoverable_expenses is not None and len(v_recoverable_expenses) > 0: + gt_recoverable_expenses_list.append(0) + pred_recoverable_expenses_list.append(1) + if v_indirect_costs is not None and len(v_indirect_costs) > 0: + gt_indirect_costs_list.append(0) + pred_indirect_costs_list.append(1) + + # calculate metrics + print("Calculate metrics...") + precision_management_fee_and_costs = 
precision_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + recall_management_fee_and_costs = recall_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + f1_management_fee_and_costs = f1_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + accuracy_management_fee_and_costs = accuracy_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + support_management_fee_and_costs = sum(gt_management_fee_and_costs_list) + + precision_management_fee = precision_score(gt_management_fee_list, pred_management_fee_list) + recall_management_fee = recall_score(gt_management_fee_list, pred_management_fee_list) + f1_management_fee = f1_score(gt_management_fee_list, pred_management_fee_list) + accuracy_management_fee = accuracy_score(gt_management_fee_list, pred_management_fee_list) + support_management_fee = sum(gt_management_fee_list) + + precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) + recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) + f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) + accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) + support_performance_fee = sum(gt_performance_fee_list) + + precision_performance_fee_costs = precision_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + recall_performance_fee_costs = recall_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + f1_performance_fee_costs = f1_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + accuracy_performance_fee_costs = accuracy_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + support_performance_fee_costs = sum(gt_performance_fee_costs_list) + + precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) + recall_buy_spread = 
recall_score(gt_buy_spread_list, pred_buy_spread_list) + f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) + accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) + support_buy_spread = sum(gt_buy_spread_list) + + precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) + recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) + f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) + accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) + support_buy_spread = sum(gt_sell_spread_list) + + precision_minimum_initial_investment = precision_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + recall_minimum_initial_investment = recall_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + f1_minimum_initial_investment = f1_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + accuracy_minimum_initial_investment = accuracy_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + support_minimum_initial_investment = sum(gt_minimum_initial_investment_list) + + precision_recoverable_expenses = precision_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + recall_recoverable_expenses = recall_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + f1_recoverable_expenses = f1_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + accuracy_recoverable_expenses = accuracy_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + support_recoverable_expenses = sum(gt_recoverable_expenses_list) + + precision_indirect_costs = precision_score(gt_indirect_costs_list, pred_indirect_costs_list) + recall_indirect_costs = recall_score(gt_indirect_costs_list, pred_indirect_costs_list) + f1_indirect_costs = f1_score(gt_indirect_costs_list, pred_indirect_costs_list) + accuracy_indirect_costs 
= accuracy_score(gt_indirect_costs_list, pred_indirect_costs_list) + support_indirect_costs = sum(gt_indirect_costs_list) + + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"item": "performance_fee_costs", "precision": precision_performance_fee_costs, "recall": recall_performance_fee_costs, "f1": f1_performance_fee_costs, "accuracy": accuracy_performance_fee_costs, "support": support_performance_fee_costs}, + {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + {"item": "minimum_initial_investment", "precision": precision_minimum_initial_investment, "recall": recall_minimum_initial_investment, "f1": f1_minimum_initial_investment, "accuracy": accuracy_minimum_initial_investment, "support": support_minimum_initial_investment}, + {"item": "recoverable_expenses", "precision": precision_recoverable_expenses, "recall": recall_recoverable_expenses, "f1": f1_recoverable_expenses, "accuracy": accuracy_recoverable_expenses, "support": support_recoverable_expenses}, + {"item": "indirect_costs", "precision": precision_indirect_costs, "recall": 
recall_indirect_costs, "f1": f1_indirect_costs, "accuracy": accuracy_indirect_costs, "support": support_indirect_costs}] + metrics_data_df = pd.DataFrame(metrics_data) + averate_precision = metrics_data_df["precision"].mean() + average_recall = metrics_data_df["recall"].mean() + average_f1 = metrics_data_df["f1"].mean() + average_accuracy = metrics_data_df["accuracy"].mean() + sum_support = metrics_data_df["support"].sum() + metrics_data.append({"item": "average_score", "precision": averate_precision, "recall": average_recall, "f1": average_f1, "accuracy": average_accuracy, "support": sum_support}) + metrics_data_df = pd.DataFrame(metrics_data) + metrics_data_df = metrics_data_df[['item', 'f1', 'precision', 'recall', 'accuracy', 'support']] + + # output metrics data to Excel file + print("Output metrics data to Excel file...") + output_folder = r"/data/aus_prospectus/output/metrics_data/" + os.makedirs(output_folder, exist_ok=True) + verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") + if is_strict: + verify_file_name = f"metrics_{verify_file_name}_revised_strict.xlsx" + else: + verify_file_name = f"metrics_{verify_file_name}_revised_not_strict.xlsx" + output_file = os.path.join(output_folder, verify_file_name) + with pd.ExcelWriter(output_file) as writer: + metrics_data_df.to_excel(writer, index=False) + + +def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx", + audit_data_sheet: str = "Sheet1", + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx", + verify_data_sheet: str = "total_data" + ): + print("Start to calculate metrics based on DB data file and extracted file...") + audit_data_df = pd.DataFrame() + verify_data_df = pd.DataFrame() + + audit_fields = [ + "DocumentId", + "FundLegalName", + "FundId", + "FundClassLegalName", + 
"FundClassId", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee", + "interposed_vehicle_performance_fee_cost", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges" + # "withdrawal_fee", + # "switching_fee", + # "activity_fee", + + ] + audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet) + audit_data_df = audit_data_df[audit_fields] + audit_data_df = audit_data_df.drop_duplicates() + audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id", + "FundLegalName": "fund_name", + "FundId": "fund_id", + "FundClassLegalName": "sec_name", + "FundClassId": "sec_id"}) + audit_data_df.fillna("", inplace=True) + audit_data_df.reset_index(drop=True, inplace=True) + + # verify_file_path = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250205134704.xlsx" + # ravi_verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" + # verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" + verify_fields = [ + "DocumentId", + "raw_fund_name", + "fund_id", + "fund_name", + "raw_share_name", + "sec_id", + "sec_name", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee", + "interposed_vehicle_performance_fee_cost", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges" + # "withdrawal_fee", + # "switching_fee", + # "activity_fee" + ] + verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet) + # ravi_verify_data_df = pd.read_excel(ravi_verify_file_path, sheet_name=verify_data_sheet) + + # only get raw_verify_data_df data which sec_id is equal with sec_id in ravi_verify_data_df + # verify_data_df = 
raw_verify_data_df[raw_verify_data_df["sec_id"].isin(ravi_verify_data_df["sec_id"])] + verify_data_df = verify_data_df[verify_fields] + verify_data_df = verify_data_df.drop_duplicates() + verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) + verify_data_df.fillna("", inplace=True) + verify_data_df.reset_index(drop=True, inplace=True) + + if len(audit_data_df) == 0 or len(verify_data_df) == 0: + print("No data to calculate metrics.") + return + + # Calculate metrics + gt_management_fee_and_costs_list = [] + pred_management_fee_and_costs_list = [] + gt_management_fee_list = [] + pred_management_fee_list = [] + gt_administration_fees_list = [] + pred_administration_fees_list = [] + gt_minimum_initial_investment_list = [] + pred_minimum_initial_investment_list = [] + gt_benchmark_name_list = [] + pred_benchmark_name_list = [] + gt_performance_fee_list = [] + pred_performance_fee_list = [] + gt_interposed_vehicle_performance_fee_cost_list = [] + pred_interposed_vehicle_performance_fee_cost_list = [] + gt_buy_spread_list = [] + pred_buy_spread_list = [] + gt_sell_spread_list = [] + pred_sell_spread_list = [] + gt_total_annual_dollar_based_charges_list = [] + pred_total_annual_dollar_based_charges_list = [] + + # gt_performance_fee_costs_list = [] + # pred_performance_fee_costs_list = [] + # gt_buy_spread_list = [] + # pred_buy_spread_list = [] + # gt_sell_spread_list = [] + # pred_sell_spread_list = [] + # gt_withdrawal_fee_list = [] + # pred_withdrawal_fee_list = [] + # gt_switching_fee_list = [] + # pred_switching_fee_list = [] + # gt_activity_fee_list = [] + # pred_activity_fee_list = [] + + document_id_list = verify_data_df["doc_id"].unique().tolist() + + print(f"Total document count: {len(document_id_list)}") + print("Construct ground truth and prediction data...") + # similarity = Similarity() + message_list = [] + for document_id in document_id_list: + doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id] + audit_sec_id_list = 
[doc_sec_id for doc_sec_id + in doc_audit_data["sec_id"].unique().tolist() + if len(doc_sec_id) > 0] + # get doc_verify_data which doc_id is same as document_id and sec_id in audit_sec_id_list + doc_verify_data = verify_data_df[(verify_data_df["doc_id"] == document_id) & (verify_data_df["sec_id"].isin(audit_sec_id_list))] + for index, row in doc_audit_data.iterrows(): + fund_name = row["fund_name"] + sec_id = row["sec_id"] + management_fee_and_costs = str(row["management_fee_and_costs"]) + management_fee = str(row["management_fee"]) + administration_fees = str(row["administration_fees"]) + minimum_initial_investment = str(row["minimum_initial_investment"]) + benchmark_name = str(row["benchmark_name"]) + performance_fee = str(row["performance_fee"]) + interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) + buy_spread = str(row["buy_spread"]) + sell_spread = str(row["sell_spread"]) + total_annual_dollar_based_charges = str(row["total_annual_dollar_based_charges"]) + + # get the first row which sec_id in doc_verify_data is same as sec_id + doc_verify_sec_data = doc_verify_data[doc_verify_data["sec_id"] == sec_id] + if len(doc_verify_sec_data) == 0: + continue + doc_verify_sec_row = doc_verify_sec_data.iloc[0] + raw_fund_name = doc_verify_sec_row["raw_fund_name"] + v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"]) + v_management_fee = str(doc_verify_sec_row["management_fee"]) + v_administration_fees = str(doc_verify_sec_row["administration_fees"]) + v_minimum_initial_investment = str(doc_verify_sec_row["minimum_initial_investment"]) + v_benchmark_name = str(doc_verify_sec_row["benchmark_name"]) + v_performance_fee = str(doc_verify_sec_row["performance_fee"]) + v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) + v_buy_spread = str(doc_verify_sec_row["buy_spread"]) + v_sell_spread = str(doc_verify_sec_row["sell_spread"]) + 
v_total_annual_dollar_based_charges = str(doc_verify_sec_row["total_annual_dollar_based_charges"]) + + # v_performance_fee_costs = str(doc_verify_sec_row["performance_fee_costs"]) + # v_buy_spread = str(doc_verify_sec_row["buy_spread"]) + # v_sell_spread = str(doc_verify_sec_row["sell_spread"]) + # v_withdrawal_fee = str(doc_verify_sec_row["withdrawal_fee"]) + # v_switching_fee = str(doc_verify_sec_row["switching_fee"]) + # v_activity_fee = str(doc_verify_sec_row["activity_fee"]) + + message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee_and_costs")) + message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee")) + message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "administration_fees")) + message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment")) + message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name")) + 
message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) + message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, + gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) + message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) + message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) + message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, + gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) + # message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee")) + # message = get_gt_pred_by_compare_values(switching_fee, v_switching_fee, gt_switching_fee_list, pred_switching_fee_list) + # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "switching_fee")) + # message = get_gt_pred_by_compare_values(activity_fee, v_activity_fee, gt_activity_fee_list, pred_activity_fee_list) + # 
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "activity_fee")) + + message_data_df = pd.DataFrame(message_list) + message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', 'data_point', 'gt_value', 'pred_value', 'error']] + # order by doc_id, raw_fund_name, data_point + message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_fund_name', 'data_point']) + message_data_df.reset_index(drop=True, inplace=True) + + # calculate metrics + print("Calculate metrics...") + precision_management_fee_and_costs = precision_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + recall_management_fee_and_costs = recall_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + f1_management_fee_and_costs = f1_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + accuracy_management_fee_and_costs = accuracy_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + support_management_fee_and_costs = sum(gt_management_fee_and_costs_list) + + precision_management_fee = precision_score(gt_management_fee_list, pred_management_fee_list) + recall_management_fee = recall_score(gt_management_fee_list, pred_management_fee_list) + f1_management_fee = f1_score(gt_management_fee_list, pred_management_fee_list) + accuracy_management_fee = accuracy_score(gt_management_fee_list, pred_management_fee_list) + support_management_fee = sum(gt_management_fee_list) + + precision_administration_fees = precision_score(gt_administration_fees_list, pred_administration_fees_list) + recall_administration_fees = recall_score(gt_administration_fees_list, pred_administration_fees_list) + f1_administration_fees = f1_score(gt_administration_fees_list, pred_administration_fees_list) + accuracy_administration_fees = accuracy_score(gt_administration_fees_list, pred_administration_fees_list) + support_administration_fees = 
sum(gt_administration_fees_list) + + precision_miminimum_initial_investment = precision_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + recall_miminimum_initial_investment = recall_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + f1_miminimum_initial_investment = f1_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + accuracy_miminimum_initial_investment = accuracy_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + support_miminimum_initial_investment = sum(gt_minimum_initial_investment_list) + + precision_benchmark_name = precision_score(gt_benchmark_name_list, pred_benchmark_name_list) + recall_benchmark_name = recall_score(gt_benchmark_name_list, pred_benchmark_name_list) + f1_benchmark_name = f1_score(gt_benchmark_name_list, pred_benchmark_name_list) + accuracy_benchmark_name = accuracy_score(gt_benchmark_name_list, pred_benchmark_name_list) + support_benchmark_name = sum(gt_benchmark_name_list) + + precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) + recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) + f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) + accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) + support_performance_fee = sum(gt_performance_fee_list) + + precision_interposed_vehicle_performance_fee_cost = precision_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + recall_interposed_vehicle_performance_fee_cost = recall_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + f1_interposed_vehicle_performance_fee_cost = f1_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + 
accuracy_interposed_vehicle_performance_fee_cost = accuracy_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + support_interposed_vehicle_performance_fee_cost = sum(gt_interposed_vehicle_performance_fee_cost_list) + + precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) + recall_buy_spread = recall_score(gt_buy_spread_list, pred_buy_spread_list) + f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) + accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) + support_buy_spread = sum(gt_buy_spread_list) + + precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) + recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) + f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) + accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) + support_buy_spread = sum(gt_sell_spread_list) + + precision_total_annual_dollar_based_charges = precision_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + recall_total_annual_dollar_based_charges = recall_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + f1_total_annual_dollar_based_charges = f1_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + accuracy_total_annual_dollar_based_charges = accuracy_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + support_total_annual_dollar_based_charges = sum(gt_total_annual_dollar_based_charges_list) + + # precision_withdrawal_fee = precision_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # recall_withdrawal_fee = recall_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # f1_withdrawal_fee = f1_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # accuracy_withdrawal_fee = 
accuracy_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # support_withdrawal_fee = sum(gt_withdrawal_fee_list) + + # precision_switching_fee = precision_score(gt_switching_fee_list, pred_switching_fee_list) + # recall_switching_fee = recall_score(gt_switching_fee_list, pred_switching_fee_list) + # f1_switching_fee = f1_score(gt_switching_fee_list, pred_switching_fee_list) + # accuracy_switching_fee = accuracy_score(gt_switching_fee_list, pred_switching_fee_list) + # support_switching_fee = sum(gt_switching_fee_list) + + # precision_activity_fee = precision_score(gt_activity_fee_list, pred_activity_fee_list) + # recall_activity_fee = recall_score(gt_activity_fee_list, pred_activity_fee_list) + # f1_activity_fee = f1_score(gt_activity_fee_list, pred_activity_fee_list) + # accuracy_activity_fee = accuracy_score(gt_activity_fee_list, pred_activity_fee_list) + # support_activity_fee = sum(gt_activity_fee_list) + + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, + {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, + {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": 
recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, + {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, + "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": support_interposed_vehicle_performance_fee_cost}, + {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + {"item": "total_annual_dollar_based_charges", "precision": precision_total_annual_dollar_based_charges, "recall": recall_total_annual_dollar_based_charges, + "f1": f1_total_annual_dollar_based_charges, "accuracy": accuracy_total_annual_dollar_based_charges, "support": support_total_annual_dollar_based_charges} + # {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + # {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + # {"item": "withdrawal_fee", "precision": precision_withdrawal_fee, "recall": recall_withdrawal_fee, "f1": f1_withdrawal_fee, "accuracy": accuracy_withdrawal_fee, "support": support_withdrawal_fee}, + # {"item": "switching_fee", "precision": precision_switching_fee, "recall": 
def generate_message(message: dict, doc_id: str, sec_id: str, fund_legal_name: str, raw_fund_name: str, datapoint: str):
    """Annotate a comparison *message* in place with document/fund identifiers.

    Returns the same dict for convenient chaining into a message list.
    """
    message.update({
        "data_point": datapoint,
        "fund_legal_name": fund_legal_name,
        "raw_fund_name": raw_fund_name,
        "sec_id": sec_id,
        "doc_id": str(doc_id),  # normalized to str so Excel/IDs compare uniformly
    })
    return message


def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""):
    """Compare one ground-truth value with one predicted value and record the
    outcome as binary labels in *gt_list* / *pred_list* (both mutated in place).

    Labelling scheme:
      * gt present, values match            -> gt 1 / pred 1 (true positive)
      * gt present, mismatch                -> gt 1 / pred 0 (false negative),
        plus gt 0 / pred 1 when the wrong prediction is non-empty
        (it also counts as a false positive)
      * gt empty, pred non-empty            -> gt 0 / pred 1 (false positive)
      * both empty                          -> nothing recorded

    Returns a message dict holding both raw values and an error description
    (empty string when the values agree or both are empty).
    """
    message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
    gt_present = gt_value is not None and len(str(gt_value)) > 0
    pred_present = pred_value is not None and len(str(pred_value)) > 0
    if gt_present:
        gt_list.append(1)
        if is_equal(gt_value, pred_value, data_point):
            pred_list.append(1)
        else:
            pred_list.append(0)
            message["error"] = "pred_value is not equal to gt_value"
            if pred_present:
                # A non-empty wrong prediction is additionally a false positive.
                pred_list.append(1)
                gt_list.append(0)
    elif pred_present:
        gt_list.append(0)
        pred_list.append(1)
        message["error"] = "gt_value is empty, but pred_value is not empty"
    return message


def is_equal(gt_value, pred_value, data_point: str = ""):
    """Return True when the two values are considered equivalent.

    Exact equality is required for every data point except "benchmark_name",
    which additionally accepts equality/substring containment after
    punctuation cleanup, or a Jaccard word-set similarity above 0.8.
    Empty or None values never match.
    """
    gt_present = gt_value is not None and len(str(gt_value)) > 0
    pred_present = pred_value is not None and len(str(pred_value)) > 0
    if not (gt_present and pred_present):
        return False
    if gt_value == pred_value:
        return True
    if data_point != "benchmark_name":
        return False
    cleaned_gt = clean_text(gt_value)
    cleaned_pred = clean_text(pred_value)
    if cleaned_gt == cleaned_pred or cleaned_gt in cleaned_pred or cleaned_pred in cleaned_gt:
        return True
    score = Similarity().jaccard_similarity(cleaned_gt.lower().split(), cleaned_pred.lower().split())
    return score > 0.8


def clean_text(text: str):
    """Replace non-word characters with spaces and collapse whitespace runs.

    None and empty strings are returned unchanged; the result is NOT stripped
    of leading/trailing spaces (callers rely on substring comparison only).
    """
    if not text:
        return text
    return re.sub(r"\s+", " ", re.sub(r"\W", " ", text))
def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx",
                                 data_sheet: str = "Sheet1",
                                 raw_name_column: str = "raw_share_name",
                                 mapping_file_path: str = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx",
                                 mapping_sheet: str = "document_mapping",
                                 raw_name_mapping_column: str = None,
                                 output_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"):
    """Join extracted raw fund/share names with DB identifiers and write the
    enriched workbook to *output_file_path*.

    For every document in the data workbook, each raw name in
    *raw_name_column* is resolved to FundClassId/FundId/legal names from the
    mapping workbook.  When *raw_name_mapping_column* is "FundLegalName" the
    match is an exact column lookup (only unambiguous single-row matches are
    applied); otherwise fuzzy matching via get_raw_name_db_match_result is
    used against the FundClassLegalName candidates.

    Notes on fixes versus the previous revision:
      * the dead `doc_db_name_list` assignment in the exact-match branch was
        removed (it was computed but never used there);
      * the document/raw-name row mask is now computed once per raw name
        instead of four times (once per assigned column);
      * redundant `is not None` checks on DataFrame slices were dropped
        (column selection always returns a DataFrame, never None).
    """
    data_df = pd.read_excel(data_file_path, sheet_name=data_sheet)
    # New columns to be filled from the mapping workbook; empty string means "unmapped".
    for col in ("fund_id", "fund_name", "sec_id", "sec_name"):
        data_df[col] = ""

    mapping_data = pd.read_excel(mapping_file_path, sheet_name=mapping_sheet)

    def _assign(doc_id, raw_name, sec_id, sec_name, fund_id, fund_name):
        # One boolean mask per raw name; reused for all four column writes.
        mask = (data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name)
        data_df.loc[mask, "sec_id"] = sec_id
        data_df.loc[mask, "sec_name"] = sec_name
        data_df.loc[mask, "fund_id"] = fund_id
        data_df.loc[mask, "fund_name"] = fund_name

    for doc_id in data_df["doc_id"].unique().tolist():
        doc_data = data_df[data_df["doc_id"] == doc_id]
        raw_name_list = doc_data[raw_name_column].unique().tolist()

        doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id]
        if len(doc_mapping_data) == 0:
            continue  # document has no mapping rows at all
        provider_name = doc_mapping_data["CompanyName"].values[0]

        if raw_name_mapping_column is not None and raw_name_mapping_column == "FundLegalName":
            # Exact lookup against the configured column; apply only unambiguous matches.
            for raw_name in raw_name_list:
                find_df = doc_mapping_data[doc_mapping_data[raw_name_mapping_column] == raw_name]
                if len(find_df) == 1:
                    _assign(doc_id, raw_name,
                            find_df["FundClassId"].values[0],
                            find_df["FundClassLegalName"].values[0],
                            find_df["FundId"].values[0],
                            find_df["FundLegalName"].values[0])
        else:
            # Fuzzy matching of raw names against the DB share-class names.
            doc_db_name_list = doc_mapping_data["FundClassLegalName"].unique().tolist()
            all_match_result = get_raw_name_db_match_result(doc_id,
                                                            provider_name,
                                                            raw_name_list,
                                                            doc_db_name_list,
                                                            iter_count=60)
            for raw_share_name in raw_name_list:
                matched_db_share_name = all_match_result.get(raw_share_name)
                if matched_db_share_name is not None and len(matched_db_share_name) > 0:
                    find_share_df = doc_mapping_data[doc_mapping_data["FundClassLegalName"] == matched_db_share_name]
                    if len(find_share_df) > 0:
                        _assign(doc_id, raw_share_name,
                                find_share_df["FundClassId"].values[0],
                                matched_db_share_name,
                                find_share_df["FundId"].values[0],
                                find_share_df["FundLegalName"].values[0])

    try:
        # Best-effort column reordering; the source workbook may lack some columns,
        # in which case the original column order is kept.
        data_df = data_df[["doc_id",
                           "raw_fund_name",
                           "fund_id",
                           "fund_name",
                           "raw_share_name",
                           "sec_id",
                           "sec_name",
                           "management_fee_and_costs",
                           "management_fee",
                           "administration_fees",
                           "minimum_initial_investment",
                           "benchmark_name",
                           "performance_fee",
                           "performance_fee_charged",
                           "buy_spread",
                           "sell_spread",
                           "total_annual_dollar_based_charges",
                           "interposed_vehicle_performance_fee_cost",
                           "establishment_fee",
                           "contribution_fee",
                           "withdrawal_fee",
                           "exit_fee",
                           "switching_fee",
                           "activity_fee",
                           "hurdle_rate",
                           "analyst_name"
                           ]]
    except Exception as e:
        print(e)

    with open(output_file_path, "wb") as file:
        data_df.to_excel(file, index=False)
def get_raw_name_db_match_result(
        doc_id: str, provider_name: str, raw_name_list: list, doc_share_name_list: list, iter_count: int = 30
):
    """Match raw share names against DB share-class names in batches.

    The raw names are split into chunks of *iter_count* elements to avoid
    token-limit issues when invoking ChatGPT-based matching; names matched by
    one chunk are removed from the DB candidate list before the next chunk
    runs.  The caller's *doc_share_name_list* is never mutated.

    Returns a dict mapping each raw name to its matched DB name ("" when no
    candidate is available).
    """
    # NOTE: the loop variable no longer shadows the raw_name_list parameter.
    chunks = [
        raw_name_list[i: i + iter_count]
        for i in range(0, len(raw_name_list), iter_count)
    ]
    all_match_result = {}
    remaining_db_names = deepcopy(doc_share_name_list)  # protect caller's list
    for chunk in chunks:
        match_result, remaining_db_names = get_final_function_to_match(
            doc_id, provider_name, chunk, remaining_db_names
        )
        all_match_result.update(match_result)
    return all_match_result


def get_final_function_to_match(doc_id, provider_name, raw_name_list, db_name_list):
    """Run one matching round for a chunk of raw names.

    Returns (match_result, remaining_db_names): matched DB names are removed
    from *db_name_list* so later chunks cannot reuse them.  With an empty
    candidate list every raw name maps to "".
    """
    if len(db_name_list) == 0:
        match_result = {raw_name: "" for raw_name in raw_name_list}
    else:
        match_result = final_function_to_match(
            doc_id=doc_id,
            pred_list=raw_name_list,
            db_list=db_name_list,
            provider_name=provider_name,
            doc_source="aus_prospectus"
        )
    db_name_list = remove_matched_names(db_name_list, list(match_result.values()))
    return match_result, db_name_list


def remove_matched_names(target_name_list: list, matched_name_list: list):
    """Remove every non-empty matched name from *target_name_list* in place.

    Duplicates, None and empty strings in *matched_name_list* are ignored.
    Returns the (mutated) target list.
    """
    for matched_name in set(matched_name_list):
        # Truthiness check covers both None and "" in a single test.
        if matched_name and matched_name in target_name_list:
            target_name_list.remove(matched_name)
    return target_name_list
def set_mapping_to_ravi_data():
    """Attach DB fund/share mapping to the Ravi 100-document extraction output."""
    data_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx"
    data_sheet = "Sheet1"
    mapping_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
    mapping_sheet = "document_mapping"
    output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
    # BUG FIX: pass by keyword.  set_mapping_to_raw_name_data has
    # raw_name_column and raw_name_mapping_column between these parameters,
    # so the previous positional call bound mapping_file_path to
    # raw_name_column and shifted every later argument one slot to the left.
    set_mapping_to_raw_name_data(data_file_path=data_file_path,
                                 data_sheet=data_sheet,
                                 mapping_file_path=mapping_file_path,
                                 mapping_sheet=mapping_sheet,
                                 output_file_path=output_file_path)


def set_mapping_to_data_side_documents_data():
    """Attach DB fund/share mapping to the 46-document ground-truth workbook."""
    data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx"
    data_sheet = "ground_truth"
    raw_name_column = "raw_share_name"
    mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
    mapping_sheet = "document_mapping"
    raw_name_mapping_column = None
    output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
    set_mapping_to_raw_name_data(data_file_path=data_file_path,
                                 data_sheet=data_sheet,
                                 raw_name_column=raw_name_column,
                                 mapping_file_path=mapping_file_path,
                                 mapping_sheet=mapping_sheet,
                                 raw_name_mapping_column=raw_name_mapping_column,
                                 output_file_path=output_file_path)


def adjust_data_file(source_file: str,
                     targe_file: str):
    """Merge *source_file* rows into *targe_file*, source rows taking precedence.

    Target rows whose DocumentId appears in the source workbook are dropped,
    the source rows are prepended, and the result is written back to
    *targe_file*.  (The misspelled parameter name ``targe_file`` is kept for
    backward compatibility with existing keyword callers.)
    """
    source_data = pd.read_excel(source_file, sheet_name="Sheet1")
    source_doc_id_list = source_data["DocumentId"].unique().tolist()

    target_data = pd.read_excel(targe_file, sheet_name="Sheet1")
    # Drop target rows superseded by the source file, then prepend source rows.
    target_data = target_data[~target_data["DocumentId"].isin(source_doc_id_list)]
    target_data = pd.concat([source_data, target_data], ignore_index=True)
    with open(targe_file, "wb") as file:
        target_data.to_excel(file, index=False)


if __name__ == "__main__":
    # Compare the 46-document ground-truth workbook against the latest
    # verification export and write per-datapoint metrics to Excel.
    audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
    audit_data_sheet: str = "Sheet1"
    verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250305160321_ravi.xlsx"
    verify_data_sheet: str = "total_data"
    calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
                                         audit_data_sheet=audit_data_sheet,
                                         verify_file_path=verify_file_path,
                                         verify_data_sheet=verify_data_sheet)