2025-03-05 15:57:02 +00:00
|
|
|
import os
|
|
|
|
|
from time import sleep
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from glob import glob
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
import numpy as np
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
import re
|
|
|
|
|
import json
|
|
|
|
|
import traceback
|
|
|
|
|
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
|
|
|
|
|
import requests
|
|
|
|
|
import fitz
|
|
|
|
|
from utils.similarity import Similarity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _first_match_flags(fund_name, auum, tor, candidates):
    """
    Compare one row against the first candidate row carrying the same fund name.

    :param fund_name: fund name of the row being looked up.
    :param auum: "auum" value of the row being looked up.
    :param tor: "tor" value of the row being looked up.
    :param candidates: list of (fund_name, auum, tor) tuples to search in order.
    :return: tuple (auum_equal, tor_equal); (False, False) when no candidate
        shares the fund name.
    """
    for cand_name, cand_auum, cand_tor in candidates:
        if fund_name == cand_name:
            # Only the first row with the same fund name is considered.
            return bool(auum == cand_auum), bool(tor == cand_tor)
    return False, False


def _binary_scores(gt_labels, pred_labels):
    """
    Compute precision, recall, F1, accuracy and support for two 0/1 label lists.
    """
    return {
        "precision": precision_score(gt_labels, pred_labels),
        "recall": recall_score(gt_labels, pred_labels),
        "f1": f1_score(gt_labels, pred_labels),
        "accuracy": accuracy_score(gt_labels, pred_labels),
        "support": sum(gt_labels),
    }


def calc_metrics(ground_truth_file: str, prediction_file: str):
    """
    Calculate metrics by comparing ground truth and prediction files.

    Both Excel files must contain the columns "fund_name", "auum" and "tor".
    Rows are paired by exact fund name (first match wins) and the "auum" and
    "tor" values are compared with ``==``. The resulting 0/1 label lists are
    scored with scikit-learn and the scores are printed to stdout.

    :param ground_truth_file: str, path of the ground truth Excel file.
    :param prediction_file: str, path of the prediction Excel file.
    :raises FileNotFoundError: when either file does not exist.
    :raises KeyError: when a required column is missing from either file.
    """
    if not os.path.exists(ground_truth_file):
        raise FileNotFoundError(f"File not found: {ground_truth_file}")
    if not os.path.exists(prediction_file):
        raise FileNotFoundError(f"File not found: {prediction_file}")

    ground_truth_df = pd.read_excel(ground_truth_file)
    prediction_df = pd.read_excel(prediction_file)

    # Fail early with a readable message instead of a bare KeyError mid-loop.
    columns = ["fund_name", "auum", "tor"]
    for label, frame in (("ground truth", ground_truth_df), ("prediction", prediction_df)):
        missing = [column for column in columns if column not in frame.columns]
        if missing:
            raise KeyError(f"Missing columns {missing} in {label} file")

    # Materialize the rows once; the lookups below scan these plain tuples
    # instead of re-running DataFrame.iterrows() for every outer row.
    gt_rows = list(zip(*(ground_truth_df[column] for column in columns)))
    pred_rows = list(zip(*(prediction_df[column] for column in columns)))

    gt_auum_list = []
    pred_auum_list = []
    gt_tor_list = []
    pred_tor_list = []

    # Check whether the ground truth file contains the same values as the prediction file
    # The purpose is to calculate Recall
    for fund_name, auum, tor in gt_rows:
        auum_found, tor_found = _first_match_flags(fund_name, auum, tor, pred_rows)
        gt_auum_list.append(1)
        pred_auum_list.append(1 if auum_found else 0)
        gt_tor_list.append(1)
        pred_tor_list.append(1 if tor_found else 0)

    # Check whether the prediction file contains the same values as the ground truth file
    # The purpose is to calculate Precision
    for fund_name, auum, tor in pred_rows:
        auum_found, tor_found = _first_match_flags(fund_name, auum, tor, gt_rows)
        # Matched predictions were already counted in the recall pass, so only
        # the unmatched ones are added here (as false positives).
        if not auum_found:
            gt_auum_list.append(0)
            pred_auum_list.append(1)
        if not tor_found:
            gt_tor_list.append(0)
            pred_tor_list.append(1)

    auum_scores = _binary_scores(gt_auum_list, pred_auum_list)
    tor_scores = _binary_scores(gt_tor_list, pred_tor_list)

    print(f"AUUM Support: {auum_scores['support']}")
    print(f"F1 AUUM: {auum_scores['f1']}")
    print(f"Precision AUUM: {auum_scores['precision']}")
    print(f"Recall AUUM: {auum_scores['recall']}")
    print(f"Accuracy AUUM: {auum_scores['accuracy']}\n")

    print(f"TOR Support: {tor_scores['support']}")
    print(f"F1 TOR: {tor_scores['f1']}")
    print(f"Precision TOR: {tor_scores['precision']}")
    print(f"Recall TOR: {tor_scores['recall']}")
    print(f"Accuracy TOR: {tor_scores['accuracy']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def transform_pdf_2_image(
    folder: str = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/",
    pdf_file: str = r"Pay_Date_2025-02-14.pdf",
    dpi: int = 300,
):
    """
    Transform pdf to image.

    Every page of the PDF is rendered and saved as
    "<pdf name without .pdf>_<page number>.png" in the same folder as the PDF.
    Page numbers start at 0.

    :param folder: str, the folder that contains the pdf file and receives the images.
    :param pdf_file: str, the file name of the pdf inside the folder.
    :param dpi: int, the resolution passed to the page renderer.
    """
    import fitz

    pdf_path = os.path.join(folder, pdf_file)
    pdf_file_pure_name = pdf_file.replace(".pdf", "")

    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as pdf_doc:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc.load_page(page_num)
            image = page.get_pixmap(dpi=dpi)
            image_path = os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png")
            image.save(image_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def invoke_api_demo(doc_id: str = "407881493"):
    """
    Post a document id to the local emea_ar model endpoint and store the JSON reply.

    The reply is printed and written to
    /data/emea_ar/output/extract_data_by_api/<doc_id>.json. Any exception
    raised while calling the endpoint, parsing the reply or writing the file
    is caught and printed; nothing is re-raised.

    :param doc_id: str, the document id sent as the "doc_id" field of the request body.
    """
    print(f"Start to invoke API for document: {doc_id}")

    # url = 'https://internal-ts00006-stg-dcms-gpt-765982576.us-east-1.elb.amazonaws.com/automation/api/model/us_ar'
    endpoint = "http://127.0.0.1:8080/automation/api/model/emea_ar"
    request_headers = {"connection": "keep-alive", "content-type": "application/json"}
    payload = {"doc_id": doc_id}

    try:
        reply = requests.post(endpoint, json=payload, headers=request_headers)
        print("API response status code: {0}".format(reply.status_code))

        parsed_reply = json.loads(reply.text)
        print(parsed_reply)

        output_dir = r"/data/emea_ar/output/extract_data_by_api/"
        os.makedirs(output_dir, exist_ok=True)
        target_path = os.path.join(output_dir, f"{doc_id}.json")
        with open(target_path, "w", encoding="utf-8") as handle:
            json.dump(parsed_reply, handle, indent=4)
    except Exception as error:
        # Best-effort demo call: report the failure and carry on.
        print("Meet exception: {0}".format(error))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def batch_run_documents():
    """
    Call invoke_api_demo once for each document id in a fixed list, in order.
    """
    batch_doc_ids = [
        "292989214", "316237292", "321733631", "323390570",
        "327956364", "333207452", "334718372", "344636875",
        "362246081", "366179419", "380945052", "382366116",
        "387202452", "389171486", "391456740", "391736837",
        "394778487", "401684600", "402113224", "402181770",
        "402397014", "405803396", "445102363", "445256897",
        "448265376", "449555622", "449623976", "458291624",
        "458359181", "463081566", "469138353", "471641628",
        "476492237", "478585901", "478586066", "479042264",
        "479793787", "481475385", "483617247", "486378555",
        "486383912", "492121213", "497497599", "502693599",
        "502821436", "503194284", "506559375", "507967525",
        "508854243", "509845549", "520879048", "529925114",
    ]
    for current_doc_id in batch_doc_ids:
        invoke_api_demo(current_doc_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove_ter_ogc_performance_fee_annotation():
    """
    Drop "ter", "ogc" and "performance_fee" annotations from the saved API results.

    Every *.json file in /data/emea_ar/output/extract_data_by_api/ is loaded,
    the entries of its "annotation_data" list whose "data_point" is one of the
    three names are removed, and the file is written back only when at least
    one entry was removed.
    """
    results_dir = r"/data/emea_ar/output/extract_data_by_api/"
    os.makedirs(results_dir, exist_ok=True)

    unwanted_data_points = ["ter", "ogc", "performance_fee"]

    # Walk over every json file in the results folder.
    for result_path in glob(os.path.join(results_dir, "*.json")):
        with open(result_path, "r", encoding="utf-8") as reader:
            document = json.load(reader)

        annotations = document["annotation_data"]
        kept = [
            entry
            for entry in annotations
            if entry["data_point"] not in unwanted_data_points
        ]
        if len(kept) == len(annotations):
            # Nothing to remove, leave the file untouched.
            continue

        # Mutate the list in place so the loaded document sees the change.
        annotations[:] = kept
        with open(result_path, "w", encoding="utf-8") as writer:
            json.dump(document, writer, indent=4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def output_part_of_pages(pdf_file: str, page_list: list, output_folder: str):
    """
    Output part of pages from a pdf file to new pdf file.

    The selected pages are copied, in the order given, into a new document
    that is saved as "<pdf name without .pdf>_part.pdf" in the output folder.

    :param pdf_file: str, the path of the pdf file.
    :param page_list: list, the page number list (passed to insert_pdf as
        from_page/to_page, so numbers are in PyMuPDF's page indexing).
    :param output_folder: str, the output folder; when None or empty,
        "./data/emea_ar/output/pdf_part/" is used.
    """
    pdf_file_pure_name = os.path.basename(pdf_file).replace(".pdf", "")

    # Both documents are closed on exit, including when a page copy or the
    # save fails; the original left them open.
    with fitz.open(pdf_file) as pdf_doc, fitz.open() as new_pdf:
        print(f"output pages: {page_list} for {pdf_file_pure_name}")
        for page_index in page_list:
            new_pdf.insert_pdf(pdf_doc, from_page=page_index, to_page=page_index)

        if output_folder is None or len(output_folder) == 0:
            output_folder = r"./data/emea_ar/output/pdf_part/"
        os.makedirs(output_folder, exist_ok=True)
        new_pdf.save(os.path.join(output_folder, f"{pdf_file_pure_name}_part.pdf"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _fund_names_match(similarity, audit_name, audit_tokens, verify_name):
    """
    Tell whether an audited fund name and an extracted fund name refer to the same fund.

    Names match when they are equal, or when the score returned by
    similarity.jaccard_similarity for their lower-cased, whitespace-split
    tokens is above 0.8.
    """
    if audit_name == verify_name:
        return True
    verify_tokens = verify_name.lower().split()
    return similarity.jaccard_similarity(audit_tokens, verify_tokens) > 0.8


def calculate_metrics_based_audit_file(is_strict: bool = False):
    """
    Score the extracted prospectus data against the audited file and write the metrics to Excel.

    For every document in the audit file, each audited fund row is paired with
    the first extracted row of the same document whose fund name matches
    (exact match or token similarity above 0.8). Each data point of a pair is
    handed to get_gt_pred_by_compare_values, which is defined elsewhere in this
    file (presumably it appends one label to each of the two lists - confirm).
    Audited funds with no extracted counterpart count as misses for every
    non-empty audited value.

    :param is_strict: bool, when True, extracted funds that have no audited
        counterpart are additionally counted as false positives for every
        non-empty extracted value. It also selects the output file suffix
        ("_revised_strict" or "_revised_not_strict").
    """
    print("Start to calculate metrics based on audit file and verify file...")
    audit_file_path = (
        r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx"
    )
    audit_data_sheets = ["Mayank - revised ", "Prathamesh - Revised"]
    # Data points that are scored; shared by the audit and the verify files.
    data_points = [
        "management_fee_and_costs",
        "management_fee",
        "performance_fee",
        "performance_fee_costs",
        "buy_spread",
        "sell_spread",
        "minimum_initial_investment",
        "recoverable_expenses",
        "indirect_costs",
    ]
    audit_fields = ["doc_id", "fund_name"] + data_points
    audit_data_list = []
    for audit_data_sheet in audit_data_sheets:
        sub_audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet)
        audit_data_list.append(sub_audit_data_df[audit_fields])
    audit_data_df = pd.concat(audit_data_list, ignore_index=True)
    audit_data_df = audit_data_df.drop_duplicates()
    # Empty cells become "" so that str(value) is empty for missing values.
    audit_data_df.fillna("", inplace=True)
    audit_data_df.reset_index(drop=True, inplace=True)

    verify_file_path = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250205134704.xlsx"
    verify_data_sheet = "total_data"
    verify_fields = ["DocumentId", "raw_fund_name"] + data_points
    verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet)
    verify_data_df = verify_data_df[verify_fields]
    verify_data_df = verify_data_df.drop_duplicates()
    verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id", "raw_fund_name": "fund_name"})
    verify_data_df.fillna("", inplace=True)
    verify_data_df.reset_index(drop=True, inplace=True)

    if len(audit_data_df) == 0 or len(verify_data_df) == 0:
        print("No data to calculate metrics.")
        return

    # Calculate metrics
    # One ground-truth / prediction label list per data point.
    gt_lists = {data_point: [] for data_point in data_points}
    pred_lists = {data_point: [] for data_point in data_points}

    document_id_list = audit_data_df["doc_id"].unique().tolist()

    print(f"Total document count: {len(document_id_list)}")
    print("Construct ground truth and prediction data...")
    similarity = Similarity()
    for document_id in document_id_list:
        doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id]
        doc_verify_data = verify_data_df[verify_data_df["doc_id"] == document_id]

        for _, row in doc_audit_data.iterrows():
            fund_name = row["fund_name"]
            fund_name_split = fund_name.lower().split()
            audit_values = {data_point: str(row[data_point]) for data_point in data_points}

            # Find the first extracted row whose fund name matches.
            matched_row = None
            for _, r in doc_verify_data.iterrows():
                if _fund_names_match(similarity, fund_name, fund_name_split, r["fund_name"]):
                    matched_row = r
                    break

            if matched_row is not None:
                for data_point in data_points:
                    get_gt_pred_by_compare_values(
                        audit_values[data_point],
                        str(matched_row[data_point]),
                        gt_lists[data_point],
                        pred_lists[data_point],
                    )
            else:
                # The fund was not extracted at all: every audited value is a miss.
                for data_point in data_points:
                    if len(audit_values[data_point]) > 0:
                        gt_lists[data_point].append(1)
                        pred_lists[data_point].append(0)

        if is_strict:
            for _, r in doc_verify_data.iterrows():
                v_fund_name = r["fund_name"]
                find_flag = False
                for _, row in doc_audit_data.iterrows():
                    fund_name = row["fund_name"]
                    if fund_name == v_fund_name or _fund_names_match(
                        similarity, fund_name, fund_name.lower().split(), v_fund_name
                    ):
                        find_flag = True
                        break
                if not find_flag:
                    # Extracted fund unknown to the audit: every value is a false positive.
                    for data_point in data_points:
                        if len(str(r[data_point])) > 0:
                            gt_lists[data_point].append(0)
                            pred_lists[data_point].append(1)

    # calculate metrics
    print("Calculate metrics...")
    metrics_data = []
    for data_point in data_points:
        gt_labels = gt_lists[data_point]
        pred_labels = pred_lists[data_point]
        metrics_data.append(
            {
                "item": data_point,
                "precision": precision_score(gt_labels, pred_labels),
                "recall": recall_score(gt_labels, pred_labels),
                "f1": f1_score(gt_labels, pred_labels),
                "accuracy": accuracy_score(gt_labels, pred_labels),
                # Each row uses its own support; the original reported the
                # sell_spread support on the buy_spread row as well.
                "support": sum(gt_labels),
            }
        )

    metrics_data_df = pd.DataFrame(metrics_data)
    average_precision = metrics_data_df["precision"].mean()
    average_recall = metrics_data_df["recall"].mean()
    average_f1 = metrics_data_df["f1"].mean()
    average_accuracy = metrics_data_df["accuracy"].mean()
    sum_support = metrics_data_df["support"].sum()
    metrics_data.append({"item": "average_score", "precision": average_precision, "recall": average_recall, "f1": average_f1, "accuracy": average_accuracy, "support": sum_support})
    metrics_data_df = pd.DataFrame(metrics_data)
    metrics_data_df = metrics_data_df[['item', 'f1', 'precision', 'recall', 'accuracy', 'support']]

    # output metrics data to Excel file
    print("Output metrics data to Excel file...")
    output_folder = r"/data/aus_prospectus/output/metrics_data/"
    os.makedirs(output_folder, exist_ok=True)
    verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
    if is_strict:
        verify_file_name = f"metrics_{verify_file_name}_revised_strict.xlsx"
    else:
        verify_file_name = f"metrics_{verify_file_name}_revised_not_strict.xlsx"
    output_file = os.path.join(output_folder, verify_file_name)
    with pd.ExcelWriter(output_file) as writer:
        metrics_data_df.to_excel(writer, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_comparison_frame(file_path: str, sheet_name: str, fields: list) -> pd.DataFrame:
    """
    Read one Excel sheet, keep only ``fields``, drop duplicated rows,
    replace missing values with "" and renumber the index from 0.
    """
    data_df = pd.read_excel(file_path, sheet_name=sheet_name)
    data_df = data_df[fields].drop_duplicates()
    data_df = data_df.fillna("")
    return data_df.reset_index(drop=True)


def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx",
                                         audit_data_sheet: str = "Sheet1",
                                         verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
                                         verify_data_sheet: str = "total_data",
                                         verify_document_list_file: str = None,
                                         is_for_all: bool = False,
                                         zero_equal_none: bool = False
                                         ):
    """
    Compare audited (ground truth) data with extracted (verify) data per
    document and share class, then write metrics and comparison messages
    to an Excel file.

    For every doc_id found in the verify sheet, each audit row is matched to
    the first verify row with the same sec_id. Each data point of the pair is
    handed to get_gt_pred_by_compare_values, which fills the ground truth and
    prediction lists that precision / recall / f1 / accuracy are computed
    from. The "support" column is the sum of the ground truth list.

    Args:
        audit_file_path: Excel file holding the audited values.
        audit_data_sheet: sheet name to read from audit_file_path.
        verify_file_path: Excel file holding the extracted values.
        verify_data_sheet: sheet name to read from verify_file_path.
        verify_document_list_file: optional text file with one integer doc_id
            per line; when it yields at least one id, only those documents
            are evaluated. Blank lines are ignored.
        is_for_all: when True, the five extended data points
            (performance_fee_costs, interposed_vehicle_performance_fee_cost,
            buy_spread, sell_spread, total_annual_dollar_based_charges) are
            evaluated as well and "_all" is added to the output file name.
        zero_equal_none: forwarded to get_gt_pred_by_compare_values; when
            True, "_zero_equal_none" is added to the output file name.

    Returns:
        None. The result is written to
        /data/aus_prospectus/output/metrics_data/ with the sheets
        "metrics_data" and "message_data". Nothing is written when either
        input sheet is empty or when no audit row matches a verify row.
    """
    print("Start to calculate metrics based on DB data file and extracted file...")

    # Data points that are always evaluated.
    core_data_points = [
        "management_fee_and_costs",
        "management_fee",
        "administration_fees",
        "minimum_initial_investment",
        "benchmark_name",
    ]
    # Data points that are only evaluated when is_for_all is True.
    # withdrawal_fee, switching_fee and activity_fee are currently not evaluated.
    extended_data_points = [
        "performance_fee_costs",
        "interposed_vehicle_performance_fee_cost",
        "buy_spread",
        "sell_spread",
        "total_annual_dollar_based_charges",
    ]
    all_data_points = core_data_points + extended_data_points
    data_points = all_data_points if is_for_all else core_data_points

    # Both sheets must provide every data point column, even when
    # is_for_all is False, because the columns are selected up front.
    audit_fields = ["doc_id", "fund_name", "fund_id", "sec_name", "sec_id"] + all_data_points
    verify_fields = ["doc_id", "raw_fund_name", "fund_id", "fund_name",
                     "raw_share_name", "sec_id", "sec_name"] + all_data_points

    audit_data_df = _load_comparison_frame(audit_file_path, audit_data_sheet, audit_fields)
    verify_data_df = _load_comparison_frame(verify_file_path, verify_data_sheet, verify_fields)

    if len(audit_data_df) == 0 or len(verify_data_df) == 0:
        print("No data to calculate metrics.")
        return

    if verify_document_list_file is not None:
        with open(verify_document_list_file, "r", encoding="utf-8") as f:
            # skip blank lines (e.g. a trailing empty line) instead of failing on int("")
            verify_document_list = [int(line.strip()) for line in f if line.strip()]
        if len(verify_document_list) > 0:
            verify_data_df = verify_data_df[verify_data_df["doc_id"].isin(verify_document_list)]
    document_id_list = verify_data_df["doc_id"].unique().tolist()

    print(f"Total document count: {len(document_id_list)}")
    print("Construct ground truth and prediction data...")
    # One ground truth list and one prediction list per evaluated data point;
    # they are passed to get_gt_pred_by_compare_values for every compared pair.
    gt_lists = {data_point: [] for data_point in data_points}
    pred_lists = {data_point: [] for data_point in data_points}
    message_list = []
    for document_id in document_id_list:
        doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id]
        audit_sec_id_list = [doc_sec_id for doc_sec_id
                             in doc_audit_data["sec_id"].unique().tolist()
                             if len(doc_sec_id) > 0]
        # verify rows of this document whose sec_id also appears in the audit data
        doc_verify_data = verify_data_df[(verify_data_df["doc_id"] == document_id)
                                         & (verify_data_df["sec_id"].isin(audit_sec_id_list))]
        for _, row in doc_audit_data.iterrows():
            fund_name = row["fund_name"]
            sec_id = row["sec_id"]

            # use the first verify row with the same sec_id; skip the audit row if there is none
            doc_verify_sec_data = doc_verify_data[doc_verify_data["sec_id"] == sec_id]
            if len(doc_verify_sec_data) == 0:
                continue
            doc_verify_sec_row = doc_verify_sec_data.iloc[0]
            raw_fund_name = doc_verify_sec_row["raw_fund_name"]
            raw_share_name = doc_verify_sec_row["raw_share_name"]

            for data_point in data_points:
                compare_kwargs = {"zero_equal_none": zero_equal_none}
                # NOTE(review): only the core data points are passed as data_point=;
                # the extended ones rely on the helper's default, as before - confirm this is intended.
                if data_point in core_data_points:
                    compare_kwargs["data_point"] = data_point
                message = get_gt_pred_by_compare_values(str(row[data_point]),
                                                        str(doc_verify_sec_row[data_point]),
                                                        gt_lists[data_point],
                                                        pred_lists[data_point],
                                                        **compare_kwargs)
                message_list.append(generate_message(message, document_id, sec_id, fund_name,
                                                     raw_fund_name, raw_share_name, data_point))

    if not message_list:
        # no audit row could be matched to a verify row, so there is nothing to score
        print("No data to calculate metrics.")
        return

    message_data_df = pd.DataFrame(message_list)
    message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name',
                                       'raw_share_name', 'data_point', 'gt_value', 'pred_value', 'error']]
    # order by doc_id, raw_share_name, data_point
    message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_share_name', 'data_point'])
    message_data_df.reset_index(drop=True, inplace=True)

    # calculate metrics
    print("Calculate metrics...")
    metrics_data = []
    for data_point in data_points:
        gt_list = gt_lists[data_point]
        pred_list = pred_lists[data_point]
        metrics_data.append({"item": data_point,
                             "precision": precision_score(gt_list, pred_list),
                             "recall": recall_score(gt_list, pred_list),
                             "f1": f1_score(gt_list, pred_list),
                             "accuracy": accuracy_score(gt_list, pred_list),
                             # every data point reports the sum of its own ground truth list
                             "support": sum(gt_list)})

    # the summary row holds the mean of each score and the total support
    metrics_data_df = pd.DataFrame(metrics_data)
    metrics_data.append({"item": "average_score",
                         "precision": metrics_data_df["precision"].mean(),
                         "recall": metrics_data_df["recall"].mean(),
                         "f1": metrics_data_df["f1"].mean(),
                         "accuracy": metrics_data_df["accuracy"].mean(),
                         "support": metrics_data_df["support"].sum()})
    metrics_data_df = pd.DataFrame(metrics_data)
    metrics_data_df = metrics_data_df[['item', 'f1', 'precision', 'recall', 'accuracy', 'support']]

    # output metrics data to Excel file
    print("Output metrics data to Excel file...")
    output_folder = r"/data/aus_prospectus/output/metrics_data/"
    os.makedirs(output_folder, exist_ok=True)
    verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
    if is_for_all:
        verify_file_name = f"{verify_file_name}_all"
    if zero_equal_none:
        verify_file_name = f"{verify_file_name}_zero_equal_none"
    # NOTE(review): the "4_dps_not_strict" suffix is fixed and does not reflect
    # the number of evaluated data points; kept so existing file names stay stable.
    metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx"
    output_file = os.path.join(output_folder, metrics_file_name)
    with pd.ExcelWriter(output_file) as writer:
        metrics_data_df.to_excel(writer, index=False, sheet_name="metrics_data")
        message_data_df.to_excel(writer, index=False, sheet_name="message_data")
|
|
|
|
|
|
|
|
|
|
|
2025-03-07 21:02:12 +00:00
|
|
|
def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx",
                                  audit_data_sheet: str = "Sheet1",
                                  verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
                                  verify_data_sheet: str = "total_data",
                                  verify_document_list_file: str = None,
                                  is_for_all: bool = False
                                  ):
    """
    Calculate per-provider precision/recall/F1/accuracy by comparing the audited
    (ground truth) Excel sheet with the extracted (verify) Excel sheet, and write
    the result to an Excel file under /data/aus_prospectus/output/metrics_data/.

    Rows are matched per document on "sec_id"; only the first verify row of a
    share class is compared. Audit rows without a matching verify row are ignored.

    Args:
        audit_file_path: Excel file holding the audited ground truth data.
        audit_data_sheet: sheet name to read from the audit file.
        verify_file_path: Excel file holding the extracted data to verify.
        verify_data_sheet: sheet name to read from the verify file.
        verify_document_list_file: optional text file with one document id per
            line; when given (and non-empty) only these documents are evaluated.
        is_for_all: when True, the five additional data points (performance fee
            costs, interposed vehicle performance fee cost, buy/sell spread and
            total annual dollar based charges) are evaluated as well.

    Returns:
        None. The metrics sheet ("metrics_data") and the per-value comparison
        log ("message_data") are written to the output Excel file.
    """
    print("Start to calculate metrics based on DB data file and extracted file...")

    base_data_points = [
        "management_fee_and_costs",
        "management_fee",
        "administration_fees",
        "minimum_initial_investment",
        "benchmark_name",
    ]
    extra_data_points = [
        "performance_fee_costs",
        "interposed_vehicle_performance_fee_cost",
        "buy_spread",
        "sell_spread",
        "total_annual_dollar_based_charges",
    ]
    # Both files are always expected to carry all ten columns; is_for_all only
    # decides which of them are evaluated.
    all_data_points = base_data_points + extra_data_points
    data_points = all_data_points if is_for_all else base_data_points

    audit_fields = [
        "DocumentId",
        "provider_id",
        "provider_name",
        "FundLegalName",
        "FundId",
        "FundClassLegalName",
        "FundClassId",
    ] + all_data_points
    audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet)
    audit_data_df = audit_data_df[audit_fields]
    audit_data_df = audit_data_df.drop_duplicates()
    audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id",
                                                  "FundLegalName": "fund_name",
                                                  "FundId": "fund_id",
                                                  "FundClassLegalName": "sec_name",
                                                  "FundClassId": "sec_id"})
    audit_data_df.fillna("", inplace=True)
    audit_data_df.reset_index(drop=True, inplace=True)

    verify_fields = [
        "doc_id",
        "raw_fund_name",
        "fund_id",
        "fund_name",
        "raw_share_name",
        "sec_id",
        "sec_name",
    ] + all_data_points
    verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet)
    verify_data_df = verify_data_df[verify_fields]
    verify_data_df = verify_data_df.drop_duplicates()
    verify_data_df.fillna("", inplace=True)
    verify_data_df.reset_index(drop=True, inplace=True)

    if len(audit_data_df) == 0 or len(verify_data_df) == 0:
        print("No data to calculate metrics.")
        return

    # Optionally restrict the evaluation to an explicit list of documents
    if verify_document_list_file is not None:
        with open(verify_document_list_file, "r", encoding="utf-8") as f:
            # blank lines (e.g. a trailing newline) are skipped instead of crashing int()
            verify_document_list = [int(line.strip()) for line in f if line.strip()]
        if len(verify_document_list) > 0:
            verify_data_df = verify_data_df[verify_data_df["doc_id"].isin(verify_document_list)]
    document_id_list = verify_data_df["doc_id"].unique().tolist()

    print(f"Total document count: {len(document_id_list)}")
    print("Construct ground truth and prediction data...")
    message_list = []
    provider_gt_pred_data = {}
    for document_id in document_id_list:
        doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id]
        if doc_audit_data.empty:
            # Without audit rows there is no provider and nothing to compare against
            print(f"No audit data found for document {document_id}, skip it.")
            continue
        provider_id = doc_audit_data["provider_id"].iloc[0]
        provider_name = doc_audit_data["provider_name"].iloc[0]
        if provider_id not in provider_gt_pred_data:
            # Create the gt/pred lists once per provider so that several documents
            # of the same provider accumulate into the same lists.
            provider_entry = {"provider_name": provider_name}
            for data_point in data_points:
                provider_entry[f"gt_{data_point}_list"] = []
                provider_entry[f"pred_{data_point}_list"] = []
            provider_gt_pred_data[provider_id] = provider_entry
        provider_entry = provider_gt_pred_data[provider_id]

        audit_sec_id_list = [doc_sec_id for doc_sec_id
                             in doc_audit_data["sec_id"].unique().tolist()
                             if len(doc_sec_id) > 0]
        # get doc_verify_data which doc_id is same as document_id and sec_id in audit_sec_id_list
        doc_verify_data = verify_data_df[(verify_data_df["doc_id"] == document_id)
                                         & (verify_data_df["sec_id"].isin(audit_sec_id_list))]
        for _, audit_row in doc_audit_data.iterrows():
            fund_name = audit_row["fund_name"]
            sec_id = audit_row["sec_id"]

            # get the first row which sec_id in doc_verify_data is same as sec_id
            doc_verify_sec_data = doc_verify_data[doc_verify_data["sec_id"] == sec_id]
            if len(doc_verify_sec_data) == 0:
                continue
            doc_verify_sec_row = doc_verify_sec_data.iloc[0]
            raw_fund_name = doc_verify_sec_row["raw_fund_name"]
            raw_share_name = doc_verify_sec_row["raw_share_name"]

            for data_point in data_points:
                message = get_gt_pred_by_compare_values(str(audit_row[data_point]),
                                                        str(doc_verify_sec_row[data_point]),
                                                        provider_entry[f"gt_{data_point}_list"],
                                                        provider_entry[f"pred_{data_point}_list"],
                                                        data_point=data_point)
                message_list.append(generate_message(message, document_id, sec_id, fund_name,
                                                     raw_fund_name, raw_share_name, data_point))

    # Passing the columns explicitly keeps the frame well-formed even if nothing was compared
    message_columns = ['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name',
                       'raw_share_name', 'data_point', 'gt_value', 'pred_value', 'error']
    message_data_df = pd.DataFrame(message_list, columns=message_columns)
    # order by doc_id, raw_share_name, data_point
    message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_share_name', 'data_point'])
    message_data_df.reset_index(drop=True, inplace=True)

    # calculate metrics
    print("Calculate metrics...")
    metrics_columns = ["provider_id", "provider_name", "item", "f1", "precision", "recall", "accuracy", "support"]
    provider_metrics_list = []
    for provider_id, gt_pred_data in provider_gt_pred_data.items():
        provider_name = gt_pred_data["provider_name"]
        metrics_data = []
        for data_point in data_points:
            gt_list = gt_pred_data[f"gt_{data_point}_list"]
            pred_list = gt_pred_data[f"pred_{data_point}_list"]
            metrics_data.append({"provider_id": provider_id,
                                 "provider_name": provider_name,
                                 "item": data_point,
                                 "precision": precision_score(gt_list, pred_list),
                                 "recall": recall_score(gt_list, pred_list),
                                 "f1": f1_score(gt_list, pred_list),
                                 "accuracy": accuracy_score(gt_list, pred_list),
                                 # support = number of positive ground truth entries
                                 "support": sum(gt_list)})

        # Append an unweighted average over all evaluated data points of this provider
        metrics_data_df = pd.DataFrame(metrics_data)
        metrics_data.append({"provider_id": provider_id,
                             "provider_name": provider_name,
                             "item": "average_score",
                             "precision": metrics_data_df["precision"].mean(),
                             "recall": metrics_data_df["recall"].mean(),
                             "f1": metrics_data_df["f1"].mean(),
                             "accuracy": metrics_data_df["accuracy"].mean(),
                             "support": metrics_data_df["support"].sum()})
        metrics_data_df = pd.DataFrame(metrics_data)
        metrics_data_df = metrics_data_df[metrics_columns]
        provider_metrics_list.append(metrics_data_df)

    if not provider_metrics_list:
        # pd.concat raises on an empty list; there is simply nothing to report
        print("No data to calculate metrics.")
        return

    all_provider_metrics_df = pd.concat(provider_metrics_list)
    all_provider_metrics_df.reset_index(drop=True, inplace=True)

    # output metrics data to Excel file
    print("Output metrics data to Excel file...")
    output_folder = r"/data/aus_prospectus/output/metrics_data/"
    os.makedirs(output_folder, exist_ok=True)
    verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
    if is_for_all:
        verify_file_name = f"{verify_file_name}_all"
    metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_for_providers.xlsx"
    output_file = os.path.join(output_folder, metrics_file_name)
    with pd.ExcelWriter(output_file) as writer:
        all_provider_metrics_df.to_excel(writer, index=False, sheet_name="metrics_data")
        message_data_df.to_excel(writer, index=False, sheet_name="message_data")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-03-09 03:40:00 +00:00
|
|
|
def generate_message(message: dict,
                     doc_id: str,
                     sec_id: str,
                     fund_legal_name: str,
                     raw_fund_name: str,
                     raw_share_name: str,
                     datapoint: str):
    """
    Attach the identifying context of a comparison to its message dict.

    The passed-in dict is modified in place and also returned. The document id
    is always stored as a string.
    """
    message.update(
        data_point=datapoint,
        fund_legal_name=fund_legal_name,
        raw_fund_name=raw_fund_name,
        raw_share_name=raw_share_name,
        sec_id=sec_id,
        doc_id=str(doc_id),
    )
    return message
|
|
|
|
|
|
|
|
|
|
|
2025-03-13 04:45:52 +00:00
|
|
|
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = "", zero_equal_none: bool = False):
    """
    Compare one ground truth value with one predicted value and record the
    outcome as binary labels for the metric calculation.

    Entries are appended to gt_list / pred_list in place:
      - ground truth present, prediction equal:      gt 1, pred 1
      - ground truth present, prediction different:  gt 1, pred 0, plus an
        additional gt 0, pred 1 pair when the prediction is non-empty
      - ground truth empty, prediction non-empty:    gt 0, pred 1
        (gt 1, pred 1 instead when zero_equal_none is set and a numeric
        prediction rounds to 0)
      - both empty: nothing is appended

    Args:
        gt_value: ground truth value; None or blank means "no value".
        pred_value: predicted value; None or blank means "no value".
        gt_list: list receiving the ground truth labels (modified in place).
        pred_list: list receiving the prediction labels (modified in place).
        data_point: data point name; "benchmark_name" is treated as text, every
            other data point as a number.
        zero_equal_none: when True, a numeric 0 is considered equal to an empty value.

    Returns:
        dict with the original "gt_value", "pred_value" and an "error"
        description (empty string when no mismatch was recorded).
    """
    message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
    has_gt = gt_value is not None and len(str(gt_value).strip()) > 0
    has_pred = pred_value is not None and len(str(pred_value).strip()) > 0

    if has_gt:
        gt_list.append(1)
        if is_equal(gt_value, pred_value, data_point, zero_equal_none=zero_equal_none):
            pred_list.append(1)
        else:
            pred_list.append(0)
            message["error"] = "pred_value is not equal to gt_value"
            if has_pred:
                # A wrong, non-empty prediction is additionally recorded as a false positive
                pred_list.append(1)
                gt_list.append(0)
        return message

    if not has_pred:
        # Neither side carries a value: nothing to record
        return message

    if data_point != "benchmark_name":
        try:
            # round to 4 decimal places
            pred_num = round(float(pred_value), 4)
        except (TypeError, ValueError):
            # NOTE(review): a non-numeric prediction for a numeric data point is
            # dropped here without being counted (existing behaviour, kept as is)
            # - confirm whether it should be a false positive instead.
            return message
        if zero_equal_none and pred_num == 0:
            gt_list.append(1)
            pred_list.append(1)
            return message

    gt_list.append(0)
    pred_list.append(1)
    message["error"] = "gt_value is empty, but pred_value is not empty"
    return message
|
|
|
|
|
|
|
|
|
|
|
2025-03-13 04:45:52 +00:00
|
|
|
def _round_if_numeric(value, ndigits: int = 4):
    """Return value converted to float and rounded to ndigits, or value unchanged if it is not numeric."""
    try:
        return round(float(value), ndigits)
    except (TypeError, ValueError):
        return value


def is_equal(gt_value, pred_value, data_point: str = "", zero_equal_none: bool = False):
    """
    Decide whether a predicted value matches the ground truth value.

    Numeric data points (everything except "benchmark_name") are compared after
    rounding to 4 decimal places; values that cannot be parsed as numbers are
    compared as they are. Benchmark names match when they are identical, when
    one cleaned name contains the other, or when the Jaccard similarity of
    their lower-cased word lists is above 0.8.

    Args:
        gt_value: ground truth value; None or blank never matches anything.
        pred_value: predicted value; None or blank means "no prediction".
        data_point: data point name, selects numeric or benchmark comparison.
        zero_equal_none: when True, a numeric ground truth of 0 matches an
            empty prediction.

    Returns:
        True when the values are considered equal, otherwise False.
    """
    if gt_value is None or len(str(gt_value).strip()) == 0:
        return False

    is_benchmark = data_point == "benchmark_name"

    if pred_value is None or len(str(pred_value).strip()) == 0:
        # No prediction: only a numeric ground truth of 0 may still count as equal
        if not is_benchmark and zero_equal_none and _round_if_numeric(gt_value) == 0:
            return True
        return False

    if gt_value == "0.0":
        gt_value = "0"
    if pred_value == "0.0":
        pred_value = "0"

    if not is_benchmark:
        # round to 4 decimal places so formatting differences do not matter
        gt_value = _round_if_numeric(gt_value)
        pred_value = _round_if_numeric(pred_value)

    if gt_value == pred_value:
        return True

    if is_benchmark:
        gt_value = clean_text(gt_value)
        pred_value = clean_text(pred_value)
        if gt_value == pred_value or gt_value in pred_value or pred_value in gt_value:
            return True
        similarity = Similarity()
        jaccard_score = similarity.jaccard_similarity(gt_value.lower().split(), pred_value.lower().split())
        if jaccard_score > 0.8:
            return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_text(text: str):
    """Replace non-word characters with spaces and collapse whitespace runs.

    Args:
        text: Text to clean. ``None`` and the empty string are returned
            unchanged. Non-string values (for example numeric spreadsheet
            cells) are converted with ``str()`` before cleaning instead of
            raising ``TypeError``.

    Returns:
        The cleaned text. Leading and trailing spaces produced by the
        replacement are kept, not stripped.
    """
    if text is None:
        return text
    if not isinstance(text, str):
        text = str(text)
    if len(text) == 0:
        return text
    # Every character outside [A-Za-z0-9_] (Unicode-aware) becomes a space ...
    text = re.sub(r"\W", " ", text)
    # ... and runs of whitespace are squeezed to a single space.
    text = re.sub(r"\s+", " ", text)
    return text
|
2025-03-11 22:15:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_inference_data(
    file1: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi.xlsx",
    file2: str = r"/data/aus_prospectus/output/merged_data/docs/excel/merged_420339794.xlsx",
    output_file: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi_modified.xlsx",
    replaced_doc_id: int = 420339794,
):
    """Replace one document's rows in a mapping workbook with re-run results.

    Reads sheet ``total_mapping_data`` from ``file1`` and drops the rows whose
    ``doc_id`` equals ``replaced_doc_id``. It then appends the rows of sheet
    ``merged_data`` from ``file2`` and writes the result to ``output_file``
    (sheet ``total_mapping_data``). Both inputs are restricted to the same
    fixed column list.

    Args:
        file1: Path of the workbook holding the full mapping data.
        file2: Path of the workbook holding the replacement rows.
        output_file: Path of the workbook to write.
        replaced_doc_id: ``doc_id`` whose rows are removed from ``file1``
            before the rows of ``file2`` are appended.

    Raises:
        KeyError: If either input sheet lacks one of the expected columns.
    """
    columns = [
        "doc_id",
        "effective_date",
        "raw_fund_name",
        "raw_share_name",
        "raw_name",
        "fund_id",
        "fund_name",
        "sec_id",
        "sec_name",
        "page_index",
        "management_fee_and_costs",
        "management_fee",
        "administration_fees",
        "performance_fee_costs",
        "interposed_vehicle_performance_fee_cost",
        "buy_spread",
        "sell_spread",
        "total_annual_dollar_based_charges",
        "minimum_initial_investment",
        "benchmark_name",
        "indirect_costs",
        "recoverable_expenses",
        "change_recoverable_expenses",
    ]

    file1_data_df = pd.read_excel(file1, sheet_name="total_mapping_data")
    file1_data_df = file1_data_df[columns]
    # Remove the rows of the document that file2 is going to replace.
    file1_data_df = file1_data_df[file1_data_df["doc_id"] != replaced_doc_id]

    file2_data_df = pd.read_excel(file2, sheet_name="merged_data")
    file2_data_df = file2_data_df[columns]

    # ignore_index gives the combined frame a fresh 0..n-1 index.
    total_data_df = pd.concat([file1_data_df, file2_data_df], ignore_index=True)

    # Make sure the destination folder exists before opening the writer.
    output_folder = os.path.dirname(output_file)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    with pd.ExcelWriter(output_file) as f:
        total_data_df.to_excel(f, index=False, sheet_name="total_mapping_data")
|
|
|
|
|
|
2025-03-05 15:57:02 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point. merge_inference_data() is the only active call;
    # the other invocations below are commented out.
    merge_inference_data()
    # adjust_column_order()
    # set_mapping_to_data_side_documents_data()

    # source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx"
    # target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
    # adjust_data_file(source_file=source_file, targe_file=target_file)

    # audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx"
    # audit_data_sheet: str = "Sheet1"
    # verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx"
    # verify_data_sheet: str = "total_data"

    # Settings for the metric runs further down. Those calls are currently
    # commented out, so nothing in this block reads these values.
    audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
    audit_data_sheet: str = "Sheet1"
    verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313024715.xlsx"
    verify_data_sheet: str = "total_mapping_data"
    # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt"
    # One entry per metric run in the commented-out loops below.
    verify_document_list_file_list = [None,
                                      "./sample_documents/aus_prospectus_29_documents_sample.txt",
                                      "./sample_documents/aus_prospectus_17_documents_sample.txt"]
    zero_equal_none = False
    is_for_all = True
    # for verify_document_list_file in verify_document_list_file_list:
    #     calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
    #                                          audit_data_sheet=audit_data_sheet,
    #                                          verify_file_path=verify_file_path,
    #                                          verify_data_sheet=verify_data_sheet,
    #                                          verify_document_list_file = verify_document_list_file,
    #                                          is_for_all=is_for_all,
    #                                          zero_equal_none=zero_equal_none)

    # for verify_document_list_file in verify_document_list_file_list:
    #     calculate_metrics_by_provider(audit_file_path=audit_file_path,
    #                                   audit_data_sheet=audit_data_sheet,
    #                                   verify_file_path=verify_file_path,
    #                                   verify_data_sheet=verify_data_sheet,
    #                                   verify_document_list_file = verify_document_list_file,
    #                                   is_for_all=is_for_all)

    # set_mapping_to_17_documents_data()
    # set_mapping_to_ravi_data()

    # calculate_metrics_based_audit_file(is_strict=True)
    # calculate_metrics_based_audit_file(is_strict=False)
    # remove_ter_ogc_performance_fee_annotation()
    # batch_run_documents()
    # transform_pdf_2_image()
    # ground_truth_file = "./test_metrics/ground_truth.xlsx"
    # prediction_file = "./test_metrics/prediction.xlsx"
    # calc_metrics(ground_truth_file, prediction_file)

    # pdf_file = r"./data/emea_ar/pdf/532438210.pdf"
    # page_list = [25, 26, 27, 28, 29]
    # output_folder = r"./data/emea_ar/output/pdf_part/"
    # output_part_of_pages(pdf_file, page_list, output_folder)
|