# Reconstructed from git patch cd7e09757daf1fd9a8074223afa2c829bcaf1eb0
# (Blade He, Wed, 5 Mar 2025 09:57:02 -0600, "[PATCH 01/11] check in calc_metrics to repo.").
# The patch creates calc_metrics.py and drops /test_calc_metrics.py from .gitignore.
import os
from time import sleep
import pandas as pd
from glob import glob
from tqdm import tqdm
import numpy as np
from datetime import datetime
import re
import json
import traceback
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import requests
import fitz
from copy import deepcopy
from utils.similarity import Similarity
from core.auz_nz.hybrid_solution_script import final_function_to_match


def _first_values_by_fund_name(frame):
    """Map each fund name in *frame* to the (auum, tor) pair of its first row."""
    first_values = {}
    for name, auum, tor in zip(frame["fund_name"], frame["auum"], frame["tor"]):
        # A missing (NaN) name never compares equal to anything, so it can
        # never be matched; keep it out of the lookup table.
        if pd.isna(name):
            continue
        first_values.setdefault(name, (auum, tor))
    return first_values


def calc_metrics(ground_truth_file: str, prediction_file: str):
    """
    Calculate metrics by comparing ground truth and prediction files.

    Both Excel files must provide the columns "fund_name", "auum" and "tor".
    Rows are paired on an exact "fund_name" match; when a name occurs more
    than once in the other file only its first row is compared.

    :param ground_truth_file: str, path of the ground truth Excel file.
    :param prediction_file: str, path of the prediction Excel file.
    :raises FileNotFoundError: when either file does not exist.
    :return: None, the metrics are printed.
    """
    for path in (ground_truth_file, prediction_file):
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")

    ground_truth_df = pd.read_excel(ground_truth_file)
    prediction_df = pd.read_excel(prediction_file)

    # One dictionary lookup per row replaces the nested row-by-row scan.
    gt_lookup = _first_values_by_fund_name(ground_truth_df)
    pred_lookup = _first_values_by_fund_name(prediction_df)

    gt_auum_list, pred_auum_list = [], []
    gt_tor_list, pred_tor_list = [], []

    # Recall pass: every ground truth row is a positive; the prediction label
    # says whether the prediction file holds the same value for that fund.
    for name, auum, tor in zip(
        ground_truth_df["fund_name"], ground_truth_df["auum"], ground_truth_df["tor"]
    ):
        match = pred_lookup.get(name)
        gt_auum_list.append(1)
        pred_auum_list.append(int(match is not None and bool(auum == match[0])))
        gt_tor_list.append(1)
        pred_tor_list.append(int(match is not None and bool(tor == match[1])))

    # Precision pass: a predicted value that the ground truth does not confirm
    # is a false positive. Confirmed values were already counted above.
    for name, auum, tor in zip(
        prediction_df["fund_name"], prediction_df["auum"], prediction_df["tor"]
    ):
        match = gt_lookup.get(name)
        # `not (a == b)` keeps NaN values counted as a mismatch.
        if match is None or not (auum == match[0]):
            gt_auum_list.append(0)
            pred_auum_list.append(1)
        if match is None or not (tor == match[1]):
            gt_tor_list.append(0)
            pred_tor_list.append(1)

    precision_auum = precision_score(gt_auum_list, pred_auum_list)
    recall_auum = recall_score(gt_auum_list, pred_auum_list)
    f1_auum = f1_score(gt_auum_list, pred_auum_list)
    accuracy_auum = accuracy_score(gt_auum_list, pred_auum_list)

    precision_tor = precision_score(gt_tor_list, pred_tor_list)
    recall_tor = recall_score(gt_tor_list, pred_tor_list)
    f1_tor = f1_score(gt_tor_list, pred_tor_list)
    accuracy_tor = accuracy_score(gt_tor_list, pred_tor_list)

    print(f"AUUM Support: {sum(gt_auum_list)}")
    print(f"F1 AUUM: {f1_auum}")
    print(f"Precision AUUM: {precision_auum}")
    print(f"Recall AUUM: {recall_auum}")
    print(f"Accuracy AUUM: {accuracy_auum}\n")

    print(f"TOR Support: {sum(gt_tor_list)}")
    print(f"F1 TOR: {f1_tor}")
    print(f"Precision TOR: {precision_tor}")
    print(f"Recall TOR: {recall_tor}")
    print(f"Accuracy TOR: {accuracy_tor}")


def transform_pdf_2_image(
    folder: str = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/",
    pdf_file: str = r"Pay_Date_2025-02-14.pdf",
    dpi: int = 300,
):
    """
    Transform pdf to image.

    Every page is rendered to "<pdf name>_<page index>.png" next to the PDF.

    :param folder: str, folder that holds the PDF and receives the images.
    :param pdf_file: str, file name of the PDF inside *folder*.
    :param dpi: int, rendering resolution.
    """
    pdf_path = os.path.join(folder, pdf_file)
    pdf_file_pure_name = pdf_file.replace(".pdf", "")
    pdf_doc = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc.load_page(page_num)
            image = page.get_pixmap(dpi=dpi)
            image.save(os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png"))
    finally:
        # Release the file handle even when rendering fails.
        pdf_doc.close()


def invoke_api_demo(
    doc_id: str = "407881493",
    url: str = "http://127.0.0.1:8080/automation/api/model/emea_ar",
    data_folder: str = r"/data/emea_ar/output/extract_data_by_api/",
    timeout=None,
):
    """
    Post *doc_id* to the extraction API and store the JSON answer on disk.

    The answer is written to "<data_folder>/<doc_id>.json". Any failure
    (network, invalid JSON, file system) is printed and swallowed so that a
    batch run continues with the next document.

    :param doc_id: str, the document id sent as {"doc_id": doc_id}.
    :param url: str, API endpoint.
    :param data_folder: str, folder that receives the JSON file.
    :param timeout: seconds passed to requests.post; None waits indefinitely.
    """
    headers = {"connection": "keep-alive", "content-type": "application/json"}
    data = {
        "doc_id": doc_id,
    }
    print(f"Start to invoke API for document: {doc_id}")
    # url = 'https://internal-ts00006-stg-dcms-gpt-765982576.us-east-1.elb.amazonaws.com/automation/api/model/us_ar'
    try:
        response = requests.post(url, json=data, headers=headers, timeout=timeout)
        print("API response status code: {0}".format(response.status_code))
        json_data = json.loads(response.text)
        print(json_data)
        os.makedirs(data_folder, exist_ok=True)
        json_file = os.path.join(data_folder, f"{doc_id}.json")
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=4)
    except Exception as e:
        # Deliberate best effort: report and carry on.
        print("Meet exception: {0}".format(e))


def batch_run_documents(document_id_list=None):
    """
    Invoke the extraction API once per document id.

    :param document_id_list: iterable of document ids; None runs the
        built-in list of 52 documents.
    """
    if document_id_list is None:
        document_id_list = [
            "292989214",
            "316237292",
            "321733631",
            "323390570",
            "327956364",
            "333207452",
            "334718372",
            "344636875",
            "362246081",
            "366179419",
            "380945052",
            "382366116",
            "387202452",
            "389171486",
            "391456740",
            "391736837",
            "394778487",
            "401684600",
            "402113224",
            "402181770",
            "402397014",
            "405803396",
            "445102363",
            "445256897",
            "448265376",
            "449555622",
            "449623976",
            "458291624",
            "458359181",
            "463081566",
            "469138353",
            "471641628",
            "476492237",
            "478585901",
            "478586066",
            "479042264",
            "479793787",
            "481475385",
            "483617247",
            "486378555",
            "486383912",
            "492121213",
            "497497599",
            "502693599",
            "502821436",
            "503194284",
            "506559375",
            "507967525",
            "508854243",
            "509845549",
            "520879048",
            "529925114",
        ]
    for doc_id in document_id_list:
        invoke_api_demo(doc_id)
"362246081", + "366179419", + "380945052", + "382366116", + "387202452", + "389171486", + "391456740", + "391736837", + "394778487", + "401684600", + "402113224", + "402181770", + "402397014", + "405803396", + "445102363", + "445256897", + "448265376", + "449555622", + "449623976", + "458291624", + "458359181", + "463081566", + "469138353", + "471641628", + "476492237", + "478585901", + "478586066", + "479042264", + "479793787", + "481475385", + "483617247", + "486378555", + "486383912", + "492121213", + "497497599", + "502693599", + "502821436", + "503194284", + "506559375", + "507967525", + "508854243", + "509845549", + "520879048", + "529925114", + ] + for doc_id in document_id_list: + invoke_api_demo(doc_id) + + +def remove_ter_ogc_performance_fee_annotation(): + data_folder = r"/data/emea_ar/output/extract_data_by_api/" + os.makedirs(data_folder, exist_ok=True) + # get all of json files from the folder + json_files = glob(os.path.join(data_folder, "*.json")) + remove_dp_list = ["ter", "ogc", "performance_fee"] + for json_file in json_files: + with open(json_file, "r", encoding="utf-8") as f: + json_data = json.load(f) + annotation_data_list = json_data["annotation_data"] + remove_data_list = [] + for annotation_data in annotation_data_list: + if annotation_data["data_point"] in remove_dp_list: + remove_data_list.append(annotation_data) + if len(remove_data_list) > 0: + for remove_data in remove_data_list: + if remove_data in annotation_data_list: + annotation_data_list.remove(remove_data) + with open(json_file, "w", encoding="utf-8") as f: + json.dump(json_data, f, indent=4) + + +def output_part_of_pages(pdf_file: str, page_list: list, output_folder: str): + """ + Output part of pages from a pdf file to new pdf file. + :param pdf_file: str, the path of the pdf file. + :param page_list: list, the page number list. + :param output_folder: str, the output folder. 
+ """ + pdf_doc = fitz.open(pdf_file) + pdf_file_pure_name = os.path.basename(pdf_file).replace(".pdf", "") + new_pdf = fitz.open() + print(f"output pages: {page_list} for {pdf_file_pure_name}") + for page_index in page_list: + new_pdf.insert_pdf(pdf_doc, from_page=page_index, to_page=page_index) + if output_folder is None or len(output_folder) == 0: + output_folder = r"./data/emea_ar/output/pdf_part/" + os.makedirs(output_folder, exist_ok=True) + new_pdf.save(os.path.join(output_folder, f"{pdf_file_pure_name}_part.pdf")) + + +def calculate_metrics_based_audit_file(is_strict: bool = False): + print("Start to calculate metrics based on audit file and verify file...") + audit_file_path = ( + r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx" + ) + audit_data_sheets = ["Mayank - revised ", "Prathamesh - Revised"] + audit_fields = [ + "doc_id", + "fund_name", + "management_fee_and_costs", + "management_fee", + "performance_fee", + "performance_fee_costs", + "buy_spread", + "sell_spread", + "minimum_initial_investment", + "recoverable_expenses", + "indirect_costs" + ] + audit_data_list = [] + for audit_data_sheet in audit_data_sheets: + sub_audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet) + sub_audit_data_df = sub_audit_data_df[audit_fields] + audit_data_list.append(sub_audit_data_df) + audit_data_df = pd.concat(audit_data_list, ignore_index=True) + audit_data_df = audit_data_df.drop_duplicates() + audit_data_df.fillna("", inplace=True) + audit_data_df.reset_index(drop=True, inplace=True) + + verify_file_path = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250205134704.xlsx" + verify_data_sheet = "total_data" + verify_fields = [ + "DocumentId", + "raw_fund_name", + "management_fee_and_costs", + "management_fee", + "performance_fee", + "performance_fee_costs", + "buy_spread", + "sell_spread", + "minimum_initial_investment", + "recoverable_expenses", + 
"indirect_costs" + ] + verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet) + verify_data_df = verify_data_df[verify_fields] + verify_data_df = verify_data_df.drop_duplicates() + verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id", "raw_fund_name": "fund_name"}) + verify_data_df.fillna("", inplace=True) + verify_data_df.reset_index(drop=True, inplace=True) + + if len(audit_data_df) == 0 or len(verify_data_df) == 0: + print("No data to calculate metrics.") + return + + # Calculate metrics + gt_management_fee_and_costs_list = [] + pred_management_fee_and_costs_list = [] + gt_management_fee_list = [] + pred_management_fee_list = [] + gt_performance_fee_list = [] + pred_performance_fee_list = [] + gt_performance_fee_costs_list = [] + pred_performance_fee_costs_list = [] + gt_buy_spread_list = [] + pred_buy_spread_list = [] + gt_sell_spread_list = [] + pred_sell_spread_list = [] + gt_minimum_initial_investment_list = [] + pred_minimum_initial_investment_list = [] + gt_recoverable_expenses_list = [] + pred_recoverable_expenses_list = [] + gt_indirect_costs_list = [] + pred_indirect_costs_list = [] + + document_id_list = audit_data_df["doc_id"].unique().tolist() + + print(f"Total document count: {len(document_id_list)}") + print("Construct ground truth and prediction data...") + similarity = Similarity() + for document_id in document_id_list: + doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id] + doc_verify_data = verify_data_df[verify_data_df["doc_id"] == document_id] + for index, row in doc_audit_data.iterrows(): + fund_name = row["fund_name"] + fund_name_split = fund_name.lower().split() + management_fee_and_costs = str(row["management_fee_and_costs"]) + management_fee = str(row["management_fee"]) + performance_fee = str(row["performance_fee"]) + performance_fee_costs = str(row["performance_fee_costs"]) + buy_spread = str(row["buy_spread"]) + sell_spread = str(row["sell_spread"]) + 
minimum_initial_investment = str(row["minimum_initial_investment"]) + recoverable_expenses = str(row["recoverable_expenses"]) + indirect_costs = str(row["indirect_costs"]) + find_flag = False + for idx, r in doc_verify_data.iterrows(): + v_fund_name = r["fund_name"] + if fund_name == v_fund_name: + find_flag = True + else: + v_fund_name_split = v_fund_name.lower().split() + name_similarity = similarity.jaccard_similarity(fund_name_split, v_fund_name_split) + if name_similarity > 0.8: + find_flag = True + if find_flag: + v_management_fee_and_costs = str(r["management_fee_and_costs"]) + v_management_fee = str(r["management_fee"]) + v_performance_fee = str(r["performance_fee"]) + v_performance_fee_costs = str(r["performance_fee_costs"]) + v_buy_spread = str(r["buy_spread"]) + v_sell_spread = str(r["sell_spread"]) + v_minimum_initial_investment = str(r["minimum_initial_investment"]) + v_recoverable_expenses = str(r["recoverable_expenses"]) + v_indirect_costs = str(r["indirect_costs"]) + + get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list) + get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) + get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list) + get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) + get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) + get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + get_gt_pred_by_compare_values(recoverable_expenses, v_recoverable_expenses, 
gt_recoverable_expenses_list, pred_recoverable_expenses_list) + get_gt_pred_by_compare_values(indirect_costs, v_indirect_costs, gt_indirect_costs_list, pred_indirect_costs_list) + break + if not find_flag: + if management_fee_and_costs is not None and len(management_fee_and_costs) > 0: + gt_management_fee_and_costs_list.append(1) + pred_management_fee_and_costs_list.append(0) + if management_fee is not None and len(management_fee) > 0: + gt_management_fee_list.append(1) + pred_management_fee_list.append(0) + if performance_fee is not None and len(performance_fee) > 0: + gt_performance_fee_list.append(1) + pred_performance_fee_list.append(0) + if performance_fee_costs is not None and len(performance_fee_costs) > 0: + gt_performance_fee_costs_list.append(1) + pred_performance_fee_costs_list.append(0) + if buy_spread is not None and len(buy_spread) > 0: + gt_buy_spread_list.append(1) + pred_buy_spread_list.append(0) + if sell_spread is not None and len(sell_spread) > 0: + gt_sell_spread_list.append(1) + pred_sell_spread_list.append(0) + if minimum_initial_investment is not None and len(minimum_initial_investment) > 0: + gt_minimum_initial_investment_list.append(1) + pred_minimum_initial_investment_list.append(0) + if recoverable_expenses is not None and len(recoverable_expenses) > 0: + gt_recoverable_expenses_list.append(1) + pred_recoverable_expenses_list.append(0) + if indirect_costs is not None and len(indirect_costs) > 0: + gt_indirect_costs_list.append(1) + pred_indirect_costs_list.append(0) + + if is_strict: + for idx, r in doc_verify_data.iterrows(): + v_fund_name = r["fund_name"] + find_flag = False + for index, row in doc_audit_data.iterrows(): + fund_name = row["fund_name"] + if fund_name == v_fund_name: + find_flag = True + else: + v_fund_name_split = v_fund_name.lower().split() + fund_name_split = fund_name.lower().split() + name_similarity = similarity.jaccard_similarity(fund_name_split, v_fund_name_split) + if name_similarity > 0.8: + find_flag = True + 
if find_flag: + break + if not find_flag: + v_management_fee_and_costs = str(r["management_fee_and_costs"]) + v_management_fee = str(r["management_fee"]) + v_performance_fee = str(r["performance_fee"]) + v_performance_fee_costs = str(r["performance_fee_costs"]) + v_buy_spread = str(r["buy_spread"]) + v_sell_spread = str(r["sell_spread"]) + v_minimum_initial_investment = str(r["minimum_initial_investment"]) + v_recoverable_expenses = str(r["recoverable_expenses"]) + v_indirect_costs = str(r["indirect_costs"]) + + if v_management_fee_and_costs is not None and len(v_management_fee_and_costs) > 0: + gt_management_fee_and_costs_list.append(0) + pred_management_fee_and_costs_list.append(1) + if v_management_fee is not None and len(v_management_fee) > 0: + gt_management_fee_list.append(0) + pred_management_fee_list.append(1) + if v_performance_fee is not None and len(v_performance_fee) > 0: + gt_performance_fee_list.append(0) + pred_performance_fee_list.append(1) + if v_performance_fee_costs is not None and len(v_performance_fee_costs) > 0: + gt_performance_fee_costs_list.append(0) + pred_performance_fee_costs_list.append(1) + if v_buy_spread is not None and len(v_buy_spread) > 0: + gt_buy_spread_list.append(0) + pred_buy_spread_list.append(1) + if v_sell_spread is not None and len(v_sell_spread) > 0: + gt_sell_spread_list.append(0) + pred_sell_spread_list.append(1) + if v_minimum_initial_investment is not None and len(v_minimum_initial_investment) > 0: + gt_minimum_initial_investment_list.append(0) + pred_minimum_initial_investment_list.append(1) + if v_recoverable_expenses is not None and len(v_recoverable_expenses) > 0: + gt_recoverable_expenses_list.append(0) + pred_recoverable_expenses_list.append(1) + if v_indirect_costs is not None and len(v_indirect_costs) > 0: + gt_indirect_costs_list.append(0) + pred_indirect_costs_list.append(1) + + # calculate metrics + print("Calculate metrics...") + precision_management_fee_and_costs = 
precision_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + recall_management_fee_and_costs = recall_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + f1_management_fee_and_costs = f1_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + accuracy_management_fee_and_costs = accuracy_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + support_management_fee_and_costs = sum(gt_management_fee_and_costs_list) + + precision_management_fee = precision_score(gt_management_fee_list, pred_management_fee_list) + recall_management_fee = recall_score(gt_management_fee_list, pred_management_fee_list) + f1_management_fee = f1_score(gt_management_fee_list, pred_management_fee_list) + accuracy_management_fee = accuracy_score(gt_management_fee_list, pred_management_fee_list) + support_management_fee = sum(gt_management_fee_list) + + precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) + recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) + f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) + accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) + support_performance_fee = sum(gt_performance_fee_list) + + precision_performance_fee_costs = precision_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + recall_performance_fee_costs = recall_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + f1_performance_fee_costs = f1_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + accuracy_performance_fee_costs = accuracy_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + support_performance_fee_costs = sum(gt_performance_fee_costs_list) + + precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) + recall_buy_spread = 
recall_score(gt_buy_spread_list, pred_buy_spread_list) + f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) + accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) + support_buy_spread = sum(gt_buy_spread_list) + + precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) + recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) + f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) + accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) + support_buy_spread = sum(gt_sell_spread_list) + + precision_minimum_initial_investment = precision_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + recall_minimum_initial_investment = recall_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + f1_minimum_initial_investment = f1_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + accuracy_minimum_initial_investment = accuracy_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + support_minimum_initial_investment = sum(gt_minimum_initial_investment_list) + + precision_recoverable_expenses = precision_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + recall_recoverable_expenses = recall_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + f1_recoverable_expenses = f1_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + accuracy_recoverable_expenses = accuracy_score(gt_recoverable_expenses_list, pred_recoverable_expenses_list) + support_recoverable_expenses = sum(gt_recoverable_expenses_list) + + precision_indirect_costs = precision_score(gt_indirect_costs_list, pred_indirect_costs_list) + recall_indirect_costs = recall_score(gt_indirect_costs_list, pred_indirect_costs_list) + f1_indirect_costs = f1_score(gt_indirect_costs_list, pred_indirect_costs_list) + accuracy_indirect_costs 
= accuracy_score(gt_indirect_costs_list, pred_indirect_costs_list) + support_indirect_costs = sum(gt_indirect_costs_list) + + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"item": "performance_fee_costs", "precision": precision_performance_fee_costs, "recall": recall_performance_fee_costs, "f1": f1_performance_fee_costs, "accuracy": accuracy_performance_fee_costs, "support": support_performance_fee_costs}, + {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + {"item": "minimum_initial_investment", "precision": precision_minimum_initial_investment, "recall": recall_minimum_initial_investment, "f1": f1_minimum_initial_investment, "accuracy": accuracy_minimum_initial_investment, "support": support_minimum_initial_investment}, + {"item": "recoverable_expenses", "precision": precision_recoverable_expenses, "recall": recall_recoverable_expenses, "f1": f1_recoverable_expenses, "accuracy": accuracy_recoverable_expenses, "support": support_recoverable_expenses}, + {"item": "indirect_costs", "precision": precision_indirect_costs, "recall": 
recall_indirect_costs, "f1": f1_indirect_costs, "accuracy": accuracy_indirect_costs, "support": support_indirect_costs}] + metrics_data_df = pd.DataFrame(metrics_data) + averate_precision = metrics_data_df["precision"].mean() + average_recall = metrics_data_df["recall"].mean() + average_f1 = metrics_data_df["f1"].mean() + average_accuracy = metrics_data_df["accuracy"].mean() + sum_support = metrics_data_df["support"].sum() + metrics_data.append({"item": "average_score", "precision": averate_precision, "recall": average_recall, "f1": average_f1, "accuracy": average_accuracy, "support": sum_support}) + metrics_data_df = pd.DataFrame(metrics_data) + metrics_data_df = metrics_data_df[['item', 'f1', 'precision', 'recall', 'accuracy', 'support']] + + # output metrics data to Excel file + print("Output metrics data to Excel file...") + output_folder = r"/data/aus_prospectus/output/metrics_data/" + os.makedirs(output_folder, exist_ok=True) + verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") + if is_strict: + verify_file_name = f"metrics_{verify_file_name}_revised_strict.xlsx" + else: + verify_file_name = f"metrics_{verify_file_name}_revised_not_strict.xlsx" + output_file = os.path.join(output_folder, verify_file_name) + with pd.ExcelWriter(output_file) as writer: + metrics_data_df.to_excel(writer, index=False) + + +def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx", + audit_data_sheet: str = "Sheet1", + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx", + verify_data_sheet: str = "total_data" + ): + print("Start to calculate metrics based on DB data file and extracted file...") + audit_data_df = pd.DataFrame() + verify_data_df = pd.DataFrame() + + audit_fields = [ + "DocumentId", + "FundLegalName", + "FundId", + "FundClassLegalName", + 
"FundClassId", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee", + "interposed_vehicle_performance_fee_cost", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges" + # "withdrawal_fee", + # "switching_fee", + # "activity_fee", + + ] + audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet) + audit_data_df = audit_data_df[audit_fields] + audit_data_df = audit_data_df.drop_duplicates() + audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id", + "FundLegalName": "fund_name", + "FundId": "fund_id", + "FundClassLegalName": "sec_name", + "FundClassId": "sec_id"}) + audit_data_df.fillna("", inplace=True) + audit_data_df.reset_index(drop=True, inplace=True) + + # verify_file_path = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250205134704.xlsx" + # ravi_verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" + # verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" + verify_fields = [ + "DocumentId", + "raw_fund_name", + "fund_id", + "fund_name", + "raw_share_name", + "sec_id", + "sec_name", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee", + "interposed_vehicle_performance_fee_cost", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges" + # "withdrawal_fee", + # "switching_fee", + # "activity_fee" + ] + verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet) + # ravi_verify_data_df = pd.read_excel(ravi_verify_file_path, sheet_name=verify_data_sheet) + + # only get raw_verify_data_df data which sec_id is equal with sec_id in ravi_verify_data_df + # verify_data_df = 
raw_verify_data_df[raw_verify_data_df["sec_id"].isin(ravi_verify_data_df["sec_id"])] + verify_data_df = verify_data_df[verify_fields] + verify_data_df = verify_data_df.drop_duplicates() + verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) + verify_data_df.fillna("", inplace=True) + verify_data_df.reset_index(drop=True, inplace=True) + + if len(audit_data_df) == 0 or len(verify_data_df) == 0: + print("No data to calculate metrics.") + return + + # Calculate metrics + gt_management_fee_and_costs_list = [] + pred_management_fee_and_costs_list = [] + gt_management_fee_list = [] + pred_management_fee_list = [] + gt_administration_fees_list = [] + pred_administration_fees_list = [] + gt_minimum_initial_investment_list = [] + pred_minimum_initial_investment_list = [] + gt_benchmark_name_list = [] + pred_benchmark_name_list = [] + gt_performance_fee_list = [] + pred_performance_fee_list = [] + gt_interposed_vehicle_performance_fee_cost_list = [] + pred_interposed_vehicle_performance_fee_cost_list = [] + gt_buy_spread_list = [] + pred_buy_spread_list = [] + gt_sell_spread_list = [] + pred_sell_spread_list = [] + gt_total_annual_dollar_based_charges_list = [] + pred_total_annual_dollar_based_charges_list = [] + + # gt_performance_fee_costs_list = [] + # pred_performance_fee_costs_list = [] + # gt_buy_spread_list = [] + # pred_buy_spread_list = [] + # gt_sell_spread_list = [] + # pred_sell_spread_list = [] + # gt_withdrawal_fee_list = [] + # pred_withdrawal_fee_list = [] + # gt_switching_fee_list = [] + # pred_switching_fee_list = [] + # gt_activity_fee_list = [] + # pred_activity_fee_list = [] + + document_id_list = verify_data_df["doc_id"].unique().tolist() + + print(f"Total document count: {len(document_id_list)}") + print("Construct ground truth and prediction data...") + # similarity = Similarity() + message_list = [] + for document_id in document_id_list: + doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id] + audit_sec_id_list = 
[doc_sec_id for doc_sec_id + in doc_audit_data["sec_id"].unique().tolist() + if len(doc_sec_id) > 0] + # get doc_verify_data which doc_id is same as document_id and sec_id in audit_sec_id_list + doc_verify_data = verify_data_df[(verify_data_df["doc_id"] == document_id) & (verify_data_df["sec_id"].isin(audit_sec_id_list))] + for index, row in doc_audit_data.iterrows(): + fund_name = row["fund_name"] + sec_id = row["sec_id"] + management_fee_and_costs = str(row["management_fee_and_costs"]) + management_fee = str(row["management_fee"]) + administration_fees = str(row["administration_fees"]) + minimum_initial_investment = str(row["minimum_initial_investment"]) + benchmark_name = str(row["benchmark_name"]) + performance_fee = str(row["performance_fee"]) + interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) + buy_spread = str(row["buy_spread"]) + sell_spread = str(row["sell_spread"]) + total_annual_dollar_based_charges = str(row["total_annual_dollar_based_charges"]) + + # get the first row which sec_id in doc_verify_data is same as sec_id + doc_verify_sec_data = doc_verify_data[doc_verify_data["sec_id"] == sec_id] + if len(doc_verify_sec_data) == 0: + continue + doc_verify_sec_row = doc_verify_sec_data.iloc[0] + raw_fund_name = doc_verify_sec_row["raw_fund_name"] + v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"]) + v_management_fee = str(doc_verify_sec_row["management_fee"]) + v_administration_fees = str(doc_verify_sec_row["administration_fees"]) + v_minimum_initial_investment = str(doc_verify_sec_row["minimum_initial_investment"]) + v_benchmark_name = str(doc_verify_sec_row["benchmark_name"]) + v_performance_fee = str(doc_verify_sec_row["performance_fee"]) + v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) + v_buy_spread = str(doc_verify_sec_row["buy_spread"]) + v_sell_spread = str(doc_verify_sec_row["sell_spread"]) + 
v_total_annual_dollar_based_charges = str(doc_verify_sec_row["total_annual_dollar_based_charges"]) + + # v_performance_fee_costs = str(doc_verify_sec_row["performance_fee_costs"]) + # v_buy_spread = str(doc_verify_sec_row["buy_spread"]) + # v_sell_spread = str(doc_verify_sec_row["sell_spread"]) + # v_withdrawal_fee = str(doc_verify_sec_row["withdrawal_fee"]) + # v_switching_fee = str(doc_verify_sec_row["switching_fee"]) + # v_activity_fee = str(doc_verify_sec_row["activity_fee"]) + + message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee_and_costs")) + message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee")) + message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "administration_fees")) + message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment")) + message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name")) + 
message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) + message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, + gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) + message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) + message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) + message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, + gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) + # message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee")) + # message = get_gt_pred_by_compare_values(switching_fee, v_switching_fee, gt_switching_fee_list, pred_switching_fee_list) + # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "switching_fee")) + # message = get_gt_pred_by_compare_values(activity_fee, v_activity_fee, gt_activity_fee_list, pred_activity_fee_list) + # 
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "activity_fee")) + + message_data_df = pd.DataFrame(message_list) + message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', 'data_point', 'gt_value', 'pred_value', 'error']] + # order by doc_id, raw_fund_name, data_point + message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_fund_name', 'data_point']) + message_data_df.reset_index(drop=True, inplace=True) + + # calculate metrics + print("Calculate metrics...") + precision_management_fee_and_costs = precision_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + recall_management_fee_and_costs = recall_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + f1_management_fee_and_costs = f1_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + accuracy_management_fee_and_costs = accuracy_score(gt_management_fee_and_costs_list, pred_management_fee_and_costs_list) + support_management_fee_and_costs = sum(gt_management_fee_and_costs_list) + + precision_management_fee = precision_score(gt_management_fee_list, pred_management_fee_list) + recall_management_fee = recall_score(gt_management_fee_list, pred_management_fee_list) + f1_management_fee = f1_score(gt_management_fee_list, pred_management_fee_list) + accuracy_management_fee = accuracy_score(gt_management_fee_list, pred_management_fee_list) + support_management_fee = sum(gt_management_fee_list) + + precision_administration_fees = precision_score(gt_administration_fees_list, pred_administration_fees_list) + recall_administration_fees = recall_score(gt_administration_fees_list, pred_administration_fees_list) + f1_administration_fees = f1_score(gt_administration_fees_list, pred_administration_fees_list) + accuracy_administration_fees = accuracy_score(gt_administration_fees_list, pred_administration_fees_list) + support_administration_fees = 
sum(gt_administration_fees_list) + + precision_miminimum_initial_investment = precision_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + recall_miminimum_initial_investment = recall_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + f1_miminimum_initial_investment = f1_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + accuracy_miminimum_initial_investment = accuracy_score(gt_minimum_initial_investment_list, pred_minimum_initial_investment_list) + support_miminimum_initial_investment = sum(gt_minimum_initial_investment_list) + + precision_benchmark_name = precision_score(gt_benchmark_name_list, pred_benchmark_name_list) + recall_benchmark_name = recall_score(gt_benchmark_name_list, pred_benchmark_name_list) + f1_benchmark_name = f1_score(gt_benchmark_name_list, pred_benchmark_name_list) + accuracy_benchmark_name = accuracy_score(gt_benchmark_name_list, pred_benchmark_name_list) + support_benchmark_name = sum(gt_benchmark_name_list) + + precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) + recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) + f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) + accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) + support_performance_fee = sum(gt_performance_fee_list) + + precision_interposed_vehicle_performance_fee_cost = precision_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + recall_interposed_vehicle_performance_fee_cost = recall_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + f1_interposed_vehicle_performance_fee_cost = f1_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + 
accuracy_interposed_vehicle_performance_fee_cost = accuracy_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + support_interposed_vehicle_performance_fee_cost = sum(gt_interposed_vehicle_performance_fee_cost_list) + + precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) + recall_buy_spread = recall_score(gt_buy_spread_list, pred_buy_spread_list) + f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) + accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) + support_buy_spread = sum(gt_buy_spread_list) + + precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) + recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) + f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) + accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) + support_buy_spread = sum(gt_sell_spread_list) + + precision_total_annual_dollar_based_charges = precision_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + recall_total_annual_dollar_based_charges = recall_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + f1_total_annual_dollar_based_charges = f1_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + accuracy_total_annual_dollar_based_charges = accuracy_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + support_total_annual_dollar_based_charges = sum(gt_total_annual_dollar_based_charges_list) + + # precision_withdrawal_fee = precision_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # recall_withdrawal_fee = recall_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # f1_withdrawal_fee = f1_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # accuracy_withdrawal_fee = 
accuracy_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) + # support_withdrawal_fee = sum(gt_withdrawal_fee_list) + + # precision_switching_fee = precision_score(gt_switching_fee_list, pred_switching_fee_list) + # recall_switching_fee = recall_score(gt_switching_fee_list, pred_switching_fee_list) + # f1_switching_fee = f1_score(gt_switching_fee_list, pred_switching_fee_list) + # accuracy_switching_fee = accuracy_score(gt_switching_fee_list, pred_switching_fee_list) + # support_switching_fee = sum(gt_switching_fee_list) + + # precision_activity_fee = precision_score(gt_activity_fee_list, pred_activity_fee_list) + # recall_activity_fee = recall_score(gt_activity_fee_list, pred_activity_fee_list) + # f1_activity_fee = f1_score(gt_activity_fee_list, pred_activity_fee_list) + # accuracy_activity_fee = accuracy_score(gt_activity_fee_list, pred_activity_fee_list) + # support_activity_fee = sum(gt_activity_fee_list) + + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, + {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, + {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": 
recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, + {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, + "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": support_interposed_vehicle_performance_fee_cost}, + {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + {"item": "total_annual_dollar_based_charges", "precision": precision_total_annual_dollar_based_charges, "recall": recall_total_annual_dollar_based_charges, + "f1": f1_total_annual_dollar_based_charges, "accuracy": accuracy_total_annual_dollar_based_charges, "support": support_total_annual_dollar_based_charges} + # {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + # {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + # {"item": "withdrawal_fee", "precision": precision_withdrawal_fee, "recall": recall_withdrawal_fee, "f1": f1_withdrawal_fee, "accuracy": accuracy_withdrawal_fee, "support": support_withdrawal_fee}, + # {"item": "switching_fee", "precision": precision_switching_fee, "recall": 
recall_switching_fee, "f1": f1_switching_fee, "accuracy": accuracy_switching_fee, "support": support_switching_fee}, + # {"item": "activity_fee", "precision": precision_activity_fee, "recall": recall_activity_fee, "f1": f1_activity_fee, "accuracy": accuracy_activity_fee, "support": support_activity_fee} + ] + metrics_data_df = pd.DataFrame(metrics_data) + averate_precision = metrics_data_df["precision"].mean() + average_recall = metrics_data_df["recall"].mean() + average_f1 = metrics_data_df["f1"].mean() + average_accuracy = metrics_data_df["accuracy"].mean() + sum_support = metrics_data_df["support"].sum() + metrics_data.append({"item": "average_score", "precision": averate_precision, "recall": average_recall, "f1": average_f1, "accuracy": average_accuracy, "support": sum_support}) + metrics_data_df = pd.DataFrame(metrics_data) + metrics_data_df = metrics_data_df[['item', 'f1', 'precision', 'recall', 'accuracy', 'support']] + + # output metrics data to Excel file + print("Output metrics data to Excel file...") + output_folder = r"/data/aus_prospectus/output/metrics_data/" + os.makedirs(output_folder, exist_ok=True) + verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") + metrics_file_name = f"metrics_{verify_file_name}_4_dps_not_strict.xlsx" + output_file = os.path.join(output_folder, metrics_file_name) + with pd.ExcelWriter(output_file) as writer: + metrics_data_df.to_excel(writer, index=False, sheet_name="metrics_data") + message_data_df.to_excel(writer, index=False, sheet_name="message_data") + + +def generate_message(message: dict, doc_id: str, sec_id: str, fund_legal_name: str, raw_fund_name: str, datapoint: str): + message["data_point"] = datapoint + message["fund_legal_name"] = fund_legal_name + message["raw_fund_name"] = raw_fund_name + message["sec_id"] = sec_id + message["doc_id"] = str(doc_id) + return message + + +def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""): + message = 
{"gt_value": gt_value, "pred_value": pred_value, "error": ""} + if gt_value is not None and len(str(gt_value)) > 0: + gt_list.append(1) + gt_equal_pred = is_equal(gt_value, pred_value, data_point) + if gt_equal_pred: + pred_list.append(1) + else: + pred_list.append(0) + message["error"] = "pred_value is not equal to gt_value" + if pred_value is not None and len(str(pred_value)) > 0: + pred_list.append(1) + gt_list.append(0) + else: + if pred_value is not None and len(str(pred_value)) > 0: + gt_list.append(0) + pred_list.append(1) + message["error"] = "gt_value is empty, but pred_value is not empty" + # else: + # gt_list.append(1) + # pred_list.append(1) + return message + + +def is_equal(gt_value, pred_value, data_point: str = ""): + if gt_value is not None and len(str(gt_value)) > 0 and \ + pred_value is not None and len(str(pred_value)) > 0: + if gt_value == pred_value: + return True + if data_point == "benchmark_name": + gt_value = clean_text(gt_value) + pred_value = clean_text(pred_value) + if gt_value == pred_value or gt_value in pred_value or pred_value in gt_value: + return True + similarity = Similarity() + jacard_score = similarity.jaccard_similarity(gt_value.lower().split(), pred_value.lower().split()) + if jacard_score > 0.8: + return True + return False + + +def clean_text(text: str): + if text is None or len(text) == 0: + return text + text = re.sub(r"\W", " ", text) + text = re.sub(r"\s+", " ", text) + return text + + +def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx", + data_sheet: str = "Sheet1", + raw_name_column: str = "raw_share_name", + mapping_file_path: str = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx", + mapping_sheet: str = "document_mapping", + raw_name_mapping_column: str = None, + output_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"): + 
data_df = pd.read_excel(data_file_path, sheet_name=data_sheet) + data_df["fund_id"] = "" + data_df["fund_name"] = "" + data_df["sec_id"] = "" + data_df["sec_name"] = "" + + mapping_data = pd.read_excel(mapping_file_path, sheet_name=mapping_sheet) + + doc_id_list = data_df["doc_id"].unique().tolist() + for doc_id in doc_id_list: + doc_data = data_df[data_df["doc_id"] == doc_id] + raw_name_list = doc_data[raw_name_column].unique().tolist() + + doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id] + if len(doc_mapping_data) == 0: + continue + provider_name = doc_mapping_data["CompanyName"].values[0] + if raw_name_mapping_column is not None and raw_name_mapping_column == "FundLegalName": + doc_db_name_list = doc_mapping_data[raw_name_mapping_column].unique().tolist() + for raw_name in raw_name_list: + find_df = doc_mapping_data[doc_mapping_data[raw_name_mapping_column] == raw_name] + if find_df is not None and len(find_df) == 1: + sec_id = find_df["FundClassId"].values[0] + sec_name = find_df["FundClassLegalName"].values[0] + fund_id = find_df["FundId"].values[0] + fund_name = find_df["FundLegalName"].values[0] + # update doc_data which raw_share_name is same as raw_share_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "sec_id"] = sec_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "sec_name"] = sec_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_id"] = fund_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_name"] = fund_name + else: + doc_db_name_list = doc_mapping_data["FundClassLegalName"].unique().tolist() + all_match_result = get_raw_name_db_match_result(doc_id, + provider_name, + raw_name_list, + doc_db_name_list, + iter_count=60) + for raw_share_name in raw_name_list: + if all_match_result.get(raw_share_name) is not None: + matched_db_share_name = 
all_match_result[raw_share_name] + if ( + matched_db_share_name is not None + and len(matched_db_share_name) > 0 + ): + # get SecId from self.doc_fund_class_mapping + find_share_df = doc_mapping_data[doc_mapping_data["FundClassLegalName"] == matched_db_share_name] + if find_share_df is not None and len(find_share_df) > 0: + sec_id = find_share_df["FundClassId"].values[0] + fund_id = find_share_df["FundId"].values[0] + fund_name = find_share_df["FundLegalName"].values[0] + # update doc_data which raw_share_name is same as raw_share_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "sec_id"] = sec_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "sec_name"] = matched_db_share_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_id"] = fund_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_name"] = fund_name + try: + data_df = data_df[["doc_id", + "raw_fund_name", + "fund_id", + "fund_name", + "raw_share_name", + "sec_id", + "sec_name", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee", + "performance_fee_charged", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges", + "interposed_vehicle_performance_fee_cost", + "establishment_fee", + "contribution_fee", + "withdrawal_fee", + "exit_fee", + "switching_fee", + "activity_fee", + "hurdle_rate", + "analyst_name" + ]] + except Exception as e: + print(e) + + with open(output_file_path, "wb") as file: + data_df.to_excel(file, index=False) + + +def get_raw_name_db_match_result( + doc_id: str, provider_name: str, raw_name_list: list, doc_share_name_list: list, iter_count: int = 30 + ): + # split raw_name_list into several parts which each part is with 30 elements + # The reason to split is to avoid invoke token limitation 
issues from CahtGPT + raw_name_list_parts = [ + raw_name_list[i : i + iter_count] + for i in range(0, len(raw_name_list), iter_count) + ] + all_match_result = {} + doc_share_name_list = deepcopy(doc_share_name_list) + for raw_name_list in raw_name_list_parts: + match_result, doc_share_name_list = get_final_function_to_match( + doc_id, provider_name, raw_name_list, doc_share_name_list + ) + all_match_result.update(match_result) + return all_match_result + +def get_final_function_to_match(doc_id, provider_name, raw_name_list, db_name_list): + if len(db_name_list) == 0: + match_result = {} + for raw_name in raw_name_list: + match_result[raw_name] = "" + else: + match_result = final_function_to_match( + doc_id=doc_id, + pred_list=raw_name_list, + db_list=db_name_list, + provider_name=provider_name, + doc_source="aus_prospectus" + ) + matched_name_list = list(match_result.values()) + db_name_list = remove_matched_names(db_name_list, matched_name_list) + return match_result, db_name_list + +def remove_matched_names(target_name_list: list, matched_name_list: list): + if len(matched_name_list) == 0: + return target_name_list + + matched_name_list = list(set(matched_name_list)) + matched_name_list = [ + value for value in matched_name_list if value is not None and len(value) > 0 + ] + for matched_name in matched_name_list: + if ( + matched_name is not None + and len(matched_name) > 0 + and matched_name in target_name_list + ): + target_name_list.remove(matched_name) + return target_name_list + + +def set_mapping_to_ravi_data(): + data_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx" + data_sheet = "Sheet1" + mapping_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" + mapping_sheet = "document_mapping" + output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" + set_mapping_to_raw_name_data(data_file_path, data_sheet, 
mapping_file_path, mapping_sheet, output_file_path) + + +def set_mapping_to_data_side_documents_data(): + # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx" + # data_sheet = "all" + # mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" + # mapping_sheet = "document_mapping" + # output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx" + + data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx" + data_sheet = "ground_truth" + raw_name_column = "raw_share_name" + mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" + mapping_sheet = "document_mapping" + raw_name_mapping_column = None + output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" + set_mapping_to_raw_name_data(data_file_path=data_file_path, + data_sheet=data_sheet, + raw_name_column=raw_name_column, + mapping_file_path=mapping_file_path, + mapping_sheet=mapping_sheet, + raw_name_mapping_column=raw_name_mapping_column, + output_file_path=output_file_path) + + +def adjust_data_file(source_file: str, + targe_file: str): + source_data = pd.read_excel(source_file, sheet_name="Sheet1") + source_doc_id_list = source_data["DocumentId"].unique().tolist() + + target_data = pd.read_excel(targe_file, sheet_name="Sheet1") + #remove target_data which doc_id is in source_doc_id_list + target_data = target_data[~target_data["DocumentId"].isin(source_doc_id_list)] + # concat source_data and target_data + target_data = pd.concat([source_data, target_data], ignore_index=True) + with open(targe_file, "wb") as file: + target_data.to_excel(file, index=False) + + +if __name__ == "__main__": + # adjust_column_order() + # set_mapping_to_data_side_documents_data() + + # source_file = 
r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx" + # target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" + # adjust_data_file(source_file=source_file, targe_file=target_file) + + # audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx" + # audit_data_sheet: str = "Sheet1" + # verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx" + # verify_data_sheet: str = "total_data" + + audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" + audit_data_sheet: str = "Sheet1" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250305160321_ravi.xlsx" + verify_data_sheet: str = "total_data" + calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, + audit_data_sheet=audit_data_sheet, + verify_file_path=verify_file_path, + verify_data_sheet=verify_data_sheet) + + # set_mapping_to_17_documents_data() + # set_mapping_to_ravi_data() + + # calculate_metrics_based_audit_file(is_strict=True) + # calculate_metrics_based_audit_file(is_strict=False) + # remove_ter_ogc_performance_fee_annotation() + # batch_run_documents() + # transform_pdf_2_image() + # ground_truth_file = "./test_metrics/ground_truth.xlsx" + # prediction_file = "./test_metrics/prediction.xlsx" + # calc_metrics(ground_truth_file, prediction_file) + + # pdf_file = r"./data/emea_ar/pdf/532438210.pdf" + # page_list = [25, 26, 27, 28, 29] + # output_folder = r"./data/emea_ar/output/pdf_part/" + # output_part_of_pages(pdf_file, page_list, output_folder) From c4ed65770d3e4b77ef32ecacb509a127fb39a8f3 Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 5 Mar 2025 17:21:13 -0600 Subject: [PATCH 
02/11] Try to support more complex management_fee_and_costs scenarios Support calculate all of data points metrics --- calc_metrics.py | 190 ++++++++++-------- core/data_extraction.py | 2 +- .../data_extraction_prompts_config.json | 90 ++++++++- main.py | 8 +- 4 files changed, 198 insertions(+), 92 deletions(-) diff --git a/calc_metrics.py b/calc_metrics.py index 243c000..4936327 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -559,7 +559,9 @@ def calculate_metrics_based_audit_file(is_strict: bool = False): def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx", audit_data_sheet: str = "Sheet1", verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx", - verify_data_sheet: str = "total_data" + verify_data_sheet: str = "total_data", + verify_document_list_file: str = None, + is_for_all: bool = False ): print("Start to calculate metrics based on DB data file and extracted file...") audit_data_df = pd.DataFrame() @@ -648,16 +650,17 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros pred_minimum_initial_investment_list = [] gt_benchmark_name_list = [] pred_benchmark_name_list = [] - gt_performance_fee_list = [] - pred_performance_fee_list = [] - gt_interposed_vehicle_performance_fee_cost_list = [] - pred_interposed_vehicle_performance_fee_cost_list = [] - gt_buy_spread_list = [] - pred_buy_spread_list = [] - gt_sell_spread_list = [] - pred_sell_spread_list = [] - gt_total_annual_dollar_based_charges_list = [] - pred_total_annual_dollar_based_charges_list = [] + if is_for_all: + gt_performance_fee_list = [] + pred_performance_fee_list = [] + gt_interposed_vehicle_performance_fee_cost_list = [] + pred_interposed_vehicle_performance_fee_cost_list = [] + gt_buy_spread_list = [] + pred_buy_spread_list = [] + gt_sell_spread_list = [] + 
pred_sell_spread_list = [] + gt_total_annual_dollar_based_charges_list = [] + pred_total_annual_dollar_based_charges_list = [] # gt_performance_fee_costs_list = [] # pred_performance_fee_costs_list = [] @@ -672,6 +675,12 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # gt_activity_fee_list = [] # pred_activity_fee_list = [] + if verify_document_list_file is not None: + with open(verify_document_list_file, "r", encoding="utf-8") as f: + verify_document_list = f.readlines() + verify_document_list = [int(doc_id.strip()) for doc_id in verify_document_list] + if len(verify_document_list) > 0: + verify_data_df = verify_data_df[verify_data_df["doc_id"].isin(verify_document_list)] document_id_list = verify_data_df["doc_id"].unique().tolist() print(f"Total document count: {len(document_id_list)}") @@ -693,11 +702,12 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros administration_fees = str(row["administration_fees"]) minimum_initial_investment = str(row["minimum_initial_investment"]) benchmark_name = str(row["benchmark_name"]) - performance_fee = str(row["performance_fee"]) - interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) - buy_spread = str(row["buy_spread"]) - sell_spread = str(row["sell_spread"]) - total_annual_dollar_based_charges = str(row["total_annual_dollar_based_charges"]) + if is_for_all: + performance_fee = str(row["performance_fee"]) + interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) + buy_spread = str(row["buy_spread"]) + sell_spread = str(row["sell_spread"]) + total_annual_dollar_based_charges = str(row["total_annual_dollar_based_charges"]) # get the first row which sec_id in doc_verify_data is same as sec_id doc_verify_sec_data = doc_verify_data[doc_verify_data["sec_id"] == sec_id] @@ -710,11 +720,12 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros 
v_administration_fees = str(doc_verify_sec_row["administration_fees"]) v_minimum_initial_investment = str(doc_verify_sec_row["minimum_initial_investment"]) v_benchmark_name = str(doc_verify_sec_row["benchmark_name"]) - v_performance_fee = str(doc_verify_sec_row["performance_fee"]) - v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) - v_buy_spread = str(doc_verify_sec_row["buy_spread"]) - v_sell_spread = str(doc_verify_sec_row["sell_spread"]) - v_total_annual_dollar_based_charges = str(doc_verify_sec_row["total_annual_dollar_based_charges"]) + if is_for_all: + v_performance_fee = str(doc_verify_sec_row["performance_fee"]) + v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) + v_buy_spread = str(doc_verify_sec_row["buy_spread"]) + v_sell_spread = str(doc_verify_sec_row["sell_spread"]) + v_total_annual_dollar_based_charges = str(doc_verify_sec_row["total_annual_dollar_based_charges"]) # v_performance_fee_costs = str(doc_verify_sec_row["performance_fee_costs"]) # v_buy_spread = str(doc_verify_sec_row["buy_spread"]) @@ -733,18 +744,19 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment")) message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name")) - message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) - message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, 
v_interposed_vehicle_performance_fee_cost, - gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) - message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) - message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) - message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, - gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) + if is_for_all: + message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) + message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, + gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) + message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) + message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, 
gt_sell_spread_list, pred_sell_spread_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) + message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, + gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) # message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list) # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee")) # message = get_gt_pred_by_compare_values(switching_fee, v_switching_fee, gt_switching_fee_list, pred_switching_fee_list) @@ -790,35 +802,36 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros accuracy_benchmark_name = accuracy_score(gt_benchmark_name_list, pred_benchmark_name_list) support_benchmark_name = sum(gt_benchmark_name_list) - precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) - recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) - f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) - accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) - support_performance_fee = sum(gt_performance_fee_list) - - precision_interposed_vehicle_performance_fee_cost = precision_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - recall_interposed_vehicle_performance_fee_cost = recall_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - f1_interposed_vehicle_performance_fee_cost = f1_score(gt_interposed_vehicle_performance_fee_cost_list, 
pred_interposed_vehicle_performance_fee_cost_list) - accuracy_interposed_vehicle_performance_fee_cost = accuracy_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - support_interposed_vehicle_performance_fee_cost = sum(gt_interposed_vehicle_performance_fee_cost_list) - - precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) - recall_buy_spread = recall_score(gt_buy_spread_list, pred_buy_spread_list) - f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) - accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) - support_buy_spread = sum(gt_buy_spread_list) - - precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) - recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) - f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) - accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) - support_buy_spread = sum(gt_sell_spread_list) - - precision_total_annual_dollar_based_charges = precision_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - recall_total_annual_dollar_based_charges = recall_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - f1_total_annual_dollar_based_charges = f1_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - accuracy_total_annual_dollar_based_charges = accuracy_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - support_total_annual_dollar_based_charges = sum(gt_total_annual_dollar_based_charges_list) + if is_for_all: + precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) + recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) + f1_performance_fee = f1_score(gt_performance_fee_list, 
pred_performance_fee_list) + accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) + support_performance_fee = sum(gt_performance_fee_list) + + precision_interposed_vehicle_performance_fee_cost = precision_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + recall_interposed_vehicle_performance_fee_cost = recall_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + f1_interposed_vehicle_performance_fee_cost = f1_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + accuracy_interposed_vehicle_performance_fee_cost = accuracy_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) + support_interposed_vehicle_performance_fee_cost = sum(gt_interposed_vehicle_performance_fee_cost_list) + + precision_buy_spread = precision_score(gt_buy_spread_list, pred_buy_spread_list) + recall_buy_spread = recall_score(gt_buy_spread_list, pred_buy_spread_list) + f1_buy_spread = f1_score(gt_buy_spread_list, pred_buy_spread_list) + accuracy_buy_spread = accuracy_score(gt_buy_spread_list, pred_buy_spread_list) + support_buy_spread = sum(gt_buy_spread_list) + + precision_sell_spread = precision_score(gt_sell_spread_list, pred_sell_spread_list) + recall_sell_spread = recall_score(gt_sell_spread_list, pred_sell_spread_list) + f1_sell_spread = f1_score(gt_sell_spread_list, pred_sell_spread_list) + accuracy_sell_spread = accuracy_score(gt_sell_spread_list, pred_sell_spread_list) + support_buy_spread = sum(gt_sell_spread_list) + + precision_total_annual_dollar_based_charges = precision_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + recall_total_annual_dollar_based_charges = recall_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + 
f1_total_annual_dollar_based_charges = f1_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + accuracy_total_annual_dollar_based_charges = accuracy_score(gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) + support_total_annual_dollar_based_charges = sum(gt_total_annual_dollar_based_charges_list) # precision_withdrawal_fee = precision_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) # recall_withdrawal_fee = recall_score(gt_withdrawal_fee_list, pred_withdrawal_fee_list) @@ -837,25 +850,32 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # f1_activity_fee = f1_score(gt_activity_fee_list, pred_activity_fee_list) # accuracy_activity_fee = accuracy_score(gt_activity_fee_list, pred_activity_fee_list) # support_activity_fee = sum(gt_activity_fee_list) - - metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, - {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, - {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, - {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, - {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, 
"accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, - {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, - {"item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, - "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": support_interposed_vehicle_performance_fee_cost}, - {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, - {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, - {"item": "total_annual_dollar_based_charges", "precision": precision_total_annual_dollar_based_charges, "recall": recall_total_annual_dollar_based_charges, - "f1": f1_total_annual_dollar_based_charges, "accuracy": accuracy_total_annual_dollar_based_charges, "support": support_total_annual_dollar_based_charges} - # {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, - # {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, - # {"item": "withdrawal_fee", "precision": precision_withdrawal_fee, "recall": recall_withdrawal_fee, "f1": f1_withdrawal_fee, "accuracy": accuracy_withdrawal_fee, "support": support_withdrawal_fee}, - # {"item": "switching_fee", "precision": precision_switching_fee, "recall": recall_switching_fee, "f1": f1_switching_fee, "accuracy": 
accuracy_switching_fee, "support": support_switching_fee}, - # {"item": "activity_fee", "precision": precision_activity_fee, "recall": recall_activity_fee, "f1": f1_activity_fee, "accuracy": accuracy_activity_fee, "support": support_activity_fee} - ] + if is_for_all: + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, + {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, + {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, + {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, + "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": 
support_interposed_vehicle_performance_fee_cost}, + {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + {"item": "total_annual_dollar_based_charges", "precision": precision_total_annual_dollar_based_charges, "recall": recall_total_annual_dollar_based_charges, + "f1": f1_total_annual_dollar_based_charges, "accuracy": accuracy_total_annual_dollar_based_charges, "support": support_total_annual_dollar_based_charges} + # {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, + # {"item": "sell_spread", "precision": precision_sell_spread, "recall": recall_sell_spread, "f1": f1_sell_spread, "accuracy": accuracy_sell_spread, "support": support_buy_spread}, + # {"item": "withdrawal_fee", "precision": precision_withdrawal_fee, "recall": recall_withdrawal_fee, "f1": f1_withdrawal_fee, "accuracy": accuracy_withdrawal_fee, "support": support_withdrawal_fee}, + # {"item": "switching_fee", "precision": precision_switching_fee, "recall": recall_switching_fee, "f1": f1_switching_fee, "accuracy": accuracy_switching_fee, "support": support_switching_fee}, + # {"item": "activity_fee", "precision": precision_activity_fee, "recall": recall_activity_fee, "f1": f1_activity_fee, "accuracy": accuracy_activity_fee, "support": support_activity_fee} + ] + else: + metrics_data = [{"item": "management_fee_and_costs", "precision": precision_management_fee_and_costs, "recall": recall_management_fee_and_costs, "f1": f1_management_fee_and_costs, "accuracy": accuracy_management_fee_and_costs, "support": support_management_fee_and_costs}, + {"item": "management_fee", "precision": 
precision_management_fee, "recall": recall_management_fee, "f1": f1_management_fee, "accuracy": accuracy_management_fee, "support": support_management_fee}, + {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, + {"item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, + {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name} + ] metrics_data_df = pd.DataFrame(metrics_data) averate_precision = metrics_data_df["precision"].mean() average_recall = metrics_data_df["recall"].mean() @@ -871,7 +891,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros output_folder = r"/data/aus_prospectus/output/metrics_data/" os.makedirs(output_folder, exist_ok=True) verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") - metrics_file_name = f"metrics_{verify_file_name}_4_dps_not_strict.xlsx" + metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx" output_file = os.path.join(output_folder, metrics_file_name) with pd.ExcelWriter(output_file) as writer: metrics_data_df.to_excel(writer, index=False, sheet_name="metrics_data") @@ -1148,12 +1168,14 @@ if __name__ == "__main__": audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = 
r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250305160321_ravi.xlsx" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250305170202.xlsx" verify_data_sheet: str = "total_data" + verify_document_list_file: str = "./sample_documents/aus_prospectus_17_documents_sample.txt" calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, audit_data_sheet=audit_data_sheet, verify_file_path=verify_file_path, - verify_data_sheet=verify_data_sheet) + verify_data_sheet=verify_data_sheet, + verify_document_list_file = verify_document_list_file) # set_mapping_to_17_documents_data() # set_mapping_to_ravi_data() diff --git a/core/data_extraction.py b/core/data_extraction.py index 3d8269c..ab880f2 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -575,7 +575,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 40: + # if page_num != 21: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 78de6d0..04b2fff 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -162,6 +162,26 @@ "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", + "B.3 With \"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", sum the values from these 3 columns.", + 
"---Example Start---", + "Fund \nManagement \nfee 1 \n(% pa) \nIndirect costs1\n(% pa)\nEstimated performance fees2\n(% pa)\nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses 3 \nEstimated \nother indirect \ncosts \nPerformance \nfees charged \nto the Fund \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nipac Life \nChoices \nActive 50 \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \nipac Life \nChoices \nActive 70 \n0.79 \n0.01 \n0.08 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", + "---Example End---", + "For this case: ", + "a. The table header is with secondary-level header.", + "b. The fund name is before the data row, e.g. ipac Life Choices Active 50", + "c. The data points numbers order in data row, for example: \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \n is correct as initial table structure.", + "The 1st number: 0.70 is the management_fee,", + "the 2nd number: 0.02 is the recoverable_expenses,", + "the 3rd number: 0.09 is the indirect_costs", + "the 4th number: 0.00 is the performance_fee,", + "the 5th number: 0.05 is the interposed_vehicle_performance_fee_cost, ", + "the 6th number: 0.14 is the transaction costs, please ignore this number.", + "the 7th number: 0.10 is the buy_spread, ", + "the 8th number: 0.10 is the sell_spread.", + "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.70 + 0.02 + 0.09= 0.81", + "The output should be:", + "{\"data\": [{\"fund name\": \"ipac Life Choices Active 50\", \"share name\": \"ipac Life Choices Active 50\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.7, \"recoverable_expenses\": 0.02, \"indirect_costs\": 0.09, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"ipac Life Choices Active 70\", \"share name\": \"ipac Life Choices Active 70\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.79, 
\"recoverable_expenses\": 0.01, \"indirect_costs\": 0.08, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", + "\n", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", @@ -391,12 +411,12 @@ "b. The fund name is after the data row, e.g. MyNorth Australian Fixed Interest Index", "c. The data points numbers order in data row, for example: \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 is correct as initial table structure.", "The 1st number: 0.20 is the management_fee, the 2nd number and the 3th number: 0.01 0.00 are the indirect costs, ", - "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ", + "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", "the 6th number: 0.00 is the transaction costs, ", "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", "The management_fee_and_costs is management_fee + indirect costs = 0.20 + 0.01 + 0.00= 0.21", "The output should be: ", - "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, 
\"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", + "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", "\n", "---Example 2 Start---", "Indirect costs \ni \nEstimated performance fees \nii \nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nMyNorth Index \nModerately \nDefensive \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 \nMyNorth Index \nBalanced \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.09/0.09 \n", @@ -407,7 +427,7 @@ "c. 
The data points numbers order in data row, for example: \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 is correct as initial table structure.", "The 1st number: 0.55 is the management_fee, the 2nd number and the 3th number: 0.00 0.00 are the indirect costs, ", "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ", - "the 6th number: 0.01 is the transaction costs, ", + "the 6th number: 0.01 is the transaction costs, please ignore this number.", "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", "The management_fee_and_costs is management_fee + indirect costs = 0.55 + 0.00 + 0.00= 0.55", "The output should be: ", @@ -433,6 +453,70 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}" ] + }, + { + "keywords": ["Option name \nIndirect costs"], + "prompts": ["Complex management fee and costs rule:", + "If the table with columns:", + "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the option by underlying managers\", 
\"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", + "The management_fee is \"Management fee (% pa)\".", + "The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", + "The indirect_costs is \"Estimated other indirect costs\"", + "The recoverable_expenses is \"Recoverable expenses\"", + "The performance_fee is \"Peformance fees charged to the option by underlying managers\".", + "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", + "The buy_spread and sell_spread are \"Buy/sell spreads\".", + "---Example 1 Start---", + "Option name \nIndirect costs \n(i)\nEstimated performance fees \n(ii)\nManagement \nfee \n(% pa) \n(i) \n(% pa) \n(% pa) \nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads \n(%) \n(iv) \nRecoverable \nexpenses \n(iii) \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe option \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nGenerations Defensive \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \nGenerations Moderately \nDefensive \n1.00 \n0.08 \n0.10 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", + "---Example 1 End---", + "For this case: ", + "a. The table header is with disorder issue during PDF contents extraction issue.", + "b. The fund name is before the data row, e.g. Generations Defensive", + "c. 
The data points numbers order in data row, for example: \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \n is correct as initial table structure.", + "The 1st number: 0.90 is the management_fee,", + "the 2nd number: 0.26 is the recoverable_expenses,", + "the 3rd number: 0.12 is the indirect_costs", + "the 4th number: 0.00 is the performance_fee,", + "the 5th number: 0.06 is the interposed_vehicle_performance_fee_cost, ", + "the 6th number: 0.17 is the transaction costs, please ignore this number.", + "the 7th number: 0.09 is the buy_spread, ", + "the 8th number: 0.08 is the sell_spread.", + "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.90 + 0.26 + 0.12= 1.28", + "The output should be: ", + "{\"data\": [{\"fund name\": \"Generations Defensive\", \"share name\": \"Generations Defensive\", \"management_fee_and_costs\": 1.28, \"management_fee\": 0.9, \"recoverable_expenses\": 0.26, \"indirect_costs\": 0.12, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.06, \"buy_spread\": 0.09, \"sell_spread\": 0.08}, {\"fund name\": \"Generations Moderately Defensive\", \"share name\": \"Generations Moderately Defensive\", \"management_fee_and_costs\": 1.18, \"management_fee\": 1, \"recoverable_expenses\": 0.08, \"indirect_costs\": 0.1,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" + ] + }, + { + "keywords": "Management \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa)", + "prompts": ["Complex management fee and costs rule:", + "If the table with columns:", + "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", + "The management_fee is \"Management fee (% pa)\".", + "The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + 
\"Estimated other indirect costs\".", + "The indirect_costs is \"Estimated other indirect costs\"", + "The recoverable_expenses is \"Recoverable expenses\"", + "The performance_fee is \"Peformance fees charged to the Investment Option by underlying managers\".", + "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", + "The buy_spread and sell_spread are \"Buy/sell spreads\".", + "---Example 1 Start---", + "Investment Option \nManagement \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa) \nEstimated performance fees (ii) \n(% pa) \nTransaction \ncosts (% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses (iii) \nEstimated \nother \nindirect costs \nPerformance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \nPerformance fees \ncharged by \ninterposed \nvehicles \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n", + "---Example 1 End---", + "For this case: ", + "a. The table header is with secondary-level header.", + "b. The fund name is before the data row, e.g. North Active Defensive", + "c. 
The data points numbers order in data row, for example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is correct as initial table structure.", + "The 1st number: 0.62 is the management_fee,", + "the 2nd number: 0.18 is the recoverable_expenses,", + "the 3rd number: 0.05 is the indirect_costs", + "the 4th number: 0.00 is the performance_fee,", + "the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", + "the 6th number: 0.14 is the transaction costs, please ignore this number.", + "the 7th number: 0.08 is the buy_spread, ", + "the 8th number: 0.08 is the sell_spread.", + "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.62 + 0.18 + 0.05= 0.85", + "The output should be: ", + "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, 
\"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}" + ] } ] } diff --git a/main.py b/main.py index 1171af8..e28c426 100644 --- a/main.py +++ b/main.py @@ -1504,9 +1504,9 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = False - re_run_mapping_data = False - force_save_total_data = True + re_run_extract_data = True + re_run_mapping_data = True + force_save_total_data = False doc_source = "aus_prospectus" # doc_source = "emea_ar" if doc_source == "aus_prospectus": @@ -1525,7 +1525,7 @@ if __name__ == "__main__": # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list: list = ["384508026"] + # special_doc_id_list: list = ["539261734"] # special_doc_id_list: list = ["401212184"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" From 52515fc1527e5f7c5c80e95774aa3cf1ef2619f3 Mon Sep 17 00:00:00 2001 From: Blade He Date: Thu, 6 Mar 2025 17:27:18 -0600 Subject: [PATCH 03/11] 1. simplify management_fee_and_costs instructions 2. optimize management_fee_and_costs instructions 3. 
resolve the issues for complex scenarios: need sum management_fee, recoverable_expenses, indirect_costs as management_fee_and_costs --- calc_metrics.py | 16 ++- core/data_extraction.py | 74 +++++------ .../data_extraction_prompts_config.json | 120 +++--------------- main.py | 5 +- utils/biz_utils.py | 82 +++++++++++- 5 files changed, 145 insertions(+), 152 deletions(-) diff --git a/calc_metrics.py b/calc_metrics.py index 4936327..41de9e1 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1168,14 +1168,16 @@ if __name__ == "__main__": audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250305170202.xlsx" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250306171226.xlsx" verify_data_sheet: str = "total_data" - verify_document_list_file: str = "./sample_documents/aus_prospectus_17_documents_sample.txt" - calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, - audit_data_sheet=audit_data_sheet, - verify_file_path=verify_file_path, - verify_data_sheet=verify_data_sheet, - verify_document_list_file = verify_document_list_file) + # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" + verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"] + for verify_document_list_file in verify_document_list_file_list: + calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, + audit_data_sheet=audit_data_sheet, + verify_file_path=verify_file_path, + verify_data_sheet=verify_data_sheet, + verify_document_list_file = verify_document_list_file) # set_mapping_to_17_documents_data() # 
set_mapping_to_ravi_data() diff --git a/core/data_extraction.py b/core/data_extraction.py index ab880f2..715546b 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -9,7 +9,8 @@ from utils.gpt_utils import chat from utils.pdf_util import PDFUtil from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider from utils.logger import logger -from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data +from utils.biz_utils import add_slash_to_text_as_regex, clean_text, \ + get_most_similar_name, remove_abundant_data, replace_special_table_header class DataExtraction: @@ -575,7 +576,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 21: + # if page_num != 24: # continue if page_num in handled_page_num_list: continue @@ -593,6 +594,7 @@ class DataExtraction: else: previous_page_fund_name = None + page_text = replace_special_table_header(page_text) extract_data = self.extract_data_by_page( page_num, page_text, @@ -657,6 +659,7 @@ class DataExtraction: ) if not with_same_structure_table: break + next_page_text = replace_special_table_header(next_page_text) target_text = current_text + next_page_text else: target_text = "" @@ -1507,6 +1510,32 @@ class DataExtraction: complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") with_special_rule_title = False for datapoint in datapoints: + find_complex_special_rule = False + if page_text is not None and len(page_text) > 0: + complex_special_rule_list = complex_special_rule.get(datapoint, []) + for complex_special_rule in complex_special_rule_list: + complex_keywords = complex_special_rule.get("keywords", []) + if len(complex_keywords) == 0: + continue + exist_keywords = False + for special_keywords in complex_keywords: + special_keywrods_regex = add_slash_to_text_as_regex(special_keywords) + if 
special_keywords in page_text or \ + re.search(special_keywrods_regex, page_text) is not None: + exist_keywords = True + break + if exist_keywords: + complex_prompts_list = complex_special_rule.get("prompts", []) + if len(complex_prompts_list) > 0: + if not with_special_rule_title: + instructions.append("Special rule:\n") + with_special_rule_title = True + complex_prompts = "\n".join(complex_prompts_list) + instructions.append(complex_prompts) + instructions.append("\n\n") + find_complex_special_rule = True + if find_complex_special_rule: + continue special_rule_list = special_rule_info.get(datapoint, []) if len(special_rule_list) > 0: if not with_special_rule_title: @@ -1515,26 +1544,7 @@ class DataExtraction: special_rule = "\n".join(special_rule_list) instructions.append(special_rule) instructions.append("\n\n") - if page_text is None or len(page_text) == 0: - continue - complex_special_rule_list = complex_special_rule.get(datapoint, []) - for complex_special_rule in complex_special_rule_list: - complex_keywords = complex_special_rule.get("keywords", []) - if len(complex_keywords) == 0: - continue - exist_keywords = False - for special_keywords in complex_keywords: - special_keywrods_regex = add_slash_to_text_as_regex(special_keywords) - if special_keywords in page_text or \ - re.search(special_keywrods_regex, page_text) is not None: - exist_keywords = True - break - if exist_keywords: - complex_prompts_list = complex_special_rule.get("prompts", []) - if len(complex_prompts_list) > 0: - complex_prompts = "\n".join(complex_prompts_list) - instructions.append(complex_prompts) - instructions.append("\n\n") + instructions.append("\n") instructions.append("Special cases:\n") @@ -1563,26 +1573,8 @@ class DataExtraction: contents_list = special_case.get("contents", []) contents = "\n".join(contents_list) instructions.append(contents) - instructions.append("\n\n") + instructions.append("\n") instructions.append("\n") - - # extreme_complex_config_list = 
special_cases.get("extreme_complex", []) - # if len(extreme_complex_config_list) > 0: - # for extreme_complex_config in extreme_complex_config_list: - # regex = extreme_complex_config.get("regex", "") - # if len(regex) == 0: - # continue - # search = re.search(regex, page_text) - # if search is not None: - # title = extreme_complex_config.get("title", "") - # title = f"{special_cases_number}. {title} " - # special_cases_number += 1 - # instructions.append(title) - # instructions.append("\n") - # contents_list = extreme_complex_config.get("contents", []) - # contents = "\n".join(contents_list) - # instructions.append(contents) - # instructions.append("\n\n") instructions.append("Output requirement:\n") output_requirement = self.instructions_config.get("output_requirement", {}) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 04b2fff..064dc36 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -162,26 +162,6 @@ "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", - "B.3 With \"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", sum the values from these 3 columns.", - "---Example Start---", - "Fund \nManagement \nfee 1 \n(% pa) \nIndirect costs1\n(% pa)\nEstimated performance fees2\n(% pa)\nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses 3 \nEstimated \nother indirect \ncosts \nPerformance \nfees charged \nto the Fund \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nipac 
Life \nChoices \nActive 50 \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \nipac Life \nChoices \nActive 70 \n0.79 \n0.01 \n0.08 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", - "---Example End---", - "For this case: ", - "a. The table header is with secondary-level header.", - "b. The fund name is before the data row, e.g. ipac Life Choices Active 50", - "c. The data points numbers order in data row, for example: \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \n is correct as initial table structure.", - "The 1st number: 0.70 is the management_fee,", - "the 2nd number: 0.02 is the recoverable_expenses,", - "the 3rd number: 0.09 is the indirect_costs", - "the 4th number: 0.00 is the performance_fee,", - "the 5th number: 0.05 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.14 is the transaction costs, please ignore this number.", - "the 7th number: 0.10 is the buy_spread, ", - "the 8th number: 0.10 is the sell_spread.", - "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.70 + 0.02 + 0.09= 0.81", - "The output should be:", - "{\"data\": [{\"fund name\": \"ipac Life Choices Active 50\", \"share name\": \"ipac Life Choices Active 50\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.7, \"recoverable_expenses\": 0.02, \"indirect_costs\": 0.09, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"ipac Life Choices Active 70\", \"share name\": \"ipac Life Choices Active 70\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.79, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0.08, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", - "\n", "C. 
If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", @@ -394,46 +374,6 @@ "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee\": 0}" ] }, - { - "keywords": ["Indirect costs \ni \nEstimated performance fees"], - "prompts": ["Complex management fee and costs rule:", - "If the table with columns:", - "\"Management fee (% pa)\", \"Indirect costs\", \"Estimated performance 
fees\", \"Buy/sell spreads\"", - "The management_fee is \"Management fee (% pa)\".", - "The management_fee_costs is \"Management fee (% pa)\" + \"Indirect costs\".", - "The performance_fee is \"Estimated performance fees\"", - "The buy_spread and sell_spread are \"Buy/sell spreads\".", - "---Example 1 Start---", - "Indirect costs \ni\nEstimated performance fees \nii\nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nRecoverable \nexpenses \niii \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe Investment \nOption by \nunderlying \nmanagers \nPerformance \nfees charged by \ninterposed \nvehicles \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n", - "---Example 1 End---", - "For this case: ", - "a. The table header is with disorder issue during PDF contents extraction issue.", - "b. The fund name is after the data row, e.g. MyNorth Australian Fixed Interest Index", - "c. 
The data points numbers order in data row, for example: \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 is correct as initial table structure.", - "The 1st number: 0.20 is the management_fee, the 2nd number and the 3th number: 0.01 0.00 are the indirect costs, ", - "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.00 is the transaction costs, ", - "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + indirect costs = 0.20 + 0.01 + 0.00= 0.21", - "The output should be: ", - "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", - "\n", - "---Example 2 Start---", - "Indirect costs \ni \nEstimated performance fees \nii \nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nMyNorth Index \nModerately \nDefensive \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 \nMyNorth Index \nBalanced \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.09/0.09 \n", - "---Example 2 End---", - "For this case: ", - "a. The table header is with disorder issue during PDF contents extraction issue.", - "b. The fund name is before the data row, e.g. MyNorth Index Moderately \nDefensive", - "c. 
The data points numbers order in data row, for example: \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 is correct as initial table structure.", - "The 1st number: 0.55 is the management_fee, the 2nd number and the 3th number: 0.00 0.00 are the indirect costs, ", - "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ", - "the 6th number: 0.01 is the transaction costs, please ignore this number.", - "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + indirect costs = 0.55 + 0.00 + 0.00= 0.55", - "The output should be: ", - "{\"data\": [{\"fund name\": \"MyNorth Index Moderately Defensive\", \"share name\": \"MyNorth Index Moderately Defensive\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth Index Balanced\", \"share name\": \"MyNorth Index Balanced\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.09, \"sell_spread\": 0.09}]}" - ] - }, { "keywords": ["Retirement and TTR income streams"], "prompts": ["Complex management fee and costs rule:", @@ -455,67 +395,45 @@ ] }, { - "keywords": ["Option name \nIndirect costs"], - "prompts": ["Complex management fee and costs rule:", - "If the table with columns:", - "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", - "The management_fee is \"Management fee (% pa)\".", - "The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", - "The indirect_costs is \"Estimated other indirect costs\"", - "The recoverable_expenses is \"Recoverable expenses\"", - "The 
performance_fee is \"Peformance fees charged to the option by underlying managers\".", - "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", - "The buy_spread and sell_spread are \"Buy/sell spreads\".", - "---Example 1 Start---", - "Option name \nIndirect costs \n(i)\nEstimated performance fees \n(ii)\nManagement \nfee \n(% pa) \n(i) \n(% pa) \n(% pa) \nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads \n(%) \n(iv) \nRecoverable \nexpenses \n(iii) \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe option \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nGenerations Defensive \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \nGenerations Moderately \nDefensive \n1.00 \n0.08 \n0.10 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", - "---Example 1 End---", - "For this case: ", - "a. The table header is with disorder issue during PDF contents extraction issue.", - "b. The fund name is before the data row, e.g. Generations Defensive", - "c. 
The data points numbers order in data row, for example: \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \n is correct as initial table structure.", - "The 1st number: 0.90 is the management_fee,", - "the 2nd number: 0.26 is the recoverable_expenses,", - "the 3rd number: 0.12 is the indirect_costs", - "the 4th number: 0.00 is the performance_fee,", - "the 5th number: 0.06 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.17 is the transaction costs, please ignore this number.", - "the 7th number: 0.09 is the buy_spread, ", - "the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.90 + 0.26 + 0.12= 1.28", - "The output should be: ", - "{\"data\": [{\"fund name\": \"Generations Defensive\", \"share name\": \"Generations Defensive\", \"management_fee_and_costs\": 1.28, \"management_fee\": 0.9, \"recoverable_expenses\": 0.26, \"indirect_costs\": 0.12, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.06, \"buy_spread\": 0.09, \"sell_spread\": 0.08}, {\"fund name\": \"Generations Moderately Defensive\", \"share name\": \"Generations Moderately Defensive\", \"management_fee_and_costs\": 1.18, \"management_fee\": 1, \"recoverable_expenses\": 0.08, \"indirect_costs\": 0.1,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" - ] - }, - { - "keywords": "Management \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa)", + "keywords": "Recoverable expenses \nEstimated other indirect costs", "prompts": ["Complex management fee and costs rule:", "If the table with columns:", "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", "The management_fee is \"Management fee (% pa)\".", "The management_fee_costs 
is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", - "The indirect_costs is \"Estimated other indirect costs\"", "The recoverable_expenses is \"Recoverable expenses\"", + "The indirect_costs is \"Estimated other indirect costs\"", "The performance_fee is \"Peformance fees charged to the Investment Option by underlying managers\".", "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", "The buy_spread and sell_spread are \"Buy/sell spreads\".", "---Example 1 Start---", - "Investment Option \nManagement \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa) \nEstimated performance fees (ii) \n(% pa) \nTransaction \ncosts (% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses (iii) \nEstimated \nother \nindirect costs \nPerformance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \nPerformance fees \ncharged by \ninterposed \nvehicles \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n", + "Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n", "---Example 1 End---", "For this case: ", - "a. The table header is with secondary-level header.", - "b. 
The fund name is before the data row, e.g. North Active Defensive", - "c. The data points numbers order in data row, for example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is correct as initial table structure.", + "a. The fund name is before the data row, e.g. North Active Defensive", + "c. The data points numbers in data row. ", + "For example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is with correct order as initial table structure.", "The 1st number: 0.62 is the management_fee,", "the 2nd number: 0.18 is the recoverable_expenses,", "the 3rd number: 0.05 is the indirect_costs", "the 4th number: 0.00 is the performance_fee,", "the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.14 is the transaction costs, please ignore this number.", + "the 6th number: 0.14 is the Transaction costs (% pa).", "the 7th number: 0.08 is the buy_spread, ", "the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.62 + 0.18 + 0.05= 0.85", + "The management_fee_and_costs is Management fee (i) + Recoverable expenses + Estimated other indirect costs = 0.62 + 0.18 + 0.05= 0.85", + "**Attention: Ignore Transaction costs (% pa), the 6th number, DO NOT APPLY ITS VALUE TO CALCULATE management_fee_and_costs!!!**", "The output should be: ", - "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, 
\"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}" + "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, 
\"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}", + "---Example 2 Start---", + "Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n", + "---Example 2 End---", + "For this case: ", + "a. This table header is same as Example 1.", + "b. The algorithm to calculate management_fee_and_costs is same as Example 1.", + "c. The difference is **the fund name is after the data row, e.g. the fund name of the first data row is: MyNorth Australian Fixed Interest Index**", + "The output should be: ", + "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" ] } ] diff --git a/main.py b/main.py index e28c426..64fe6e0 100644 --- a/main.py +++ b/main.py @@ -1506,7 +1506,7 @@ if __name__ == "__main__": re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True doc_source = 
"aus_prospectus" # doc_source = "emea_ar" if doc_source == "aus_prospectus": @@ -1525,7 +1525,8 @@ if __name__ == "__main__": # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list: list = ["539261734"] + # special_doc_id_list: list = ["410899007", "539266880", "539266817", + # "539261734", "539266893"] # special_doc_id_list: list = ["401212184"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" diff --git a/utils/biz_utils.py b/utils/biz_utils.py index ade84a3..8485bc2 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1033,4 +1033,84 @@ def remove_abundant_data_detail(data_detail_list: list, for remove_data in remove_list: if remove_data in data_detail_list: data_detail_list.remove(remove_data) - return data_detail_list \ No newline at end of file + return data_detail_list + + +def replace_special_table_header(page_text: str): + """ + For some special table header, replace to the standard header + e.g. 
+ raw header 1: + Investment Option \n + Management \nfee (i) \n(% pa) \n + Indirect costs (i) \n(% pa) \n + Estimated performance fees (ii) \n(% pa) \n + Transaction \ncosts (% pa) \n + Buy/sell \nspreads (%) \n + Recoverable \nexpenses (iii) \n + Estimated \nother \nindirect costs \n + Performance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \n + Performance fees \ncharged by \ninterposed \nvehicles \n + + raw header 2: + Fund \n + Management \nfee 1 \n(% pa) \n + Indirect costs1\n(% pa)\n + Estimated performance fees2\n(% pa)\n + Transaction \ncosts \n(% pa) \n + Buy/sell \nspreads (%) \n + Recoverable \nexpenses 3 \n + Estimated \nother indirect \ncosts \n + Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n + Performance \nfees charged \nby interposed \nvehicles \n + + There are 2 layers of headers, the first layer is the main header, the second layer is the sub header + The purpose is to merge the sub header to the main header + Indirect costs (i) \n(% pa) replace to Recoverable expenses\nEstimated other indirect costs + Estimated performance fees2\n(% pa) replace to Performance fees charged to the Fund by underlying managers\nPerformance fees charged by interposed vehicles + + Remove the second layer header. + e.g. 
+ Recoverable \nexpenses (iii) \n + Estimated \nother \nindirect costs \n + Performance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \n + Performance fees \ncharged by \ninterposed \nvehicles \n + + or + + Recoverable \nexpenses 3 \n + Estimated \nother indirect \ncosts \n + Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n + Performance \nfees charged \nby interposed \nvehicles \n + """ + replace_info_list = [ + { + # item 0: document 410899007 + # item 1: document 539266880, 539266817, 539261734 + # item 2: document 539266893 + "regex_all_list": + [r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n", + r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?interposed\s*vehicles\s*\n", + r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"], + "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" + }, + { + # item 0: document 410899007 + "regex_all_list": + [r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"], + "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles 
\nTransaction costs (% pa) \nBuy/sell spreads (%) \n" + } + ] + updated_text = False + for replace_info in replace_info_list: + for regex_all in replace_info["regex_all_list"]: + if re.search(regex_all, page_text) is not None: + page_text = re.sub(regex_all, replace_info["replace_text"], page_text) + updated_text = True + break + if updated_text: + break + return page_text + + + \ No newline at end of file From 2cd4f5f787e45e932b2d5e294dfc0b09696c7153 Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 7 Mar 2025 15:02:12 -0600 Subject: [PATCH 04/11] Supplement provider information to ground truth data Calculate metrics based on providers Integrate "merge" data algorithm for AUS Prospectus final outputs --- calc_metrics.py | 586 ++++++++++++++++++++++++++-------------- core/data_extraction.py | 2 +- core/data_mapping.py | 175 +++++++++++- main.py | 43 ++- prepare_data.py | 288 +++++++++++++++++++- 5 files changed, 854 insertions(+), 240 deletions(-) diff --git a/calc_metrics.py b/calc_metrics.py index 41de9e1..b136fb8 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -11,9 +11,7 @@ import traceback from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score import requests import fitz -from copy import deepcopy from utils.similarity import Similarity -from core.auz_nz.hybrid_solution_script import final_function_to_match def calc_metrics(ground_truth_file: str, prediction_file: str): @@ -891,6 +889,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros output_folder = r"/data/aus_prospectus/output/metrics_data/" os.makedirs(output_folder, exist_ok=True) verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") + if is_for_all: + verify_file_name = f"metrics_{verify_file_name}_all" metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx" output_file = os.path.join(output_folder, metrics_file_name) with pd.ExcelWriter(output_file) as writer: @@ 
# Data points that are always evaluated, in the order they are reported.
_PROVIDER_BASE_DATA_POINTS = (
    "management_fee_and_costs",
    "management_fee",
    "administration_fees",
    "minimum_initial_investment",
    "benchmark_name",
)

# Data points that are evaluated in addition when ``is_for_all`` is True.
_PROVIDER_EXTRA_DATA_POINTS = (
    "performance_fee",
    "interposed_vehicle_performance_fee_cost",
    "buy_spread",
    "sell_spread",
    "total_annual_dollar_based_charges",
)

# Column order of the "metrics_data" sheet.
_PROVIDER_METRICS_COLUMNS = ["provider_id", "provider_name", "item", "f1",
                             "precision", "recall", "accuracy", "support"]

# Column order of the "message_data" sheet.
_PROVIDER_MESSAGE_COLUMNS = ["doc_id", "sec_id", "raw_fund_name", "fund_legal_name",
                             "data_point", "gt_value", "pred_value", "error"]


def _build_provider_metrics(provider_id, provider_name, gt_pred_data: dict, data_points) -> pd.DataFrame:
    """Return the metrics table of one provider: one row per data point plus an "average_score" row."""
    metrics_data = []
    for data_point in data_points:
        gt_list = gt_pred_data[f"gt_{data_point}_list"]
        pred_list = gt_pred_data[f"pred_{data_point}_list"]
        metrics_data.append({"provider_id": provider_id,
                             "provider_name": provider_name,
                             "item": data_point,
                             "precision": precision_score(gt_list, pred_list),
                             "recall": recall_score(gt_list, pred_list),
                             "f1": f1_score(gt_list, pred_list),
                             "accuracy": accuracy_score(gt_list, pred_list),
                             # Support is taken from this data point's own ground-truth list,
                             # so buy_spread and sell_spread can no longer be mixed up.
                             "support": sum(gt_list)})

    # The summary row averages the scores over the data points and sums the supports.
    metrics_data_df = pd.DataFrame(metrics_data)
    metrics_data.append({"provider_id": provider_id,
                         "provider_name": provider_name,
                         "item": "average_score",
                         "precision": metrics_data_df["precision"].mean(),
                         "recall": metrics_data_df["recall"].mean(),
                         "f1": metrics_data_df["f1"].mean(),
                         "accuracy": metrics_data_df["accuracy"].mean(),
                         "support": metrics_data_df["support"].sum()})
    return pd.DataFrame(metrics_data)[_PROVIDER_METRICS_COLUMNS]


def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx",
                                  audit_data_sheet: str = "Sheet1",
                                  verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_17_documents_by_text_20250303171140.xlsx",
                                  verify_data_sheet: str = "total_data",
                                  verify_document_list_file: str = None,
                                  is_for_all: bool = False
                                  ):
    """
    Calculate precision / recall / F1 / accuracy / support per provider and data point.

    The audited (ground truth) rows and the extracted (verify) rows are joined on
    document id and sec_id; the first verify row of a sec_id is compared with each
    audit row of that sec_id. Results are written to an Excel file with the sheets
    "metrics_data" and "message_data" under /data/aus_prospectus/output/metrics_data/.

    Args:
        audit_file_path: Excel file with the audited ground truth data.
        audit_data_sheet: Sheet of the audit file to read.
        verify_file_path: Excel file with the extracted data to verify.
        verify_data_sheet: Sheet of the verify file to read.
        verify_document_list_file: Optional text file with one document id per line;
            when given and not empty, only these documents are evaluated.
        is_for_all: When True, the performance fee, interposed vehicle performance
            fee cost, buy/sell spread and total annual dollar based charges are
            evaluated in addition to the five base data points.

    Returns:
        None. The function returns early, without writing a file, when there is
        nothing to compare.
    """
    print("Start to calculate metrics based on DB data file and extracted file...")
    data_points = _PROVIDER_BASE_DATA_POINTS
    if is_for_all:
        data_points = data_points + _PROVIDER_EXTRA_DATA_POINTS

    # The audit file always has to provide every data point column, even the ones
    # that are only evaluated when is_for_all is True (same as the verify file).
    audit_fields = ["DocumentId",
                    "provider_id",
                    "provider_name",
                    "FundLegalName",
                    "FundId",
                    "FundClassLegalName",
                    "FundClassId",
                    *_PROVIDER_BASE_DATA_POINTS,
                    *_PROVIDER_EXTRA_DATA_POINTS]
    audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet)
    audit_data_df = audit_data_df[audit_fields]
    audit_data_df = audit_data_df.drop_duplicates()
    audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id",
                                                  "FundLegalName": "fund_name",
                                                  "FundId": "fund_id",
                                                  "FundClassLegalName": "sec_name",
                                                  "FundClassId": "sec_id"})
    audit_data_df.fillna("", inplace=True)
    audit_data_df.reset_index(drop=True, inplace=True)

    verify_fields = ["DocumentId",
                     "raw_fund_name",
                     "fund_id",
                     "fund_name",
                     "raw_share_name",
                     "sec_id",
                     "sec_name",
                     *_PROVIDER_BASE_DATA_POINTS,
                     *_PROVIDER_EXTRA_DATA_POINTS]
    verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet)
    verify_data_df = verify_data_df[verify_fields]
    verify_data_df = verify_data_df.drop_duplicates()
    verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"})
    verify_data_df.fillna("", inplace=True)
    verify_data_df.reset_index(drop=True, inplace=True)

    if len(audit_data_df) == 0 or len(verify_data_df) == 0:
        print("No data to calculate metrics.")
        return

    # Optionally restrict the evaluation to the documents listed in the file.
    if verify_document_list_file is not None:
        with open(verify_document_list_file, "r", encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline) are ignored instead of crashing int().
            verify_document_list = [int(line.strip()) for line in f if line.strip()]
        if len(verify_document_list) > 0:
            verify_data_df = verify_data_df[verify_data_df["doc_id"].isin(verify_document_list)]
    document_id_list = verify_data_df["doc_id"].unique().tolist()

    print(f"Total document count: {len(document_id_list)}")
    print("Construct ground truth and prediction data...")
    message_list = []
    provider_gt_pred_data = {}
    for document_id in document_id_list:
        doc_audit_data = audit_data_df[audit_data_df["doc_id"] == document_id]
        if len(doc_audit_data) == 0:
            # Without audit rows there is neither a provider nor a ground truth;
            # .iloc[0] below would raise IndexError.
            print(f"No audit data for document: {document_id}, skip it.")
            continue
        provider_id = doc_audit_data["provider_id"].iloc[0]
        provider_name = doc_audit_data["provider_name"].iloc[0]
        if provider_id not in provider_gt_pred_data:
            gt_pred_data = {"provider_name": provider_name}
            for data_point in data_points:
                gt_pred_data[f"gt_{data_point}_list"] = []
                gt_pred_data[f"pred_{data_point}_list"] = []
            provider_gt_pred_data[provider_id] = gt_pred_data
        gt_pred_data = provider_gt_pred_data[provider_id]

        audit_sec_id_list = [doc_sec_id for doc_sec_id
                             in doc_audit_data["sec_id"].unique().tolist()
                             if len(doc_sec_id) > 0]
        # Verify rows of this document whose sec_id also occurs in the audit data.
        doc_verify_data = verify_data_df[(verify_data_df["doc_id"] == document_id)
                                         & (verify_data_df["sec_id"].isin(audit_sec_id_list))]
        for _, row in doc_audit_data.iterrows():
            fund_name = row["fund_name"]
            sec_id = row["sec_id"]
            # Compare against the first verify row with the same sec_id.
            doc_verify_sec_data = doc_verify_data[doc_verify_data["sec_id"] == sec_id]
            if len(doc_verify_sec_data) == 0:
                continue
            doc_verify_sec_row = doc_verify_sec_data.iloc[0]
            raw_fund_name = doc_verify_sec_row["raw_fund_name"]

            for data_point in data_points:
                # NOTE(review): get_gt_pred_by_compare_values presumably appends the
                # comparison outcome to the two lists passed in - confirm in its definition.
                message = get_gt_pred_by_compare_values(str(row[data_point]),
                                                        str(doc_verify_sec_row[data_point]),
                                                        gt_pred_data[f"gt_{data_point}_list"],
                                                        gt_pred_data[f"pred_{data_point}_list"],
                                                        data_point=data_point)
                message_list.append(generate_message(message, document_id, sec_id,
                                                     fund_name, raw_fund_name, data_point))

    if len(provider_gt_pred_data) == 0:
        # pd.concat([]) would raise ValueError; there is nothing to report.
        print("No data to calculate metrics.")
        return

    if len(message_list) > 0:
        message_data_df = pd.DataFrame(message_list)[_PROVIDER_MESSAGE_COLUMNS]
    else:
        # Keep the sheet layout even if no security could be matched.
        message_data_df = pd.DataFrame(columns=_PROVIDER_MESSAGE_COLUMNS)
    # order by doc_id, raw_fund_name, data_point
    message_data_df = message_data_df.sort_values(by=["doc_id", "raw_fund_name", "data_point"])
    message_data_df.reset_index(drop=True, inplace=True)

    # calculate metrics
    print("Calculate metrics...")
    provider_metrics_list = [
        _build_provider_metrics(provider_id, gt_pred_data["provider_name"], gt_pred_data, data_points)
        for provider_id, gt_pred_data in provider_gt_pred_data.items()
    ]
    all_provider_metrics_df = pd.concat(provider_metrics_list)
    all_provider_metrics_df.reset_index(drop=True, inplace=True)

    # output metrics data to Excel file
    print("Output metrics data to Excel file...")
    output_folder = r"/data/aus_prospectus/output/metrics_data/"
    os.makedirs(output_folder, exist_ok=True)
    verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
    if is_for_all:
        verify_file_name = f"{verify_file_name}_all"
    metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_for_providers.xlsx"
    output_file = os.path.join(output_folder, metrics_file_name)
    with pd.ExcelWriter(output_file) as writer:
        all_provider_metrics_df.to_excel(writer, index=False, sheet_name="metrics_data")
        message_data_df.to_excel(writer, index=False, sheet_name="message_data")
message["fund_legal_name"] = fund_legal_name @@ -954,203 +1317,6 @@ def clean_text(text: str): text = re.sub(r"\W", " ", text) text = re.sub(r"\s+", " ", text) return text - - -def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx", - data_sheet: str = "Sheet1", - raw_name_column: str = "raw_share_name", - mapping_file_path: str = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx", - mapping_sheet: str = "document_mapping", - raw_name_mapping_column: str = None, - output_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"): - data_df = pd.read_excel(data_file_path, sheet_name=data_sheet) - data_df["fund_id"] = "" - data_df["fund_name"] = "" - data_df["sec_id"] = "" - data_df["sec_name"] = "" - - mapping_data = pd.read_excel(mapping_file_path, sheet_name=mapping_sheet) - - doc_id_list = data_df["doc_id"].unique().tolist() - for doc_id in doc_id_list: - doc_data = data_df[data_df["doc_id"] == doc_id] - raw_name_list = doc_data[raw_name_column].unique().tolist() - - doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id] - if len(doc_mapping_data) == 0: - continue - provider_name = doc_mapping_data["CompanyName"].values[0] - if raw_name_mapping_column is not None and raw_name_mapping_column == "FundLegalName": - doc_db_name_list = doc_mapping_data[raw_name_mapping_column].unique().tolist() - for raw_name in raw_name_list: - find_df = doc_mapping_data[doc_mapping_data[raw_name_mapping_column] == raw_name] - if find_df is not None and len(find_df) == 1: - sec_id = find_df["FundClassId"].values[0] - sec_name = find_df["FundClassLegalName"].values[0] - fund_id = find_df["FundId"].values[0] - fund_name = find_df["FundLegalName"].values[0] - # update doc_data which raw_share_name is same as raw_share_name - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == 
raw_name), "sec_id"] = sec_id - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "sec_name"] = sec_name - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_id"] = fund_id - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_name"] = fund_name - else: - doc_db_name_list = doc_mapping_data["FundClassLegalName"].unique().tolist() - all_match_result = get_raw_name_db_match_result(doc_id, - provider_name, - raw_name_list, - doc_db_name_list, - iter_count=60) - for raw_share_name in raw_name_list: - if all_match_result.get(raw_share_name) is not None: - matched_db_share_name = all_match_result[raw_share_name] - if ( - matched_db_share_name is not None - and len(matched_db_share_name) > 0 - ): - # get SecId from self.doc_fund_class_mapping - find_share_df = doc_mapping_data[doc_mapping_data["FundClassLegalName"] == matched_db_share_name] - if find_share_df is not None and len(find_share_df) > 0: - sec_id = find_share_df["FundClassId"].values[0] - fund_id = find_share_df["FundId"].values[0] - fund_name = find_share_df["FundLegalName"].values[0] - # update doc_data which raw_share_name is same as raw_share_name - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "sec_id"] = sec_id - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "sec_name"] = matched_db_share_name - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_id"] = fund_id - data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_name"] = fund_name - try: - data_df = data_df[["doc_id", - "raw_fund_name", - "fund_id", - "fund_name", - "raw_share_name", - "sec_id", - "sec_name", - "management_fee_and_costs", - "management_fee", - "administration_fees", - "minimum_initial_investment", - "benchmark_name", - "performance_fee", - 
"performance_fee_charged", - "buy_spread", - "sell_spread", - "total_annual_dollar_based_charges", - "interposed_vehicle_performance_fee_cost", - "establishment_fee", - "contribution_fee", - "withdrawal_fee", - "exit_fee", - "switching_fee", - "activity_fee", - "hurdle_rate", - "analyst_name" - ]] - except Exception as e: - print(e) - - with open(output_file_path, "wb") as file: - data_df.to_excel(file, index=False) - - -def get_raw_name_db_match_result( - doc_id: str, provider_name: str, raw_name_list: list, doc_share_name_list: list, iter_count: int = 30 - ): - # split raw_name_list into several parts which each part is with 30 elements - # The reason to split is to avoid invoke token limitation issues from CahtGPT - raw_name_list_parts = [ - raw_name_list[i : i + iter_count] - for i in range(0, len(raw_name_list), iter_count) - ] - all_match_result = {} - doc_share_name_list = deepcopy(doc_share_name_list) - for raw_name_list in raw_name_list_parts: - match_result, doc_share_name_list = get_final_function_to_match( - doc_id, provider_name, raw_name_list, doc_share_name_list - ) - all_match_result.update(match_result) - return all_match_result - -def get_final_function_to_match(doc_id, provider_name, raw_name_list, db_name_list): - if len(db_name_list) == 0: - match_result = {} - for raw_name in raw_name_list: - match_result[raw_name] = "" - else: - match_result = final_function_to_match( - doc_id=doc_id, - pred_list=raw_name_list, - db_list=db_name_list, - provider_name=provider_name, - doc_source="aus_prospectus" - ) - matched_name_list = list(match_result.values()) - db_name_list = remove_matched_names(db_name_list, matched_name_list) - return match_result, db_name_list - -def remove_matched_names(target_name_list: list, matched_name_list: list): - if len(matched_name_list) == 0: - return target_name_list - - matched_name_list = list(set(matched_name_list)) - matched_name_list = [ - value for value in matched_name_list if value is not None and len(value) > 0 - 
] - for matched_name in matched_name_list: - if ( - matched_name is not None - and len(matched_name) > 0 - and matched_name in target_name_list - ): - target_name_list.remove(matched_name) - return target_name_list - - -def set_mapping_to_ravi_data(): - data_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx" - data_sheet = "Sheet1" - mapping_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" - mapping_sheet = "document_mapping" - output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" - set_mapping_to_raw_name_data(data_file_path, data_sheet, mapping_file_path, mapping_sheet, output_file_path) - - -def set_mapping_to_data_side_documents_data(): - # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx" - # data_sheet = "all" - # mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" - # mapping_sheet = "document_mapping" - # output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx" - - data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx" - data_sheet = "ground_truth" - raw_name_column = "raw_share_name" - mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - mapping_sheet = "document_mapping" - raw_name_mapping_column = None - output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" - set_mapping_to_raw_name_data(data_file_path=data_file_path, - data_sheet=data_sheet, - raw_name_column=raw_name_column, - mapping_file_path=mapping_file_path, - mapping_sheet=mapping_sheet, - raw_name_mapping_column=raw_name_mapping_column, - output_file_path=output_file_path) - - -def 
adjust_data_file(source_file: str, - targe_file: str): - source_data = pd.read_excel(source_file, sheet_name="Sheet1") - source_doc_id_list = source_data["DocumentId"].unique().tolist() - - target_data = pd.read_excel(targe_file, sheet_name="Sheet1") - #remove target_data which doc_id is in source_doc_id_list - target_data = target_data[~target_data["DocumentId"].isin(source_doc_id_list)] - # concat source_data and target_data - target_data = pd.concat([source_data, target_data], ignore_index=True) - with open(targe_file, "wb") as file: - target_data.to_excel(file, index=False) if __name__ == "__main__": @@ -1172,12 +1338,24 @@ if __name__ == "__main__": verify_data_sheet: str = "total_data" # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"] + is_for_all = False + # for verify_document_list_file in verify_document_list_file_list: + # calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, + # audit_data_sheet=audit_data_sheet, + # verify_file_path=verify_file_path, + # verify_data_sheet=verify_data_sheet, + # verify_document_list_file = verify_document_list_file, + # is_for_all=is_for_all) + for verify_document_list_file in verify_document_list_file_list: - calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, - audit_data_sheet=audit_data_sheet, - verify_file_path=verify_file_path, - verify_data_sheet=verify_data_sheet, - verify_document_list_file = verify_document_list_file) + calculate_metrics_by_provider(audit_file_path=audit_file_path, + audit_data_sheet=audit_data_sheet, + verify_file_path=verify_file_path, + verify_data_sheet=verify_data_sheet, + verify_document_list_file = verify_document_list_file, + is_for_all=is_for_all) + + # set_mapping_to_17_documents_data() # set_mapping_to_ravi_data() diff --git 
a/core/data_extraction.py b/core/data_extraction.py index 715546b..1998d61 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -576,7 +576,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 24: + # if page_num != 21: # continue if page_num in handled_page_num_list: continue diff --git a/core/data_mapping.py b/core/data_mapping.py index 8578c1c..e50798e 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -228,7 +228,180 @@ class DataMapping: mapped_data["similarity"] = 1 self.output_mapping_file(mapped_data_list) - return mapped_data_list + + if self.doc_source == "aus_prospectus": + output_data_folder_splits = self.output_data_excel_folder.split("output") + if len(output_data_folder_splits) == 2: + merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/' + os.makedirs(merged_data_folder, exist_ok=True) + + merged_data_json_folder = os.path.join(merged_data_folder, "json/") + os.makedirs(merged_data_json_folder, exist_ok=True) + + merged_data_excel_folder = os.path.join(merged_data_folder, "excel/") + os.makedirs(merged_data_excel_folder, exist_ok=True) + merged_data_list = self.merge_output_data_aus_prospectus(mapped_data_list, + merged_data_json_folder, + merged_data_excel_folder) + return merged_data_list + else: + return mapped_data_list + + def merge_output_data_aus_prospectus(self, + mapped_data_list: list, + merged_data_json_folder: str, + merged_data_excel_folder: str): + # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16 + if mapped_data_list is None or len(mapped_data_list) == 0: + return + if merged_data_json_folder is None or len(merged_data_json_folder) == 0: + return + if merged_data_excel_folder is None or len(merged_data_excel_folder) == 0: + return + mapping_data_df = pd.DataFrame(mapped_data_list) + mapping_data_df.reset_index(drop=True, inplace=True) + 
mapping_data_df.fillna("", inplace=True) + + document_mapping_df = self.document_mapping_info_df + document_mapping_df.fillna("", inplace=True) + + datapoint_keyword_config_file = ( + f"./configuration/{self.doc_source}/datapoint_name.json" + ) + with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f: + datapoint_keyword_config = json.load(f) + datapoint_name_list = list(datapoint_keyword_config.keys()) + total_data_list = [] + + doc_date = str(document_mapping_df["EffectiveDate"].values[0])[0:10] + share_doc_data_df = mapping_data_df[(mapping_data_df["investment_type"] == 1)] + exist_raw_name_list = [] + for index, row in share_doc_data_df.iterrows(): + doc_id = str(row["doc_id"]) + page_index = int(row["page_index"]) + raw_fund_name = str(row["raw_fund_name"]) + raw_share_name = str(row["raw_share_name"]) + raw_name = str(row["raw_name"]) + datapoint = str(row["datapoint"]) + value = row["value"] + investment_type = row["investment_type"] + share_class_id = row["investment_id"] + share_class_legal_name = row["investment_name"] + fund_id = "" + fund_legal_name = "" + if share_class_id != "": + record_row = document_mapping_df[document_mapping_df["SecId"] == share_class_id] + if len(record_row) > 0: + fund_id = record_row["FundId"].values[0] + fund_legal_name = record_row["FundName"].values[0] + + exist = False + for exist_raw_name_info in exist_raw_name_list: + exist_raw_name = exist_raw_name_info["raw_name"] + exist_investment_type = exist_raw_name_info["investment_type"] + exist_investment_id = exist_raw_name_info["investment_id"] + if ( + exist_raw_name == raw_name + and exist_investment_type == investment_type + ) or (len(exist_investment_id) > 0 and exist_investment_id == share_class_id): + exist = True + break + if not exist: + data = { + "DocumentId": doc_id, + "raw_fund_name": raw_fund_name, + "raw_share_name": raw_share_name, + "raw_name": raw_name, + "fund_id": fund_id, + "fund_name": fund_legal_name, + "sec_id": share_class_id, + 
"sec_name": share_class_legal_name, + "EffectiveDate": doc_date, + "page_index": [], + "RawName": raw_name, + } + for datapoint_name in datapoint_name_list: + data[datapoint_name] = "" + exist_raw_name_list.append( + {"raw_name": raw_name, "investment_type": investment_type, "investment_id": share_class_id} + ) + total_data_list.append(data) + # find data from total_data_list by raw_name + for data in total_data_list: + if data["raw_name"] == raw_name: + update_key = datapoint + data[update_key] = value + if page_index not in data["page_index"]: + data["page_index"].append(page_index) + break + if len(share_class_id) > 0 and data["sec_id"] == share_class_id: + update_key = datapoint + if len(str(data[update_key])) == 0: + data[update_key] = value + if page_index not in data["page_index"]: + data["page_index"].append(page_index) + break + + fund_doc_data_df = mapping_data_df[(mapping_data_df["investment_type"] == 33)] + fund_doc_data_df.fillna("", inplace=True) + for index, row in fund_doc_data_df.iterrows(): + doc_id = str(row["doc_id"]) + page_index = int(row["page_index"]) + raw_fund_name = str(row["raw_fund_name"]) + raw_share_name = "" + raw_name = str(row["raw_name"]) + datapoint = str(row["datapoint"]) + value = row["value"] + fund_id = row["investment_id"] + fund_legal_name = row["investment_name"] + exist = False + if fund_id != "": + for data in total_data_list: + if (fund_id != "" and data["fund_id"] == fund_id) or ( + data["raw_fund_name"] == raw_fund_name + ): + update_key = datapoint + data[update_key] = value + if page_index not in data["page_index"]: + data["page_index"].append(page_index) + exist = True + else: + for data in total_data_list: + if data["raw_name"] == raw_name: + update_key = datapoint + data[update_key] = value + if page_index not in data["page_index"]: + data["page_index"].append(page_index) + exist = True + if not exist: + data = { + "DocumentId": doc_id, + "raw_fund_name": raw_fund_name, + "raw_share_name": "", + "raw_name": 
raw_name, + "fund_id": fund_id, + "fund_name": fund_legal_name, + "sec_id": "", + "sec_name": "", + "EffectiveDate": doc_date, + "page_index": [page_index], + "RawName": raw_name, + } + for datapoint_name in datapoint_name_list: + data[datapoint_name] = "" + data[datapoint] = value + total_data_list.append(data) + total_data_df = pd.DataFrame(total_data_list) + total_data_df.fillna("", inplace=True) + + merged_data_excel_file = os.path.join(merged_data_excel_folder, f"merged_{self.doc_id}.xlsx") + with pd.ExcelWriter(merged_data_excel_file) as writer: + total_data_df.to_excel(writer, index=False, sheet_name="merged_data") + + merged_data_json_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json") + with open(merged_data_json_file, "w", encoding="utf-8") as f: + json.dump(total_data_list, f, ensure_ascii=False, indent=4) + return total_data_list def get_raw_name_db_match_result( self, raw_name_list, investment_type: str, iter_count: int = 30 diff --git a/main.py b/main.py index 64fe6e0..2a145aa 100644 --- a/main.py +++ b/main.py @@ -499,7 +499,17 @@ def batch_start_job( ) logger.info(f"Saving mapping data to {output_mapping_total_folder}") - unique_doc_ids = result_mappingdata_df["doc_id"].unique().tolist() + result_mappingdata_df_columns = list(result_mappingdata_df.columns) + doc_id_column = "" + if "doc_id" in result_mappingdata_df_columns: + doc_id_column = "doc_id" + if "DocumentId" in result_mappingdata_df_columns: + doc_id_column = "DocumentId" + + if doc_id_column == "": + logger.error(f"Cannot find doc_id column in mapping data") + return + unique_doc_ids = result_mappingdata_df[doc_id_column].unique().tolist() os.makedirs(output_mapping_total_folder, exist_ok=True) time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime()) file_name = f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx" @@ -507,11 +517,11 @@ def batch_start_job( file_name = f"{total_data_prefix}_{file_name}" output_file = 
os.path.join(output_mapping_total_folder, file_name) - doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df) + # doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df) with pd.ExcelWriter(output_file) as writer: - doc_mapping_data_in_db.to_excel( - writer, index=False, sheet_name="data_in_doc_mapping" - ) + # doc_mapping_data_in_db.to_excel( + # writer, index=False, sheet_name="data_in_doc_mapping" + # ) result_mappingdata_df.to_excel( writer, index=False, sheet_name="total_mapping_data" ) @@ -519,27 +529,6 @@ def batch_start_job( writer, index=False, sheet_name="extract_data" ) - if ( - doc_source == "aus_prospectus" - and document_mapping_file is not None - and len(document_mapping_file) > 0 - and os.path.exists(document_mapping_file) - ): - try: - merged_total_data_folder = os.path.join( - output_mapping_total_folder, "merged/" - ) - os.makedirs(merged_total_data_folder, exist_ok=True) - data_file_base_name = os.path.basename(output_file) - output_merged_data_file_path = os.path.join( - merged_total_data_folder, "merged_" + data_file_base_name - ) - merge_output_data_aus_prospectus( - output_file, document_mapping_file, output_merged_data_file_path - ) - except Exception as e: - logger.error(f"Error: {e}") - if calculate_metrics: prediction_sheet_name = "data_in_doc_mapping" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" @@ -1527,7 +1516,7 @@ if __name__ == "__main__": document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" # special_doc_id_list: list = ["410899007", "539266880", "539266817", # "539261734", "539266893"] - # special_doc_id_list: list = ["401212184"] + # special_doc_id_list: list = ["539266880"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/prepare_data.py 
b/prepare_data.py index cf4de36..72b9b1b 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -8,10 +8,12 @@ import re import time import traceback import json_repair +from copy import deepcopy from utils.logger import logger from utils.pdf_download import download_pdf_from_documents_warehouse from utils.pdf_util import PDFUtil +from core.auz_nz.hybrid_solution_script import final_function_to_match def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str): @@ -1463,18 +1465,290 @@ def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_pr with open(output_sample_document_file, "w") as f: for doc_id in document_id_list: f.write(f"{doc_id}\n") + + +def set_mapping_to_ravi_data(): + data_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx" + data_sheet = "Sheet1" + mapping_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" + mapping_sheet = "document_mapping" + output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" + set_mapping_to_raw_name_data(data_file_path, data_sheet, mapping_file_path, mapping_sheet, output_file_path) + + +def set_mapping_to_data_side_documents_data(): + # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx" + # data_sheet = "all" + # mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" + # mapping_sheet = "document_mapping" + # output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx" + + data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx" + data_sheet = "ground_truth" + raw_name_column = "raw_share_name" + mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" + mapping_sheet = 
"document_mapping" + raw_name_mapping_column = None + output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" + set_mapping_to_raw_name_data(data_file_path=data_file_path, + data_sheet=data_sheet, + raw_name_column=raw_name_column, + mapping_file_path=mapping_file_path, + mapping_sheet=mapping_sheet, + raw_name_mapping_column=raw_name_mapping_column, + output_file_path=output_file_path) + + +def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx", + data_sheet: str = "Sheet1", + raw_name_column: str = "raw_share_name", + mapping_file_path: str = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx", + mapping_sheet: str = "document_mapping", + raw_name_mapping_column: str = None, + output_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"): + data_df = pd.read_excel(data_file_path, sheet_name=data_sheet) + data_df["provider_id"] = "" + data_df["provider_name"] = "" + data_df["fund_id"] = "" + data_df["fund_name"] = "" + data_df["sec_id"] = "" + data_df["sec_name"] = "" + + mapping_data = pd.read_excel(mapping_file_path, sheet_name=mapping_sheet) + + doc_id_list = data_df["doc_id"].unique().tolist() + for doc_id in doc_id_list: + doc_data = data_df[data_df["doc_id"] == doc_id] + raw_name_list = doc_data[raw_name_column].unique().tolist() + + doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id] + if len(doc_mapping_data) == 0: + continue + provider_id = doc_mapping_data["CompanyId"].values[0] + provider_name = doc_mapping_data["CompanyName"].values[0] + data_df.loc[(data_df["doc_id"] == doc_id), "provider_id"] = provider_id + data_df.loc[(data_df["doc_id"] == doc_id), "provider_name"] = provider_name + if raw_name_mapping_column is not None and raw_name_mapping_column == "FundLegalName": + doc_db_name_list = 
doc_mapping_data[raw_name_mapping_column].unique().tolist() + for raw_name in raw_name_list: + find_df = doc_mapping_data[doc_mapping_data[raw_name_mapping_column] == raw_name] + if find_df is not None and len(find_df) == 1: + sec_id = find_df["FundClassId"].values[0] + sec_name = find_df["FundClassLegalName"].values[0] + fund_id = find_df["FundId"].values[0] + fund_name = find_df["FundLegalName"].values[0] + # update doc_data which raw_share_name is same as raw_share_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "sec_id"] = sec_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "sec_name"] = sec_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_id"] = fund_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_name"] = fund_name + else: + doc_db_name_list = doc_mapping_data["FundClassLegalName"].unique().tolist() + all_match_result = get_raw_name_db_match_result(doc_id, + provider_name, + raw_name_list, + doc_db_name_list, + iter_count=60) + for raw_share_name in raw_name_list: + if all_match_result.get(raw_share_name) is not None: + matched_db_share_name = all_match_result[raw_share_name] + if ( + matched_db_share_name is not None + and len(matched_db_share_name) > 0 + ): + # get SecId from self.doc_fund_class_mapping + find_share_df = doc_mapping_data[doc_mapping_data["FundClassLegalName"] == matched_db_share_name] + if find_share_df is not None and len(find_share_df) > 0: + sec_id = find_share_df["FundClassId"].values[0] + fund_id = find_share_df["FundId"].values[0] + fund_name = find_share_df["FundLegalName"].values[0] + # update doc_data which raw_share_name is same as raw_share_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "sec_id"] = sec_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), 
"sec_name"] = matched_db_share_name + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_id"] = fund_id + data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_name"] = fund_name + try: + data_df = data_df[["doc_id", + "provider_id", + "provider_name", + "raw_fund_name", + "fund_id", + "fund_name", + "raw_share_name", + "sec_id", + "sec_name", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee", + "performance_fee_charged", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges", + "interposed_vehicle_performance_fee_cost", + "establishment_fee", + "contribution_fee", + "withdrawal_fee", + "exit_fee", + "switching_fee", + "activity_fee", + "hurdle_rate", + "analyst_name" + ]] + except Exception as e: + print(e) + + with open(output_file_path, "wb") as file: + data_df.to_excel(file, index=False) + + +def get_raw_name_db_match_result( + doc_id: str, provider_name: str, raw_name_list: list, doc_share_name_list: list, iter_count: int = 30 + ): + # split raw_name_list into several parts which each part is with 30 elements + # The reason to split is to avoid invoke token limitation issues from CahtGPT + raw_name_list_parts = [ + raw_name_list[i : i + iter_count] + for i in range(0, len(raw_name_list), iter_count) + ] + all_match_result = {} + doc_share_name_list = deepcopy(doc_share_name_list) + for raw_name_list in raw_name_list_parts: + match_result, doc_share_name_list = get_final_function_to_match( + doc_id, provider_name, raw_name_list, doc_share_name_list + ) + all_match_result.update(match_result) + return all_match_result + +def get_final_function_to_match(doc_id, provider_name, raw_name_list, db_name_list): + if len(db_name_list) == 0: + match_result = {} + for raw_name in raw_name_list: + match_result[raw_name] = "" + else: + match_result = final_function_to_match( + 
doc_id=doc_id, + pred_list=raw_name_list, + db_list=db_name_list, + provider_name=provider_name, + doc_source="aus_prospectus" + ) + matched_name_list = list(match_result.values()) + db_name_list = remove_matched_names(db_name_list, matched_name_list) + return match_result, db_name_list + +def remove_matched_names(target_name_list: list, matched_name_list: list): + if len(matched_name_list) == 0: + return target_name_list + + matched_name_list = list(set(matched_name_list)) + matched_name_list = [ + value for value in matched_name_list if value is not None and len(value) > 0 + ] + for matched_name in matched_name_list: + if ( + matched_name is not None + and len(matched_name) > 0 + and matched_name in target_name_list + ): + target_name_list.remove(matched_name) + return target_name_list + + +def adjust_data_file(source_file: str, + targe_file: str): + source_data = pd.read_excel(source_file, sheet_name="Sheet1") + source_doc_id_list = source_data["DocumentId"].unique().tolist() + + target_data = pd.read_excel(targe_file, sheet_name="Sheet1") + #remove target_data which doc_id is in source_doc_id_list + target_data = target_data[~target_data["DocumentId"].isin(source_doc_id_list)] + # concat source_data and target_data + target_data = pd.concat([source_data, target_data], ignore_index=True) + with open(targe_file, "wb") as file: + target_data.to_excel(file, index=False) + + +def set_provider_to_ground_truth(groud_truth_file: str, + ground_truth_sheet: str, + document_mapping_file: str, + document_mapping_sheet: str): + ground_truth_df = pd.read_excel(groud_truth_file, sheet_name=ground_truth_sheet) + ground_truth_df["provider_id"] = "" + ground_truth_df["provider_name"] = "" + + mapping_data = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet) + + doc_id_list = ground_truth_df["DocumentId"].unique().tolist() + for doc_id in doc_id_list: + doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id] + if len(doc_mapping_data) == 0: + 
continue + provider_id = doc_mapping_data["CompanyId"].values[0] + provider_name = doc_mapping_data["CompanyName"].values[0] + ground_truth_df.loc[(ground_truth_df["DocumentId"] == doc_id), "provider_id"] = provider_id + ground_truth_df.loc[(ground_truth_df["DocumentId"] == doc_id), "provider_name"] = provider_name + try: + ground_truth_df = ground_truth_df[["DocumentId", + "provider_id", + "provider_name", + "raw_fund_name", + "FundId", + "FundLegalName", + "raw_share_name", + "FundClassId", + "FundClassLegalName", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee", + "performance_fee_charged", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges", + "interposed_vehicle_performance_fee_cost", + "establishment_fee", + "contribution_fee", + "withdrawal_fee", + "exit_fee", + "switching_fee", + "activity_fee", + "hurdle_rate", + "analyst_name" + ]] + except Exception as e: + print(e) + + with open(groud_truth_file, "wb") as file: + ground_truth_df.to_excel(file, index=False) - if __name__ == "__main__": + set_provider_to_ground_truth( + groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx", + ground_truth_sheet="Sheet1", + document_mapping_file=r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx", + document_mapping_sheet="document_mapping" + ) + + # set_mapping_to_data_side_documents_data() + + # source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx" + # target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" + # adjust_data_file(source_file=source_file, targe_file=target_file) + # pdf_exist() # prepare_multi_fund_aus_prospectus_document() - 
merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/17_documents/", - aus_document_mapping_file="aus_prospectus_17_documents_mapping.xlsx", - aus_prospectus_data_file="aus_prospectus_data_17_documents_secid.xlsx", - document_mapping_sheet="document_mapping", - output_file="aus_prospectus_17_documents_data.xlsx", - output_sheet="aus_document_prospectus") + # merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/17_documents/", + # aus_document_mapping_file="aus_prospectus_17_documents_mapping.xlsx", + # aus_prospectus_data_file="aus_prospectus_data_17_documents_secid.xlsx", + # document_mapping_sheet="document_mapping", + # output_file="aus_prospectus_17_documents_data.xlsx", + # output_sheet="aus_document_prospectus") folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/" file_name = "doc_ar_data_for_emea_11_06.xlsx" # get_document_with_all_4_data_points(folder, file_name, None) From fa2dede4542da49f48fa6e4a99b4e56597c05f60 Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 7 Mar 2025 18:38:36 -0600 Subject: [PATCH 05/11] optimize for management_fee_and_costs and management_fee --- .../aus_prospectus/datapoint_keyword.json | 4 +- .../aus_prospectus/datapoint_name.json | 8 +-- core/data_extraction.py | 39 ++++++++++---- .../data_extraction_prompts_config.json | 19 +++++-- main.py | 52 +++++++++++++++++-- utils/biz_utils.py | 2 +- 6 files changed, 99 insertions(+), 25 deletions(-) diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index 9e5281d..9026586 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -1,7 +1,7 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": {"english": ["management fees 
and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]}, - "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]}, + "management_fee_and_costs": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]}, + "management_fee": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, diff --git a/configuration/aus_prospectus/datapoint_name.json b/configuration/aus_prospectus/datapoint_name.json index fd91a4a..9b3272f 100644 --- a/configuration/aus_prospectus/datapoint_name.json +++ b/configuration/aus_prospectus/datapoint_name.json @@ -1,14 +1,14 @@ { - "total_annual_dollar_based_charges": "total annual dollar based charges", "management_fee_and_costs": "management fee and costs", "management_fee": "management fee", + "administration_fees": "administration fee", "performance_fee": "performance fee", + "interposed_vehicle_performance_fee_cost": "interposed vehicle 
performance fee cost", "buy_spread": "buy spread", "sell_spread": "sell spread", - "administration_fees": "administration fee", - "interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost", + "total_annual_dollar_based_charges": "total annual dollar based charges", + "minimum_initial_investment": "minimum initial investment", "benchmark_name": "benchmark name", - "minimum_initial_investment": "minimum initial investment", "indirect_costs": "indirect cost", "recoverable_expenses": "recoverable expenses", "change_recoverable_expanses": "change recoverable expanses" diff --git a/core/data_extraction.py b/core/data_extraction.py index 1998d61..1ff1f0d 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -448,8 +448,13 @@ class DataExtraction: """ management_fee_costs_list = [] management_fee_list = [] + complex_rule_keywords = "Recoverable expenses \nEstimated other indirect costs" for data_dict in data_list: extract_data = data_dict.get("extract_data", {}) + exist_complex_rule_keywords = False + page_text = data_dict.get("page_text", "") + if complex_rule_keywords in page_text: + exist_complex_rule_keywords = True data = extract_data.get("data", []) for data_item in data: keys = list(data_item.keys()) @@ -467,11 +472,17 @@ class DataExtraction: if (mf_fund_name == fund_name and mf_share_name == share_name) or \ (len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))): - mf_value = mf.get("management_fee", -1) - if mf_value != -1 and mf_value >= management_fee: - mf["management_fee"] = management_fee - found = True - break + if exist_complex_rule_keywords and \ + ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys): + mfc["management_fee"] = management_fee + found = True + break + else: + mf_value = mf.get("management_fee", -1) + if mf_value != -1 and mf_value >= management_fee: + 
mf["management_fee"] = management_fee + found = True + break if not found: management_fee_list.append({"fund_name": fund_name, "share_name": share_name, @@ -486,11 +497,17 @@ class DataExtraction: if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \ (len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and (mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))): - mfc_value = mfc.get("management_fee_and_costs", -1) - if mfc_value != -1 and mfc_value <= management_fee_costs: - mfc["management_fee_and_costs"] = management_fee_costs - found = True - break + if exist_complex_rule_keywords and \ + ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys): + mfc["management_fee_and_costs"] = management_fee_costs + found = True + break + else: + mfc_value = mfc.get("management_fee_and_costs", -1) + if mfc_value != -1 and mfc_value <= management_fee_costs: + mfc["management_fee_and_costs"] = management_fee_costs + found = True + break if not found: management_fee_costs_list.append({"fund_name": fund_name, "share_name": share_name, @@ -576,7 +593,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 21: + # if page_num != 18: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 064dc36..c4cffce 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -191,7 +191,7 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", 
\"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}", "\n", - "F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00%, please ignore and output empty.", + "F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.", "---Example 1 Start---", "Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n", "---Example 1 End---", @@ -202,6 +202,11 @@ "---Example 2 End---", "The relevant values: 0.07 and 1.00, are in the range, should ignore, so the output should be:", "{\"data\": []}", + "---Example 3 Start---", + "Management fees and costs \n0.67–1.17% p.a. (estimated) \nThe fees and costs for \nmanaging your investment \n", + "---Example 3 End---", + "The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:", + "{\"data\": []}", "\n", "G. 
If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.", "---Example 1 Start---", @@ -224,7 +229,15 @@ "So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee is 0.09.", "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee is 0.11.", "So the output should be:", - "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}" + "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}", + "---Example 4 Start---", + "Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) 
\nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n", + "---Example 4 End---", + "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", + "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", + "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", + "So the output should be:", + "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}" ], "administration_fees":[ "Administration fees and costs is share class level data.", @@ -395,7 +408,7 @@ ] }, { - "keywords": "Recoverable expenses \nEstimated other indirect costs", + "keywords": ["Recoverable expenses \nEstimated other indirect costs"], "prompts": ["Complex management fee and costs rule:", "If the table with columns:", "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", diff --git a/main.py b/main.py index 2a145aa..1646226 100644 --- a/main.py +++ b/main.py @@ -279,7 +279,39 @@ class EMEA_AR_Parsing: ) with open(json_file, "r", encoding="utf-8") as f: doc_mapping_data = json.load(f) - return doc_mapping_data + if self.doc_source == "aus_prospectus": + output_data_folder_splits = 
output_data_json_folder.split("output") + if len(output_data_folder_splits) == 2: + merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/' + os.makedirs(merged_data_folder, exist_ok=True) + + merged_data_json_folder = os.path.join(merged_data_folder, "json/") + os.makedirs(merged_data_json_folder, exist_ok=True) + + merged_data_excel_folder = os.path.join(merged_data_folder, "excel/") + os.makedirs(merged_data_excel_folder, exist_ok=True) + + merged_data_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json") + if os.path.exists(merged_data_file): + with open(merged_data_file, "r", encoding="utf-8") as f: + merged_data_list = json.load(f) + return merged_data_list + else: + data_mapping = DataMapping( + self.doc_id, + self.datapoints, + data_from_gpt, + self.document_mapping_info_df, + self.output_mapping_data_folder, + self.doc_source, + compare_with_provider=self.compare_with_provider + ) + merged_data_list = data_mapping.merge_output_data_aus_prospectus(doc_mapping_data, + merged_data_json_folder, + merged_data_excel_folder) + return merged_data_list + else: + return doc_mapping_data """ doc_id, datapoints: list, @@ -1420,7 +1452,7 @@ def get_aus_prospectus_document_category(): def test_post_adjust_extract_data(): - doc_id = "454036250" + doc_id = "539266814" pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1459,7 +1491,8 @@ def test_post_adjust_extract_data(): with open(data_file_path, "r", encoding="utf-8") as f: data_list = json.load(f) # data_list = data_extraction.remove_duplicate_data(data_list) - data_list = data_extraction.post_adjust_for_value_with_production_name(data_list) + # data_list = data_extraction.post_adjust_for_value_with_production_name(data_list) + data_list = data_extraction.post_supplement_data(data_list) if __name__ == "__main__": @@ -1516,7 +1549,18 @@ if __name__ == "__main__": 
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" # special_doc_id_list: list = ["410899007", "539266880", "539266817", # "539261734", "539266893"] - # special_doc_id_list: list = ["539266880"] + # special_doc_id_list: list = ["530101994", + # "539241700", + # "539261734", + # "539266814", + # "539266817", + # "539266874", + # "539266880", + # "539266893", + # "544886057", + # "550769189", + # "553449663"] + # special_doc_id_list = ["539241700"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 8485bc2..cd5839f 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1090,7 +1090,7 @@ def replace_special_table_header(page_text: str): # item 2: document 539266893 "regex_all_list": [r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n", - r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?interposed\s*vehicles\s*\n", + r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n", r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"], "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund 
by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" }, From 4ee762963e1648e753378a892cb023497d5c7f36 Mon Sep 17 00:00:00 2001 From: Blade He Date: Sat, 8 Mar 2025 21:40:00 -0600 Subject: [PATCH 06/11] optimized for management_fee_and_costs and administration_fees --- calc_metrics.py | 97 ++++++++------- core/data_extraction.py | 9 +- .../data_extraction_prompts_config.json | 114 +++++++++++++----- main.py | 6 +- utils/biz_utils.py | 12 ++ 5 files changed, 161 insertions(+), 77 deletions(-) diff --git a/calc_metrics.py b/calc_metrics.py index b136fb8..0b76786 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -713,6 +713,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros continue doc_verify_sec_row = doc_verify_sec_data.iloc[0] raw_fund_name = doc_verify_sec_row["raw_fund_name"] + raw_share_name = doc_verify_sec_row["raw_share_name"] v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"]) v_management_fee = str(doc_verify_sec_row["management_fee"]) v_administration_fees = str(doc_verify_sec_row["administration_fees"]) @@ -733,28 +734,28 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # v_activity_fee = str(doc_verify_sec_row["activity_fee"]) message = get_gt_pred_by_compare_values(management_fee_and_costs, v_management_fee_and_costs, gt_management_fee_and_costs_list, pred_management_fee_and_costs_list, data_point="management_fee_and_costs") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee_and_costs")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs")) message = get_gt_pred_by_compare_values(management_fee, v_management_fee, gt_management_fee_list, pred_management_fee_list, data_point="management_fee") - 
message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee")) message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, gt_administration_fees_list, pred_administration_fees_list, data_point="administration_fees") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "administration_fees")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees")) message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, gt_minimum_initial_investment_list, pred_minimum_initial_investment_list, data_point="minimum_initial_investment") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment")) message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name")) if is_for_all: message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee")) message = 
get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost")) message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, gt_buy_spread_list, pred_buy_spread_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread")) message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, gt_sell_spread_list, pred_sell_spread_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread")) message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, gt_total_annual_dollar_based_charges_list, pred_total_annual_dollar_based_charges_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges")) # message = get_gt_pred_by_compare_values(withdrawal_fee, v_withdrawal_fee, gt_withdrawal_fee_list, pred_withdrawal_fee_list) # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "withdrawal_fee")) # message = get_gt_pred_by_compare_values(switching_fee, v_switching_fee, 
gt_switching_fee_list, pred_switching_fee_list) @@ -763,9 +764,10 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "activity_fee")) message_data_df = pd.DataFrame(message_list) - message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', 'data_point', 'gt_value', 'pred_value', 'error']] + message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', + 'raw_share_name', 'data_point', 'gt_value', 'pred_value', 'error']] # order by doc_id, raw_fund_name, data_point - message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_fund_name', 'data_point']) + message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_share_name', 'data_point']) message_data_df.reset_index(drop=True, inplace=True) # calculate metrics @@ -1036,6 +1038,7 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ continue doc_verify_sec_row = doc_verify_sec_data.iloc[0] raw_fund_name = doc_verify_sec_row["raw_fund_name"] + raw_share_name = doc_verify_sec_row["raw_share_name"] v_management_fee_and_costs = str(doc_verify_sec_row["management_fee_and_costs"]) v_management_fee = str(doc_verify_sec_row["management_fee"]) v_administration_fees = str(doc_verify_sec_row["administration_fees"]) @@ -1053,67 +1056,68 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ provider_gt_pred_data[provider_id]["gt_management_fee_and_costs_list"], provider_gt_pred_data[provider_id]["pred_management_fee_and_costs_list"], data_point="management_fee_and_costs") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee_and_costs")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee_and_costs")) message = 
get_gt_pred_by_compare_values(management_fee, v_management_fee, provider_gt_pred_data[provider_id]["gt_management_fee_list"], provider_gt_pred_data[provider_id]["pred_management_fee_list"], data_point="management_fee") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "management_fee")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "management_fee")) message = get_gt_pred_by_compare_values(administration_fees, v_administration_fees, provider_gt_pred_data[provider_id]["gt_administration_fees_list"], provider_gt_pred_data[provider_id]["pred_administration_fees_list"], data_point="administration_fees") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "administration_fees")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "administration_fees")) message = get_gt_pred_by_compare_values(minimum_initial_investment, v_minimum_initial_investment, provider_gt_pred_data[provider_id]["gt_minimum_initial_investment_list"], provider_gt_pred_data[provider_id]["pred_minimum_initial_investment_list"], data_point="minimum_initial_investment") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "minimum_initial_investment")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "minimum_initial_investment")) message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, provider_gt_pred_data[provider_id]["gt_benchmark_name_list"], provider_gt_pred_data[provider_id]["pred_benchmark_name_list"], data_point="benchmark_name") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "benchmark_name")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, 
"benchmark_name")) if is_for_all: message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, provider_gt_pred_data[provider_id]["gt_performance_fee_list"], provider_gt_pred_data[provider_id]["pred_performance_fee_list"], data_point="performance_fee") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "performance_fee")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee")) message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, provider_gt_pred_data[provider_id]["gt_interposed_vehicle_performance_fee_cost_list"], provider_gt_pred_data[provider_id]["pred_interposed_vehicle_performance_fee_cost_list"], data_point="interposed_vehicle_performance_fee_cost") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "interposed_vehicle_performance_fee_cost")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost")) message = get_gt_pred_by_compare_values(buy_spread, v_buy_spread, provider_gt_pred_data[provider_id]["gt_buy_spread_list"], provider_gt_pred_data[provider_id]["pred_buy_spread_list"], data_point="buy_spread") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "buy_spread")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "buy_spread")) message = get_gt_pred_by_compare_values(sell_spread, v_sell_spread, provider_gt_pred_data[provider_id]["gt_sell_spread_list"], provider_gt_pred_data[provider_id]["pred_sell_spread_list"], data_point="sell_spread") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "sell_spread")) + message_list.append(generate_message(message, document_id, 
sec_id, fund_name, raw_fund_name, raw_share_name, "sell_spread")) message = get_gt_pred_by_compare_values(total_annual_dollar_based_charges, v_total_annual_dollar_based_charges, provider_gt_pred_data[provider_id]["gt_total_annual_dollar_based_charges_list"], provider_gt_pred_data[provider_id]["pred_total_annual_dollar_based_charges_list"], data_point="total_annual_dollar_based_charges") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, "total_annual_dollar_based_charges")) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "total_annual_dollar_based_charges")) message_data_df = pd.DataFrame(message_list) - message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', 'data_point', 'gt_value', 'pred_value', 'error']] + message_data_df = message_data_df[['doc_id', 'sec_id', 'raw_fund_name', 'fund_legal_name', + 'raw_share_name', 'data_point', 'gt_value', 'pred_value', 'error']] # order by doc_id, raw_fund_name, data_point - message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_fund_name', 'data_point']) + message_data_df = message_data_df.sort_values(by=['doc_id', 'raw_share_name', 'data_point']) message_data_df.reset_index(drop=True, inplace=True) # calculate metrics @@ -1261,10 +1265,17 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ -def generate_message(message: dict, doc_id: str, sec_id: str, fund_legal_name: str, raw_fund_name: str, datapoint: str): +def generate_message(message: dict, + doc_id: str, + sec_id: str, + fund_legal_name: str, + raw_fund_name: str, + raw_share_name: str, + datapoint: str): message["data_point"] = datapoint message["fund_legal_name"] = fund_legal_name message["raw_fund_name"] = raw_fund_name + message["raw_share_name"] = raw_share_name message["sec_id"] = sec_id message["doc_id"] = str(doc_id) return message @@ -1334,26 +1345,28 @@ if __name__ == 
"__main__": audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250306171226.xlsx" - verify_data_sheet: str = "total_data" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250308202351.xlsx" + verify_data_sheet: str = "total_mapping_data" # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" - verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"] + verify_document_list_file_list = [None, + "./sample_documents/aus_prospectus_29_documents_sample.txt", + "./sample_documents/aus_prospectus_17_documents_sample.txt"] is_for_all = False - # for verify_document_list_file in verify_document_list_file_list: - # calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, - # audit_data_sheet=audit_data_sheet, - # verify_file_path=verify_file_path, - # verify_data_sheet=verify_data_sheet, - # verify_document_list_file = verify_document_list_file, - # is_for_all=is_for_all) - for verify_document_list_file in verify_document_list_file_list: - calculate_metrics_by_provider(audit_file_path=audit_file_path, - audit_data_sheet=audit_data_sheet, - verify_file_path=verify_file_path, - verify_data_sheet=verify_data_sheet, - verify_document_list_file = verify_document_list_file, - is_for_all=is_for_all) + calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, + audit_data_sheet=audit_data_sheet, + verify_file_path=verify_file_path, + verify_data_sheet=verify_data_sheet, + verify_document_list_file = verify_document_list_file, + is_for_all=is_for_all) + + # for verify_document_list_file in 
verify_document_list_file_list: + # calculate_metrics_by_provider(audit_file_path=audit_file_path, + # audit_data_sheet=audit_data_sheet, + # verify_file_path=verify_file_path, + # verify_data_sheet=verify_data_sheet, + # verify_document_list_file = verify_document_list_file, + # is_for_all=is_for_all) diff --git a/core/data_extraction.py b/core/data_extraction.py index 1ff1f0d..a81c8b6 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -302,6 +302,8 @@ class DataExtraction: raw_name_dict.pop(raw_name_as_production_name) for data_dict in data_list: + # if data_dict.get("page_index", -1) > 9: + # break extract_data = data_dict.get("extract_data", {}) data = extract_data.get("data", []) remove_item_list = [] @@ -312,7 +314,10 @@ class DataExtraction: share_name = data_item.get("share_name", "") raw_name = self.get_raw_name(fund_name, share_name) if raw_name.lower() in self.document_production.lower(): - dp_keys = [key for key in keys if key not in ["fund_name", "share_name"]] + dp_keys = [key for key in keys if key not in ["fund_name", + "share_name", + "management_fee_and_costs", + "management_fee"]] for dp_key in dp_keys: if dp_key not in datapoint_list_with_production_name: datapoint_list_with_production_name.append(dp_key) @@ -593,7 +598,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 18: + # if page_num != 25: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index c4cffce..0063c89 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -145,15 +145,32 @@ "The output should be:", "{\"data\": []}", "\n", - "B. 
If there are multiple Management fee and costs sub-columns, here is the rule: ", - "B.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", + "B. The table title is with Ongoing annual fees and costs.", + "B.1 Management fees and costs should not include transaction costs and performance fees.", + "---Example Start", + "Ongoing annual\nfees and costs\nC Class and E Class -P Class - Performance \nStandard Fee Option Fee Option \n36 \n(E Class is closed to \nnew investors) \nPlatinum International Fund 1.56% p.a. 1.46% p.a. \nOngoing annual fees and costs include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees (for P Class – Performance Fee Option \nonly). Please see page 36 for further information.", + "---Example End", + "The values 1.56 and 1.46 include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees, should ignore them.", + "The output should be:", + "{\"data\": []}", + "B.2 If with pure management fees and costs in table, please extract relevant values", + "---Example Start---", + "Fees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nManagement fees and costs \nEstimated management fees and costs \nper annum are: \nPlatinum International Fund 1.41% 1.16%\nPlatinum Global Fund (Long Only) 1.35% 1.10%\n", + "---Example End---", + "a. For this example, there is pure \"Management fees and costs\", please extract relevant values.", + "b. 
This example mentioned share classes, please output according to share class.", + "The output should be", + "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.16, \"management_fee\": 1.16}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.1, \"management_fee\": 1.1}]}", + "\n", + "C. If there are multiple Management fee and costs sub-columns, here is the rule: ", + "C.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", "---Example Start---", "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. 
\n", "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}", "\n", - "B.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".", + "C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".", "The management_fee is the value of \"Management fee (% pa)\".", "The management_fee_and_costs is the value of \"Total management cost (% pa)\".", "---Example Start---", @@ -162,7 +179,7 @@ "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", - "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", + "D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. 
This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", "---Example 1 End---", @@ -173,15 +190,20 @@ "---Example 2 End---", "The output should be:", "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}", + "---Example 3 Start---", + "Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.20–0.50 5 \n\n", + "---Example 3 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}", "\n", - "D. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", + "E. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "Underlying FundManagement fee component \nVanguard High Growth Index Fund1.50% p.a. 
of the NAV of the Underlying Fund\n", "---Example 1 End---", "The output should be:", "{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}", "\n", - "E. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", + "F. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n", "---Example 1 End---", @@ -191,7 +213,7 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}", "\n", - "F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.", + "G. 
If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.", "---Example 1 Start---", "Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n", "---Example 1 End---", @@ -208,7 +230,7 @@ "The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:", "{\"data\": []}", "\n", - "G. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.", + "H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.", "---Example 1 Start---", "Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. 
Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n", "---Example 1 End---", @@ -237,7 +259,35 @@ "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", "So the output should be:", - "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}" + "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", + "\n", + "I. 
Some table is very complex, with many data points columns, please extract the relevant values.", + "---Example 1 Start---", + "Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) \nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n", + "---Example 1 End---", + "For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ", + "\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ", + "\"Performance fee (p.a.)\" as performance_fee, ", + "\"Buy/sell spread (%)\" as buy_spread and sell_spread.", + "If one row has 5 decimal numbers, ", + "the 2nd decimal number is the administration_fees, ", + "the 3rd decimal number is the management_fee_and_costs and management_fee, ", + "the 4th decimal number is the performance_fee, ", + "the 5th decimal number is the buy_spread and sell_spread.", + "If one row has 4 decimal numbers, ", + "the 2nd decimal number is the administration_fees, ", + "the 3rd decimal number is the management_fee_and_costs and management_fee, ", + "the 4th decimal number is the buy_spread and sell_spread.", + "Please always ignore the 1st decimal number, we need not the total sum values.", + "The output should be:", + "{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee\": 0.03, \"buy_spread\": 
0.15, \"sell_spread\": 0.15}]}", + "J. If exist **\"Maximum management fee\"** in context, please ignore relevant values.", + "---Example Start---", + "Fund name \nMaximum \nmanagement \nfee (p.a.) \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%", + "---Example End---", + "The values in example is **Maximum management fee**, should ignore all of them.", + "The Output should be:", + "{\"data\": []}" ], "administration_fees":[ "Administration fees and costs is share class level data.", @@ -253,6 +303,11 @@ "----Example 2 End----", "The administration fee is $1.00 per week plus 0.17% pa, so the output should be:", "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}", + "---Example 3 Start---", + "\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. \n", + "---Example 3 End---", + "The administration fee is $1.30 per week plus 0.50% p.a., so the output should be:", + "{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.50}]}", "\n", "Complex cases:", "A. 
Need to add multiple numbers together.", @@ -342,44 +397,43 @@ { "management_fee_and_costs": [ { - "keywords": ["Estimated investment \ncosts \nAdministration \nfees"], + "keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"], "prompts": ["Complex management fee and costs rule:", "If the table with columns:", - "\"Administration fees (% pa)\", \"Investment fees (% pa)\" and \"Estimated other investment costs (% pa)\"", - "The administration_fees is \"Administration fees (% pa)\"", - "The management_fee is \"Investment fees (% pa)\".", - "The management_fee_and_costs is \"Investment fees (% pa)\" + \"Estimated other investment costs (% pa)\".", + "\"Administration fees\", \"Investment fees\" ,\"Estimated other investment costs\" and \"Estimated performance fees\"", + "The administration_fees is \"Administration fees\"", + "The management_fee is \"Investment fees\".", + "The management_fee_and_costs is \"Investment fees\" + \"Estimated other investment costs\".", + "The performance_fee is \"Estimated performance fees\"", "---Example 1 Start---", - "Investment \noption \nAdministration fees and \nestimated administration costs \nInvestment fees and estimated \ninvestment costs \nEstimated investment \ncosts \nAdministration \nfees \n(% pa) \nInvestment \nfees \n(% pa) \n2 \nEstimated \ntotal \nongoing \nEstimated \nadministration \ncosts \n(% pa) \n1 \nEstimated \nperformance \nfees \n(% pa) \n3 \nEstimated \ntransaction \ncosts \n(% pa) \n5 \nEstimated \nother \ninvestment \ncosts \n(% pa) \n4 \nannual \nfees and \ncosts \n(% pa) \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nProperty and infrastructure \nLazard Global \nListed \nInfrastructure \n0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n", - "---Example 1 End---", - "For this case, although the table header is with disorder issue during 
PDF contents extraction issue.", - "But the data points numbers order in data row (for example: 0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n) is correct as initial table structure.", + "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nInternetional shares \nPerpetual Global \nInnovation Share \n0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n", + "---Example 1 End---", + "The data points numbers order in data row (for example: 0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n) is correct as initial table structure.", "Please pay attention below information", "Assume the column sequence number is from 1.", - "\"Administration fees (% pa)\" values are as the column 1 numbers, \"Investment fees (% pa)\" values are as the column 3 numbers, \"Estimated other investment costs (% pa)\" values are as the column 5 numbers.", - "For fund: Lazard Global Listed Infrastructure, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.8, the management_fee_and_costs should be 0.88 = 0.8(the column 3 number) + 0.08 (the column 5 number)", + "\"Administration fees\" values are as the column 1 numbers, \"Investment fees\" values are as the column 3 numbers, \"Estimated other investment costs\" values are as the column 5 numbers, \"Estimated performance fees\" values are as the column 4 numbers.", + "For fund: Perpetual Global Innovation Share, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.99, the management_fee_and_costs should be 1 = 0.99(the column 3 number) + 0.01 (the 
column 5 number), the performance_fee should be 2.3 (the column 4 number)", "Therefore, the output should be:", - "{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0, \"administration_fees\": 0.25}]}, {\"fund name\": \"Lazard Global Listed Infrastructure\", \"share name\": \"Lazard Global Listed Infrastructure\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.08, \"administration_fees\": 0.25}" + "{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"administration_fees\": 0.25}]}, {\"fund name\": \"Perpetual Global Innovation Share\", \"share name\": \"Perpetual Global Innovation Share\", \"management_fee_and_costs\": 1, \"management_fee\": 0.99, \"administration_fees\": 0.25, \"performance_fee\": 2.3}" ] }, { - "keywords": ["Entry Fee \nNil Entry"], + "keywords": ["Entry Fee option \nNil Entry option"], "prompts": ["Complex management fee and costs rule:", "If the table with columns:", - "\"Entry Fee option\", \"Nil Entry Free option\", \"Estimated other investment costs\", \"Estimated Performance fees (B)\"", - "The performance_fee is \"Estimated Performance fees (B)\"", + "\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"", + "The performance_fee is \"Estimated Performance fees\"", "The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. 
if fund name is \"MultiSeries 30\", the Entry Fee fund name is \"MultiSeries 30 Entry Fee\"", - "The fund name's tail is \"Nil Entry\" for \"Nil Entry Free option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".", + "The fund name's tail is \"Nil Entry\" for \"Nil Entry option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".", "For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".", - "For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry Free option\" + \"Estimated other investment costs\".", + "For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".", "---Example 1 Start---", - "Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n", + "Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n", "---Example 1 End---", - "For this case, although the table header is with disorder issue during PDF contents extraction 
issue.", - "But the data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.", + "The data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.", "Please pay attention below information", "Assume the column sequence number is from 1.", - "\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry Free option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees (B)\" values are as the column 4 numbers.", + "\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.", "For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ", "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)", "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)", diff --git a/main.py b/main.py index 1646226..d2dc800 100644 --- a/main.py +++ b/main.py @@ -1526,8 +1526,8 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1560,7 +1560,7 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - # special_doc_id_list = ["539241700"] + # special_doc_id_list = ["506913190"] pdf_folder: str = r"/data/aus_prospectus/pdf/" 
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/utils/biz_utils.py b/utils/biz_utils.py index cd5839f..e7f4c53 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1099,6 +1099,18 @@ def replace_special_table_header(page_text: str): "regex_all_list": [r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"], "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" + }, + { + # item 0: document 401212184, page 17 - 20 + "regex_all_list": + [r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Entry\s*Fee[\s\S]*?Nil\s*Entry[\s\S]*?Other\s*investment\s*costs[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Nil\s*Entry\s*Fee\s*.*\n", + r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Estimated\s*Other[\s\S]*?Entry\s*Fee\s*Nil\s*Entry[\s\S]*?Nil\s*Entry[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Fee\s*option.*\n"], + "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n" + }, + { + "regex_all_list": + [r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"], + 
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n" } ] updated_text = False From 604ab326a7f34bacb42a98de34d0109040bc1c9f Mon Sep 17 00:00:00 2001 From: Blade He Date: Sat, 8 Mar 2025 21:50:44 -0600 Subject: [PATCH 07/11] a little change --- utils/biz_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/biz_utils.py b/utils/biz_utils.py index e7f4c53..9bab856 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1108,6 +1108,7 @@ def replace_special_table_header(page_text: str): "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n" }, { + # item 0: document 411062815, page 17 "regex_all_list": [r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"], "replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n" From 2548606ccc61af467ae669aa75f6e8950decfd64 Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 10 Mar 2025 08:20:01 -0500 Subject: [PATCH 08/11] a little change --- calc_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/calc_metrics.py b/calc_metrics.py index 0b76786..53dca9e 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1345,7 +1345,7 @@ if __name__ == "__main__": audit_file_path: str = 
r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250308202351.xlsx" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250308220117.xlsx" verify_data_sheet: str = "total_mapping_data" # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" verify_document_list_file_list = [None, From e9f6383258f8af05cde7d8a3ddde448ab7ddfa86 Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 10 Mar 2025 11:09:00 -0500 Subject: [PATCH 09/11] apply configuration file to replace disorder table header contents --- calc_metrics.py | 6 ++- .../aus_prospectus/replace_table_header.json | 35 +++++++++++++++ .../emea_ar/replace_table_header.json | 3 ++ core/data_extraction.py | 17 +++++-- main.py | 6 +-- utils/biz_utils.py | 45 ++++--------------- 6 files changed, 69 insertions(+), 43 deletions(-) create mode 100644 configuration/aus_prospectus/replace_table_header.json create mode 100644 configuration/emea_ar/replace_table_header.json diff --git a/calc_metrics.py b/calc_metrics.py index 53dca9e..840ee54 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1308,6 +1308,10 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data def is_equal(gt_value, pred_value, data_point: str = ""): if gt_value is not None and len(str(gt_value)) > 0 and \ pred_value is not None and len(str(pred_value)) > 0: + if gt_value == "0.0": + gt_value = "0" + if pred_value == "0.0": + pred_value = "0" if gt_value == pred_value: return True if data_point == "benchmark_name": @@ -1351,7 +1355,7 @@ if __name__ == "__main__": verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"] - 
is_for_all = False + is_for_all = True for verify_document_list_file in verify_document_list_file_list: calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, audit_data_sheet=audit_data_sheet, diff --git a/configuration/aus_prospectus/replace_table_header.json b/configuration/aus_prospectus/replace_table_header.json new file mode 100644 index 0000000..7dfc383 --- /dev/null +++ b/configuration/aus_prospectus/replace_table_header.json @@ -0,0 +1,35 @@ +{ + "details": [ + { + "regex_all_list": + ["\\nIndirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n", + "\\n(Investment\\s*Option|Fund)[\\s\\S]*?Management\\s*fee[\\s\\S]*?Indirect\\s*costs[\\s\\S]*?performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?indirect\\s*costs[\\s\\S]*?(interposed\\s*vehicles|managers\\s*vehicles)\\s*\\n", + "\\nOption\\s*name\\s*Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n"], + "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n", + "comments": ["item 0: document 410899007", + "item 1: document 539266880, 539266817, 539261734", + "item 2: document 539266893"] + + }, + { + "regex_all_list": + ["Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Transactions\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads\\s*\\(\\%\\)\\s*\\n"], + "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other 
indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n", + "comments": ["item 0: document 410899007"] + }, + { + "regex_all_list": + ["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n", + "Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"], + "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n", + "comments": ["item 0: document 401212184, page 17", + "item 1: document 401212184, page 18 - 20"] + }, + { + "regex_all_list": + ["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"], + "replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n", + "comments": ["item 0: document 411062815, page 17"] + } + ] +} \ No newline at end of file diff --git a/configuration/emea_ar/replace_table_header.json 
b/configuration/emea_ar/replace_table_header.json new file mode 100644 index 0000000..56b3955 --- /dev/null +++ b/configuration/emea_ar/replace_table_header.json @@ -0,0 +1,3 @@ +{ + "details": [] +} \ No newline at end of file diff --git a/core/data_extraction.py b/core/data_extraction.py index a81c8b6..ecd51b5 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -74,6 +74,7 @@ class DataExtraction: self.datapoint_level_config = self.get_datapoint_level() self.datapoint_type_config = self.get_datapoint_type() self.datapoint_name_config = self.get_datapoint_name() + self.replace_table_header_config = self.get_replace_table_header_config() self.datapoint_reported_name_config, self.non_english_reported_name_config = \ self.get_datapoint_reported_name() self.extract_way = extract_way @@ -206,6 +207,15 @@ class DataExtraction: datapoint_name = json.load(f) return datapoint_name + def get_replace_table_header_config(self) -> str: + replace_table_header_file = os.path.join(self.configuration_folder, "replace_table_header.json") + if os.path.exists(replace_table_header_file): + with open(replace_table_header_file, "r", encoding="utf-8") as f: + replace_table_header_config = json.load(f).get("details", []) + return replace_table_header_config + else: + return [] + def get_pdf_page_text_dict(self) -> dict: pdf_util = PDFUtil(self.pdf_file) success, text, page_text_dict = pdf_util.extract_text() @@ -598,7 +608,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 25: + # if page_num != 16: # continue if page_num in handled_page_num_list: continue @@ -616,7 +626,7 @@ class DataExtraction: else: previous_page_fund_name = None - page_text = replace_special_table_header(page_text) + page_text = replace_special_table_header(self.replace_table_header_config, page_text) extract_data = self.extract_data_by_page( page_num, page_text, @@ -681,7 +691,8 @@ class 
DataExtraction: ) if not with_same_structure_table: break - next_page_text = replace_special_table_header(next_page_text) + next_page_text = replace_special_table_header(self.replace_table_header_config, + next_page_text) target_text = current_text + next_page_text else: target_text = "" diff --git a/main.py b/main.py index d2dc800..9de2959 100644 --- a/main.py +++ b/main.py @@ -1526,8 +1526,8 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1560,7 +1560,7 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - # special_doc_id_list = ["506913190"] + # special_doc_id_list = ["411062815"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 9bab856..307be09 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1036,7 +1036,7 @@ def remove_abundant_data_detail(data_detail_list: list, return data_detail_list -def replace_special_table_header(page_text: str): +def replace_special_table_header(replace_table_header_config: list, page_text: str): """ For some special table header, replace to the standard header e.g. 
@@ -1083,42 +1083,15 @@ def replace_special_table_header(page_text: str): Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n Performance \nfees charged \nby interposed \nvehicles \n """ - replace_info_list = [ - { - # item 0: document 410899007 - # item 1: document 539266880, 539266817, 539261734 - # item 2: document 539266893 - "regex_all_list": - [r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n", - r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n", - r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"], - "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" - }, - { - # item 0: document 410899007 - "regex_all_list": - [r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"], - "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" - }, - { - # item 0: document 401212184, page 17 - 20 - "regex_all_list": - 
[r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Entry\s*Fee[\s\S]*?Nil\s*Entry[\s\S]*?Other\s*investment\s*costs[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Nil\s*Entry\s*Fee\s*.*\n", - r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Estimated\s*Other[\s\S]*?Entry\s*Fee\s*Nil\s*Entry[\s\S]*?Nil\s*Entry[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Fee\s*option.*\n"], - "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n" - }, - { - # item 0: document 411062815, page 17 - "regex_all_list": - [r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"], - "replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n" - } - ] + if replace_table_header_config is None or len(replace_table_header_config) == 0: + return page_text updated_text = False - for replace_info in replace_info_list: - for regex_all in replace_info["regex_all_list"]: - if re.search(regex_all, page_text) is not None: - page_text = re.sub(regex_all, replace_info["replace_text"], page_text) + for replace_info in replace_table_header_config: + for regex_all in replace_info.get("regex_all_list", []): + table_header_search = re.search(regex_all, page_text) + if table_header_search is not None: + original_text = 
table_header_search.group() + page_text = re.sub(regex_all, replace_info.get("replace_text", original_text), page_text) updated_text = True break if updated_text: From b7506c78f3b534814b23e27e8dd2c92f8d467ff9 Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 10 Mar 2025 16:00:17 -0500 Subject: [PATCH 10/11] Add API code file --- app_aus_prospectus.py | 96 +++++++++++++++++++ calc_metrics.py | 4 +- .../aus_prospectus/datapoint_keyword.json | 2 +- .../aus_prospectus/datapoint_level.json | 2 +- .../aus_prospectus/datapoint_name.json | 2 +- .../datapoint_reported_name.json | 2 +- .../aus_prospectus/datapoint_type.json | 2 +- .../aus_prospectus/domicile_datapoints.json | 2 +- core/data_mapping.py | 10 +- .../data_extraction_prompts_config.json | 8 +- yml/aus_prospectus.yml | 27 ++++++ 11 files changed, 139 insertions(+), 18 deletions(-) create mode 100644 app_aus_prospectus.py create mode 100644 yml/aus_prospectus.yml diff --git a/app_aus_prospectus.py b/app_aus_prospectus.py new file mode 100644 index 0000000..90f3443 --- /dev/null +++ b/app_aus_prospectus.py @@ -0,0 +1,96 @@ +from flask import Flask, request, jsonify, render_template +from flasgger import Swagger, swag_from +from main import EMEA_AR_Parsing +from utils.logger import logger +from utils.biz_utils import clean_folder +from tqdm import tqdm +import pandas as pd +import os + + +template = { + "info": { + "title": "Australia Prospectus Data Extraction API", + "description": 'Australia Prospectus Data Extraction API', + "version": "1.0" + } +} +app = Flask(__name__) +# By Swagger, we can see the API documentation in the browser, the example URL is http://127.0.0.1:8080/apidocs/ +swagger = Swagger(app, template=template) + + +@app.route('/automation/api/model/aus_prospectus', methods=['POST']) +@swag_from('yml/aus_prospectus.yml') +def aus_prospectus_data_extract(): + """ + Extract Australia Prospectus data from Australia Prospectus PDF document + input sample: + { + "doc_id": "412778803" + } + output: 
Australia Prospectus cost data as a list of dictionaries + :return: + :rtype: + """ + logger.info('Australia Prospectus data extraction begin') + doc_id = request.json.get('doc_id') + + if not doc_id: + return jsonify({"error": "doc_id is required"}), 400 + + pdf_folder = r"./data/aus_prospectus/pdf/" + output_pdf_text_folder = r"./data/aus_prospectus/output/pdf_text/" + output_extract_data_folder = r"./data/aus_prospectus/output/extract_data/docs/" + output_mapping_data_folder = r"./data/aus_prospectus/output/mapping_data/docs/" + drilldown_folder = r"./data/aus_prospectus/output/drilldown/" + db_mapping_document_folder = r"./data/aus_prospectus/output/db_mapping/document/" + db_mapping_provider_folder = r"./data/aus_prospectus/output/db_mapping/provider/" + extract_way = "text" + + os.makedirs(pdf_folder, exist_ok=True) + os.makedirs(output_pdf_text_folder, exist_ok=True) + os.makedirs(output_extract_data_folder, exist_ok=True) + os.makedirs(output_mapping_data_folder, exist_ok=True) + os.makedirs(drilldown_folder, exist_ok=True) + os.makedirs(db_mapping_document_folder, exist_ok=True) + os.makedirs(db_mapping_provider_folder, exist_ok=True) + + clean_folder(pdf_folder) + clean_folder(output_pdf_text_folder) + clean_folder(output_extract_data_folder) + clean_folder(output_mapping_data_folder) + clean_folder(drilldown_folder) + clean_folder(db_mapping_document_folder) + clean_folder(db_mapping_provider_folder) + + re_run_extract_data = False + re_run_mapping_data = False + + try: + emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id, + doc_source="aus_prospectus", + pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, + output_extract_data_folder=output_extract_data_folder, + output_mapping_data_folder=output_mapping_data_folder, + extract_way=extract_way, + drilldown_folder=drilldown_folder, + compare_with_provider=False) + doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data) + doc_mapping_data = 
emea_ar_parsing.mapping_data( + data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data + ) + results = {"extract_data": doc_mapping_data} + return jsonify(results) + except Exception as e: + logger.error(f"Error: {e}") + results = {"extract_data": [], + "annotation_data": [], + "error": str(e)} + return jsonify(results) + + +if __name__ == '__main__': + # Add use_reloader = False to avoid init twice + app.run(host='0.0.0.0', port="8080", debug=False, use_reloader=False) diff --git a/calc_metrics.py b/calc_metrics.py index 840ee54..783f163 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -601,7 +601,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # ravi_verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" # verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" verify_fields = [ - "DocumentId", + "doc_id", "raw_fund_name", "fund_id", "fund_name", @@ -629,7 +629,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # verify_data_df = raw_verify_data_df[raw_verify_data_df["sec_id"].isin(ravi_verify_data_df["sec_id"])] verify_data_df = verify_data_df[verify_fields] verify_data_df = verify_data_df.drop_duplicates() - verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) + # verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) verify_data_df.fillna("", inplace=True) verify_data_df.reset_index(drop=True, inplace=True) diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index 9026586..b6dc778 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -11,5 +11,5 @@ "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions 
and access to your investment", "start your investment with"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, - "change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} + "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_level.json b/configuration/aus_prospectus/datapoint_level.json index 036e792..fdb1208 100644 --- a/configuration/aus_prospectus/datapoint_level.json +++ b/configuration/aus_prospectus/datapoint_level.json @@ -11,5 +11,5 @@ "minimum_initial_investment": "fund_level", "indirect_costs": "share_level", "recoverable_expenses": "share_level", - "change_recoverable_expanses": "share_level" + "change_recoverable_expenses": "share_level" } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_name.json b/configuration/aus_prospectus/datapoint_name.json index 9b3272f..893c4c7 100644 --- a/configuration/aus_prospectus/datapoint_name.json +++ b/configuration/aus_prospectus/datapoint_name.json @@ -11,5 +11,5 @@ "benchmark_name": "benchmark name", "indirect_costs": "indirect cost", "recoverable_expenses": "recoverable expenses", - "change_recoverable_expanses": "change recoverable expanses" + "change_recoverable_expenses": "change recoverable expanses" } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index c0906c0..7d913ef 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ 
b/configuration/aus_prospectus/datapoint_reported_name.json @@ -11,5 +11,5 @@ "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, - "change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} + "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_type.json b/configuration/aus_prospectus/datapoint_type.json index d1ed4a1..f38a099 100644 --- a/configuration/aus_prospectus/datapoint_type.json +++ b/configuration/aus_prospectus/datapoint_type.json @@ -11,5 +11,5 @@ "minimum_initial_investment": "integer", "indirect_costs": "float", "recoverable_expenses": "float", - "change_recoverable_expanses": "float" + "change_recoverable_expenses": "float" } \ No newline at end of file diff --git a/configuration/aus_prospectus/domicile_datapoints.json b/configuration/aus_prospectus/domicile_datapoints.json index c4ff806..2310ce4 100644 --- a/configuration/aus_prospectus/domicile_datapoints.json +++ b/configuration/aus_prospectus/domicile_datapoints.json @@ -32,7 +32,7 @@ "benchmark_name", "minimum_initial_investment", "indirect_costs", - "change_recoverable_expanses", + "change_recoverable_expenses", "recoverable_expenses" ] } diff --git a/core/data_mapping.py b/core/data_mapping.py index e50798e..c21c47e 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -308,7 +308,8 @@ class DataMapping: break 
if not exist: data = { - "DocumentId": doc_id, + "doc_id": doc_id, + "effective_date": doc_date, "raw_fund_name": raw_fund_name, "raw_share_name": raw_share_name, "raw_name": raw_name, @@ -316,9 +317,7 @@ class DataMapping: "fund_name": fund_legal_name, "sec_id": share_class_id, "sec_name": share_class_legal_name, - "EffectiveDate": doc_date, "page_index": [], - "RawName": raw_name, } for datapoint_name in datapoint_name_list: data[datapoint_name] = "" @@ -375,7 +374,8 @@ class DataMapping: exist = True if not exist: data = { - "DocumentId": doc_id, + "doc_id": doc_id, + "effective_date": doc_date, "raw_fund_name": raw_fund_name, "raw_share_name": "", "raw_name": raw_name, @@ -383,9 +383,7 @@ class DataMapping: "fund_name": fund_legal_name, "sec_id": "", "sec_name": "", - "EffectiveDate": doc_date, "page_index": [page_index], - "RawName": raw_name, } for datapoint_name in datapoint_name_list: data[datapoint_name] = "" diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 0063c89..f6a9c2e 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -118,7 +118,7 @@ "minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00", "indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.", "recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.", - "change_recoverable_expanses": "Change recoverable expanses is belong to percentage number, the value should be less than 100." + "change_recoverable_expenses": "Change recoverable expenses is belong to percentage number, the value should be less than 100." 
}, "special_rule": { "management_fee_and_costs": [ @@ -177,7 +177,7 @@ "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", "---Example End---", "The output should be:", - "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", + "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", "D. 
If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", @@ -617,7 +617,7 @@ "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"], "indirect_costs_value": [0.12, 0.16, 0.02], "recoverable_expenses_value": [0.01, 0.05, 0.06], - "change_recoverable_expanses_value": [0.01, 0.02, 0.03] + "change_recoverable_expenses_value": [0.01, 0.02, 0.03] }, "dp_reported_name" : { "total_annual_dollar_based_charges": "Total annual dollar based charges", @@ -632,7 +632,7 @@ "minimum_initial_investment": "Minimum initial investment", "indirect_costs": "Indirect cost", "recoverable_expenses": "Recoverable expenses", - "change_recoverable_expanses": "Change recoverable expanses", + "change_recoverable_expenses": "Change recoverable expenses", "establishment_fee": "Establishment fee", "contribution_fee": "Contribution fee", "withdrawal_fee": "Withdrawal fee", diff --git a/yml/aus_prospectus.yml b/yml/aus_prospectus.yml new file mode 100644 index 0000000..ca80925 --- /dev/null +++ b/yml/aus_prospectus.yml @@ -0,0 +1,27 @@ +Example to extract data from Australia Prospectus PDF Document. +Sample: + { + "doc_id": "412778803" + } +Author: Blade He +--- +parameters: + - name: Australia Prospectus Document Id + in: body + type: string + required: true + description: Example to extract data from Australia Prospectus PDF Document. + default: {"doc_id": "412778803"} + schema: + required: + - Document Id + properties: + doc_id: + description: Australia Prospectus Document Id + required: true + type: string +responses: + 200: + description: succesfully. + 400: + description: failed. \ No newline at end of file From c7c36dbdd2be741a5155c3ade59e5d6ed6f375d0 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 11 Mar 2025 17:15:39 -0500 Subject: [PATCH 11/11] 1. update performance_fee name to performance_fee_costs 2. 
support extract data for total_annual_dollar_based_charges --- calc_metrics.py | 131 ++++++++++++------ .../aus_prospectus/datapoint_keyword.json | 4 +- .../aus_prospectus/datapoint_level.json | 2 +- .../aus_prospectus/datapoint_name.json | 2 +- .../datapoint_reported_name.json | 4 +- .../aus_prospectus/datapoint_type.json | 2 +- .../aus_prospectus/domicile_datapoints.json | 2 +- core/data_extraction.py | 10 +- .../data_extraction_prompts_config.json | 109 +++++++++------ main.py | 7 +- prepare_data.py | 18 ++- 11 files changed, 191 insertions(+), 100 deletions(-) diff --git a/calc_metrics.py b/calc_metrics.py index 783f163..f9dfe47 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -576,7 +576,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros "administration_fees", "minimum_initial_investment", "benchmark_name", - "performance_fee", + "performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread", @@ -613,7 +613,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros "administration_fees", "minimum_initial_investment", "benchmark_name", - "performance_fee", + "performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread", @@ -649,8 +649,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros gt_benchmark_name_list = [] pred_benchmark_name_list = [] if is_for_all: - gt_performance_fee_list = [] - pred_performance_fee_list = [] + gt_performance_fee_costs_list = [] + pred_performance_fee_costs_list = [] gt_interposed_vehicle_performance_fee_cost_list = [] pred_interposed_vehicle_performance_fee_cost_list = [] gt_buy_spread_list = [] @@ -701,7 +701,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros minimum_initial_investment = str(row["minimum_initial_investment"]) benchmark_name = str(row["benchmark_name"]) if is_for_all: - performance_fee = str(row["performance_fee"]) + 
performance_fee_costs = str(row["performance_fee_costs"]) interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) buy_spread = str(row["buy_spread"]) sell_spread = str(row["sell_spread"]) @@ -720,7 +720,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros v_minimum_initial_investment = str(doc_verify_sec_row["minimum_initial_investment"]) v_benchmark_name = str(doc_verify_sec_row["benchmark_name"]) if is_for_all: - v_performance_fee = str(doc_verify_sec_row["performance_fee"]) + v_performance_fee_costs = str(doc_verify_sec_row["performance_fee_costs"]) v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) v_buy_spread = str(doc_verify_sec_row["buy_spread"]) v_sell_spread = str(doc_verify_sec_row["sell_spread"]) @@ -744,8 +744,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros message = get_gt_pred_by_compare_values(benchmark_name, v_benchmark_name, gt_benchmark_name_list, pred_benchmark_name_list, data_point="benchmark_name") message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name")) if is_for_all: - message = get_gt_pred_by_compare_values(performance_fee, v_performance_fee, gt_performance_fee_list, pred_performance_fee_list) - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee")) + message = get_gt_pred_by_compare_values(performance_fee_costs, v_performance_fee_costs, gt_performance_fee_costs_list, pred_performance_fee_costs_list) + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs")) message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, gt_interposed_vehicle_performance_fee_cost_list, 
pred_interposed_vehicle_performance_fee_cost_list) message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "interposed_vehicle_performance_fee_cost")) @@ -803,11 +803,11 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros support_benchmark_name = sum(gt_benchmark_name_list) if is_for_all: - precision_performance_fee = precision_score(gt_performance_fee_list, pred_performance_fee_list) - recall_performance_fee = recall_score(gt_performance_fee_list, pred_performance_fee_list) - f1_performance_fee = f1_score(gt_performance_fee_list, pred_performance_fee_list) - accuracy_performance_fee = accuracy_score(gt_performance_fee_list, pred_performance_fee_list) - support_performance_fee = sum(gt_performance_fee_list) + precision_performance_fee_costs = precision_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + recall_performance_fee_costs = recall_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + f1_performance_fee_costs = f1_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + accuracy_performance_fee_costs = accuracy_score(gt_performance_fee_costs_list, pred_performance_fee_costs_list) + support_performance_fee_costs = sum(gt_performance_fee_costs_list) precision_interposed_vehicle_performance_fee_cost = precision_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) recall_interposed_vehicle_performance_fee_cost = recall_score(gt_interposed_vehicle_performance_fee_cost_list, pred_interposed_vehicle_performance_fee_cost_list) @@ -856,7 +856,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros {"item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, {"item": 
"minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, {"item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, - {"item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"item": "performance_fee_costs", "precision": precision_performance_fee_costs, "recall": recall_performance_fee_costs, "f1": f1_performance_fee_costs, "accuracy": accuracy_performance_fee_costs, "support": support_performance_fee_costs}, {"item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": support_interposed_vehicle_performance_fee_cost}, {"item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, @@ -924,7 +924,7 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ "administration_fees", "minimum_initial_investment", "benchmark_name", - "performance_fee", + "performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread", @@ -942,7 +942,7 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ audit_data_df.reset_index(drop=True, inplace=True) verify_fields = [ - "DocumentId", + "doc_id", "raw_fund_name", "fund_id", "fund_name", @@ -954,7 +954,7 @@ def 
calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ "administration_fees", "minimum_initial_investment", "benchmark_name", - "performance_fee", + "performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread", @@ -963,7 +963,6 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ verify_data_df = pd.read_excel(verify_file_path, sheet_name=verify_data_sheet) verify_data_df = verify_data_df[verify_fields] verify_data_df = verify_data_df.drop_duplicates() - verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) verify_data_df.fillna("", inplace=True) verify_data_df.reset_index(drop=True, inplace=True) @@ -1002,8 +1001,8 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ "gt_benchmark_name_list": [], "pred_benchmark_name_list": []} if is_for_all: - provider_gt_pred_data[provider_id].update({"gt_performance_fee_list": [], - "pred_performance_fee_list": [], + provider_gt_pred_data[provider_id].update({"gt_performance_fee_costs_list": [], + "pred_performance_fee_costs_list": [], "gt_interposed_vehicle_performance_fee_cost_list": [], "pred_interposed_vehicle_performance_fee_cost_list": [], "gt_buy_spread_list": [], @@ -1026,7 +1025,7 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ minimum_initial_investment = str(row["minimum_initial_investment"]) benchmark_name = str(row["benchmark_name"]) if is_for_all: - performance_fee = str(row["performance_fee"]) + performance_fee_costs = str(row["performance_fee_costs"]) interposed_vehicle_performance_fee_cost = str(row["interposed_vehicle_performance_fee_cost"]) buy_spread = str(row["buy_spread"]) sell_spread = str(row["sell_spread"]) @@ -1045,7 +1044,7 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ v_minimum_initial_investment = str(doc_verify_sec_row["minimum_initial_investment"]) v_benchmark_name = 
str(doc_verify_sec_row["benchmark_name"]) if is_for_all: - v_performance_fee = str(doc_verify_sec_row["performance_fee"]) + v_performance_fee_costs = str(doc_verify_sec_row["performance_fee_costs"]) v_interposed_vehicle_performance_fee_cost = str(doc_verify_sec_row["interposed_vehicle_performance_fee_cost"]) v_buy_spread = str(doc_verify_sec_row["buy_spread"]) v_sell_spread = str(doc_verify_sec_row["sell_spread"]) @@ -1082,12 +1081,12 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ data_point="benchmark_name") message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "benchmark_name")) if is_for_all: - message = get_gt_pred_by_compare_values(performance_fee, - v_performance_fee, - provider_gt_pred_data[provider_id]["gt_performance_fee_list"], - provider_gt_pred_data[provider_id]["pred_performance_fee_list"], - data_point="performance_fee") - message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee")) + message = get_gt_pred_by_compare_values(performance_fee_costs, + v_performance_fee_costs, + provider_gt_pred_data[provider_id]["gt_performance_fee_costs_list"], + provider_gt_pred_data[provider_id]["pred_performance_fee_costs_list"], + data_point="performance_fee_costs") + message_list.append(generate_message(message, document_id, sec_id, fund_name, raw_fund_name, raw_share_name, "performance_fee_costs")) message = get_gt_pred_by_compare_values(interposed_vehicle_performance_fee_cost, v_interposed_vehicle_performance_fee_cost, provider_gt_pred_data[provider_id]["gt_interposed_vehicle_performance_fee_cost_list"], @@ -1165,15 +1164,15 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ support_benchmark_name = sum(gt_pred_data["gt_benchmark_name_list"]) if is_for_all: - precision_performance_fee = precision_score(gt_pred_data["gt_performance_fee_list"], - 
gt_pred_data["pred_performance_fee_list"]) - recall_performance_fee = recall_score(gt_pred_data["gt_performance_fee_list"], - gt_pred_data["pred_performance_fee_list"]) - f1_performance_fee = f1_score(gt_pred_data["gt_performance_fee_list"], - gt_pred_data["pred_performance_fee_list"]) - accuracy_performance_fee = accuracy_score(gt_pred_data["gt_performance_fee_list"], - gt_pred_data["pred_performance_fee_list"]) - support_performance_fee = sum(gt_pred_data["gt_performance_fee_list"]) + precision_performance_fee_costs = precision_score(gt_pred_data["gt_performance_fee_costs_list"], + gt_pred_data["pred_performance_fee_costs_list"]) + recall_performance_fee_costs = recall_score(gt_pred_data["gt_performance_fee_costs_list"], + gt_pred_data["pred_performance_fee_costs_list"]) + f1_performance_fee_costs = f1_score(gt_pred_data["gt_performance_fee_costs_list"], + gt_pred_data["pred_performance_fee_costs_list"]) + accuracy_performance_fee_costs = accuracy_score(gt_pred_data["gt_performance_fee_costs_list"], + gt_pred_data["pred_performance_fee_costs_list"]) + support_performance_fee_costs = sum(gt_pred_data["gt_performance_fee_costs_list"]) precision_interposed_vehicle_performance_fee_cost = precision_score(gt_pred_data["gt_interposed_vehicle_performance_fee_cost_list"], gt_pred_data["pred_interposed_vehicle_performance_fee_cost_list"]) @@ -1221,7 +1220,7 @@ def calculate_metrics_by_provider(audit_file_path: str = r"/data/aus_prospectus/ {"provider_id": provider_id, "provider_name": provider_name, "item": "administration_fees", "precision": precision_administration_fees, "recall": recall_administration_fees, "f1": f1_administration_fees, "accuracy": accuracy_administration_fees, "support": support_administration_fees}, {"provider_id": provider_id, "provider_name": provider_name, "item": "minimum_initial_investment", "precision": precision_miminimum_initial_investment, "recall": recall_miminimum_initial_investment, "f1": f1_miminimum_initial_investment, "accuracy": 
accuracy_miminimum_initial_investment, "support": support_miminimum_initial_investment}, {"provider_id": provider_id, "provider_name": provider_name, "item": "benchmark_name", "precision": precision_benchmark_name, "recall": recall_benchmark_name, "f1": f1_benchmark_name, "accuracy": accuracy_benchmark_name, "support": support_benchmark_name}, - {"provider_id": provider_id, "provider_name": provider_name, "item": "performance_fee", "precision": precision_performance_fee, "recall": recall_performance_fee, "f1": f1_performance_fee, "accuracy": accuracy_performance_fee, "support": support_performance_fee}, + {"provider_id": provider_id, "provider_name": provider_name, "item": "performance_fee_costs", "precision": precision_performance_fee_costs, "recall": recall_performance_fee_costs, "f1": f1_performance_fee_costs, "accuracy": accuracy_performance_fee_costs, "support": support_performance_fee_costs}, {"provider_id": provider_id, "provider_name": provider_name, "item": "interposed_vehicle_performance_fee_cost", "precision": precision_interposed_vehicle_performance_fee_cost, "recall": recall_interposed_vehicle_performance_fee_cost, "f1": f1_interposed_vehicle_performance_fee_cost, "accuracy": accuracy_interposed_vehicle_performance_fee_cost, "support": support_interposed_vehicle_performance_fee_cost}, {"provider_id": provider_id, "provider_name": provider_name, "item": "buy_spread", "precision": precision_buy_spread, "recall": recall_buy_spread, "f1": f1_buy_spread, "accuracy": accuracy_buy_spread, "support": support_buy_spread}, @@ -1306,12 +1305,24 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data def is_equal(gt_value, pred_value, data_point: str = ""): - if gt_value is not None and len(str(gt_value)) > 0 and \ - pred_value is not None and len(str(pred_value)) > 0: + if gt_value is not None and len(str(gt_value).strip()) > 0 and \ + pred_value is not None and len(str(pred_value).strip()) > 0: if gt_value == "0.0": gt_value = "0" if 
pred_value == "0.0": pred_value = "0" + if data_point not in ["benchmark_name"]: + try: + gt_num = float(gt_value) + # round to 4 decimal places + gt_value = round(gt_num, 4) + except Exception as e: + pass + try: + pred_value = float(pred_value) + pred_value = round(pred_value, 4) + except Exception as e: + pass if gt_value == pred_value: return True if data_point == "benchmark_name": @@ -1332,9 +1343,47 @@ def clean_text(text: str): text = re.sub(r"\W", " ", text) text = re.sub(r"\s+", " ", text) return text + + +def merge_inference_data(): + file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250308220117.xlsx" + file2 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_5_documents_by_text_20250311165607.xlsx" + columns = [ + "doc_id", + "raw_fund_name", + "fund_id", + "fund_name", + "raw_share_name", + "sec_id", + "sec_name", + "management_fee_and_costs", + "management_fee", + "administration_fees", + "minimum_initial_investment", + "benchmark_name", + "performance_fee_costs", + "interposed_vehicle_performance_fee_cost", + "buy_spread", + "sell_spread", + "total_annual_dollar_based_charges" + ] + + file1_data_df = pd.read_excel(file1, sheet_name="total_mapping_data") + file1_data_df = file1_data_df[columns] + file2_data_df = pd.read_excel(file2, sheet_name="total_mapping_data") + file2_data_df = file2_data_df[columns] + total_data_df = pd.concat([file1_data_df, file2_data_df]) + total_data_df.reset_index(drop=True, inplace=True) + + output_folder = r"/data/aus_prospectus/output/mapping_data/total/" + output_file = os.path.join(output_folder, "merged_mapping_data_info_46_documents_by_text.xlsx") + with pd.ExcelWriter(output_file) as f: + total_data_df.to_excel(f, index=False, sheet_name="total_mapping_data") + if __name__ == "__main__": + # merge_inference_data() # adjust_column_order() # set_mapping_to_data_side_documents_data() @@ -1349,7 +1398,7 @@ if __name__ == "__main__": audit_file_path: str = 
r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250308220117.xlsx" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged_mapping_data_info_46_documents_by_text.xlsx" verify_data_sheet: str = "total_mapping_data" # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" verify_document_list_file_list = [None, diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index b6dc778..91c29b1 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -1,8 +1,8 @@ { - "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, + "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar", "administration fees and costs", "Administration fee", "Administration fees"]}, "management_fee_and_costs": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]}, "management_fee": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]}, - "performance_fee": {"english": ["performance fee", "performance fees"]}, + 
"performance_fee_costs": {"english": ["performance fee", "performance fees"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, diff --git a/configuration/aus_prospectus/datapoint_level.json b/configuration/aus_prospectus/datapoint_level.json index fdb1208..ddf0df3 100644 --- a/configuration/aus_prospectus/datapoint_level.json +++ b/configuration/aus_prospectus/datapoint_level.json @@ -2,7 +2,7 @@ "total_annual_dollar_based_charges": "share_level", "management_fee_and_costs": "share_level", "management_fee": "share_level", - "performance_fee": "share_level", + "performance_fee_costs": "share_level", "buy_spread": "share_level", "sell_spread": "share_level", "administration_fees": "share_level", diff --git a/configuration/aus_prospectus/datapoint_name.json b/configuration/aus_prospectus/datapoint_name.json index 893c4c7..033ac37 100644 --- a/configuration/aus_prospectus/datapoint_name.json +++ b/configuration/aus_prospectus/datapoint_name.json @@ -2,7 +2,7 @@ "management_fee_and_costs": "management fee and costs", "management_fee": "management fee", "administration_fees": "administration fee", - "performance_fee": "performance fee", + "performance_fee_costs": "performance fee", "interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost", "buy_spread": "buy spread", "sell_spread": "sell spread", diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index 7d913ef..06d8db0 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ b/configuration/aus_prospectus/datapoint_reported_name.json @@ -1,8 +1,8 @@ { - "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", 
"total annual dollar based charges ($)","total annual dollar"]}, + "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar", "administration fees and costs", "Administration fee", "Administration fees"]}, "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]}, "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]}, - "performance_fee": {"english": ["performance fee", "performance fees"]}, + "performance_fee_costs": {"english": ["performance fee", "performance fees"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, diff --git a/configuration/aus_prospectus/datapoint_type.json b/configuration/aus_prospectus/datapoint_type.json index f38a099..c1bddea 100644 --- a/configuration/aus_prospectus/datapoint_type.json +++ b/configuration/aus_prospectus/datapoint_type.json @@ -2,7 +2,7 @@ "total_annual_dollar_based_charges": "float", "management_fee_and_costs": "float", "management_fee": "float", - "performance_fee": "float", + "performance_fee_costs": "float", "buy_spread": "float", "sell_spread": "float", "administration_fees": "float", diff --git a/configuration/aus_prospectus/domicile_datapoints.json 
b/configuration/aus_prospectus/domicile_datapoints.json index 2310ce4..c996869 100644 --- a/configuration/aus_prospectus/domicile_datapoints.json +++ b/configuration/aus_prospectus/domicile_datapoints.json @@ -24,7 +24,7 @@ "total_annual_dollar_based_charges", "management_fee_and_costs", "management_fee", - "performance_fee", + "performance_fee_costs", "buy_spread", "sell_spread", "administration_fees", diff --git a/core/data_extraction.py b/core/data_extraction.py index ecd51b5..80486e6 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -117,6 +117,8 @@ class DataExtraction: if self.doc_source == "aus_prospectus" and self.document_category.upper() == "MIS": if "administration_fees" in list(datapoint_page_info.keys()): datapoint_page_info.pop("administration_fees") + if "total_annual_dollar_based_charges" in list(datapoint_page_info.keys()): + datapoint_page_info.pop("total_annual_dollar_based_charges") return datapoint_page_info def get_investment_objective_pages(self): @@ -282,6 +284,8 @@ class DataExtraction: keys = list(data_item.keys()) if "administration_fees" in keys: data_item.pop("administration_fees") + if "total_annual_dollar_based_charges" in keys: + data_item.pop("total_annual_dollar_based_charges") keys = [key for key in list(data_item.keys()) if key not in ["fund_name", "share_name"]] if len(keys) == 0: remove_items.append(data_item) @@ -327,7 +331,9 @@ class DataExtraction: dp_keys = [key for key in keys if key not in ["fund_name", "share_name", "management_fee_and_costs", - "management_fee"]] + "management_fee", + "buy_spread", + "sell_spread"]] for dp_key in dp_keys: if dp_key not in datapoint_list_with_production_name: datapoint_list_with_production_name.append(dp_key) @@ -608,7 +614,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 16: + # if page_num not in [4, 5]: # continue if page_num in handled_page_num_list: 
continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index f6a9c2e..bb08215 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -70,7 +70,6 @@ "management_fee_and_costs": "Management fee and costs is share level data.", "management_fee": "Management fee is share level data.", "performance_fee_costs": "Performance fee costs is share class level data.", - "performance_fee": "Performance fees is share class level data.", "buy_spread": "Buy spread is share class level data.", "sell_spread": "Sell spread is share class level data.", "establishment_fee": "Establishment fee is share class level data.", @@ -97,7 +96,6 @@ "management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.", "management_fee": "Management fee is belong to percentage number, the value should be less than 100.", "performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.", - "performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.", "buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.", "sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.", "establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.", @@ -168,7 +166,7 @@ "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. 
\n", "---Example End---", "The output should be:", - "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}", + "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee_costs\": 0.06}]}", "\n", "C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".", "The management_fee is the value of \"Management fee (% pa)\".", @@ -177,7 +175,7 @@ "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", "---Example End---", "The output should be:", - "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", + "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", "D. 
If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", @@ -209,9 +207,9 @@ "---Example 1 End---", "For this example, please ignore the \"Total investment fees and costs\" and \"Transaction costs\" columns, ", "just output the values from \"Investment fees and costs (excl Performance Fees)\" as management_fee and management_fee_and_costs, ", - "output the values from \"Performance Fee\" as performance_fee.", + "output the values from \"Performance Fee\" as performance_fee_costs.", "The output should be:", - "{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}", + "{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee_costs\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee_costs\": 0.18}]}", "\n", "G. If the management fee/ management fee and costs is with the range, e.g. 
0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.", "---Example 1 Start---", @@ -236,22 +234,22 @@ "---Example 1 End---", "The column: \"Equals investment fees and costs\" is the sum of \"Performance fee\" and \"Plus other investment fees and costs\", we should ignore the \"Performance fee\" value, just output the \"Plus other investment fees and costs\" value.", "The \"Plus other investment fees and costs\" could be the values for both of \"management fee\" and \"management fee and costs\", so the output should be:", - "{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", + "{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", "---Example 2 Start---", "MANAGEMENT COSTS AND TRANSACTION COSTS \n\nOption name Management costs \nEstimated \nperformance \nfee (pa) 1 \nTotal management\ncosts (including\nestimated performance\nfee) pa\nTransaction costs \nper transaction (%) \nMULTI-MANAGER MULTI-SECTOR (These investment options are located in the ‘Investment Options Menu’ on pages 18 to 19.) 
\nFirstChoice Wholesale Defensive 0.85% 0.85% 0.15\nFirstChoice Wholesale Conservative 0.90% 0.02%1 0.92% 1 0.15 \n", "---Example 2 End---", "The column: \"Total management costs (including estimated performance fee) pa\" is the sum of \"Management costs\" and \"Estimated performance fee (pa)\", we should ignore the \"Estimated performance fee (pa)\" value, just output the \"Management costs\" value.", "Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:", - "{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee\": 0.02}]}", + "{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}", "---Example 3 Start---", "Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) 
\nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n", "---Example 3 End---", "The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.", "Both of management_fee and management_fee_and_costs are the values for \"Management costs\".", - "So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee is 0.09.", - "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee is 0.11.", + "So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.", + "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.", "So the output should be:", - "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}", + "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}", "---Example 4 Start---", "Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) 
\n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n", "---Example 4 End---", @@ -259,7 +257,7 @@ "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", "So the output should be:", - "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", + "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", "\n", "I. 
Some table is very complex, with many data points columns, please extract the relevant values.", "---Example 1 Start---", @@ -267,12 +265,12 @@ "---Example 1 End---", "For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ", "\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ", - "\"Performance fee (p.a.)\" as performance_fee, ", + "\"Performance fee (p.a.)\" as performance_fee_costs, ", "\"Buy/sell spread (%)\" as buy_spread and sell_spread.", "If one row has 5 decimal numbers, ", "the 2nd decimal number is the administration_fees, ", "the 3rd decimal number is the management_fee_and_costs and management_fee, ", - "the 4th decimal number is the performance_fee, ", + "the 4th decimal number is the performance_fee_costs, ", "the 5th decimal number is the buy_spread and sell_spread.", "If one row has 4 decimal numbers, ", "the 2nd decimal number is the administration_fees, ", @@ -280,7 +278,7 @@ "the 4th decimal number is the buy_spread and sell_spread.", "Please always ignore the 1st decimal number, we need not the total sum values.", "The output should be:", - "{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", + "{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", 
\"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", "J. If exist **\"Maximum management fee\"** in context, please ignore relevant values.", "---Example Start---", "Fund name \nMaximum \nmanagement \nfee (p.a.) \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%", @@ -290,40 +288,72 @@ "{\"data\": []}" ], "administration_fees":[ - "Administration fees and costs is share class level data.", + "Administration fees and costs and total annual dollar-based charges are share class level data.", "Simple case:", "----Example 1 Start----", "Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n", "----Example 1 End----", + "According to example, the administration fee is $1.30 per week plus 0.29% pa, so administration_fees is 0.29, ", + "total_annual_dollar_based_charges is 1.30 * 52 = 67.6", "The output should be:", - "{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}", + "{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29, \"total_annual_dollar_based_charges\": 67.6}]}", "\n", "----Example 2 Start----", "At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)", "----Example 2 End----", - "The 
administration fee is $1.00 per week plus 0.17% pa, so the output should be:", - "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}", + "According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ", + "total_annual_dollar_based_charges is 1 * 52 = 52", + "The output should be:", + "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}", "---Example 3 Start---", "\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. 
\n", "---Example 3 End---", - "The administration fee is $1.30 per week plus 0.50% p.a., so the output should be:", - "{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.50}]}", + "According to example, the administration fee is $1.30 per week plus 0.50% p.a., so administration_fees is 0.5, ", + "total_annual_dollar_based_charges is 1.30 * 52 = 67.6", + "The output should be:", + "{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.5, \"total_annual_dollar_based_charges\": 67.6}]}", + "---Example 4 Start---", + "At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nTTR income stream Retirement income stream Reference \nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply) \nRefer to the ‘Fees and \nother costs’ section on \npages 40-46 for details \n", + "---Example 4 End---", + "According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ", + "total_annual_dollar_based_charges is 1 * 52 = 52", + "The output should be:", + "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}", "\n", "Complex cases:", "A. 
Need to add multiple numbers together.", - "----Example 1 Start----", + "---Example 1 Start---", "MLC MasterKey Super & Pension Fundamentals \n\nType of fee or cost \nOngoing annual fees and costs 1 \n\nAdministration fees and \ncosts \n\nAccount balance \n\nFirst $150,000 \n\nRemaining balance \nover $150,000 \n\nThe percentage Administration fee \ncharged to each account you have \n(excluding the fixed fee and Trustee \nLevy) is capped at $2,500 pa. \n\nPlus \n\nTrustee Levy of 0.02% pa of your \naccount balance. \n\nPlus \n\nAmount \n\nHow and when paid \n\nPercentage fee \n(% pa) \n\n0.30 \n\n0.10 \n\nAdministration fee \n\nThe Administration fee is deducted monthly from your account and will \nbe rounded off to 2 decimal points. As a result of the rounding, the total \nannual amount may slightly differ. \n\nThe percentage fee for each month is calculated using your average Super \nand Pension account balance for the previous month. \n\nThe Trustee Levy will be deducted monthly from your account balance. \n\nThe levy amount for each month is calculated using your account balance \nat the date it's deducted. \n\nYou won't see these costs as direct charges to your account. They reduce \nthe balance held in reserves used to cover certain costs related to the \nrunning of the MLC Super Fund. 
\n\n4 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement", - "----Example 1 End----", + "---Example 1 End---", "For this case, the relevant values: first: 0.30%, remaining balance over: 0.10%, Plus Trustee Levy: 0.02%.", "Please ignore the remaining balance over 0.10%, add first: 0.30% and Plus Trustee Levy: 0.02% = 0.32%", "The output should be:", - "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}" + "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}", + "---Example 2 Start---", + "Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n", + "---Example 2 End---", + "According to example, the total annual dollar-based charges is $78.00 p.a. ($1.50 per week), so total_annual_dollar_based_charges is 78.", + "Attention: about plus trustee fee of 0.0165% p.a. 
of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore this part.", + "The output should be:", + "{\"data\": [{\"fund name\": \"Hostplus Superannuation and Personal Super Plan\", \"share name\": \"Hostplus Superannuation and Personal Super Plan\", \"total_annual_dollar_based_charges\": 78}]}" + ], + "total_annual_dollar_based_charges": [ + "Total annual dollar-based charges are share class level data.", + "Its value corresponds to the administration fees and costs that are charged on a weekly basis.", + "----Example 1 Start----", + "MLC MasterKey Super & Pension Fundamentals\nType of fee or cost \nOngoing annual fees and costs 1 \nAmount \nHow and when paid \nOther administration costs paid from \nreserves of 0.00% pa of your account \nbalance. \nPlus \nA fixed fee of $1.50 per week \nThis fee is deducted monthly if your account balance is below $50,000 \nwhen the percentage administration fee is deducted. \nInvestment fees and \ncosts 2 \nInvestment fees and estimated costs \nfor MLC Horizon 4 Balanced Portfolio, \n1.20% pa. \nYou won ’ t see these fees and costs as direct charges to your account. \nThey're reflected in the daily unit price of each investment option and will \nreduce the net return on your investment \nInvestment fees and estimated costs \nfor other investment options, ranges \nfrom 0.00% pa to 2.84% pa \n(estimated). \nTransaction costs \nMLC Horizon 4 Balanced Portfolio, \n0.06% pa (estimated). \nOther investment options, ranges \nfrom 0.00% pa to 0.24% pa \n(estimated). \nYou won ’ t see these costs as direct charges to your account. They're \nreflected in the daily unit price of each investment option and will reduce \nthe net return on your investment. \nMember activity related fees and costs \nBuy-sell spread \nYou won ’ t see this fee as a direct charge to your account. 
It ’ s reflected in \nthe buy and sell unit price of each investment option when there ’ s a \ntransaction on your account. \nMLC Horizon 4 Balanced Portfolio, \n0.10%/0.10% \nOther investment options, ranges \nfrom 0.00%/0.00% to 0.30%/0.30% \nThe current buy-sell spreads of an investment option are available at \nmlc.com.au/buysellspreads \n", + "----Example 1 End----", + "According to example, the fixed fee is $1.50 per week, so total_annual_dollar_based_charges is 1.50 * 52 = 78", + "In the context, also with management fees and costs, management fee, buy_spread and sell_spread for specific fund: MLC Horizon 4 Balanced Portfolio.", + "Please output the relevant values based on specific fund name.", + "The output should be:", + "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" ], "buy_spread": [ "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" ], - "performance_fee": [ + "performance_fee_costs": [ "Performance fees is share class level data.", "If the performance fees is with the range, please ignore and output empty.", "---Example 1 Start---", @@ -404,7 +434,7 @@ "The administration_fees is \"Administration fees\"", "The management_fee is \"Investment fees\".", "The management_fee_and_costs is \"Investment fees\" + \"Estimated other investment costs\".", - "The performance_fee is \"Estimated performance fees\"", + "The performance_fee_costs is \"Estimated performance fees\"", "---Example 1 Start---", "\nInvestment option \nAdministration fees 
\nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nInternetional shares \nPerpetual Global \nInnovation Share \n0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n", "---Example 1 End---", @@ -412,9 +442,9 @@ "Please pay attention below information", "Assume the column sequence number is from 1.", "\"Administration fees\" values are as the column 1 numbers, \"Investment fees\" values are as the column 3 numbers, \"Estimated other investment costs\" values are as the column 5 numbers, \"Estimated performance fees\" values are as the column 4 numbers.", - "For fund: Perpetual Global Innovation Share, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.99, the management_fee_and_costs should be 1 = 0.99(the column 3 number) + 0.01 (the column 5 number), the performance_fee should be 2.3 (the column 4 number)", + "For fund: Perpetual Global Innovation Share, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.99, the management_fee_and_costs should be 1 = 0.99(the column 3 number) + 0.01 (the column 5 number), the performance_fee_costs should be 2.3 (the column 4 number)", "Therefore, the output should be:", - "{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"administration_fees\": 0.25}]}, {\"fund name\": \"Perpetual Global Innovation Share\", \"share 
name\": \"Perpetual Global Innovation Share\", \"management_fee_and_costs\": 1, \"management_fee\": 0.99, \"administration_fees\": 0.25, \"performance_fee\": 2.3}" + "{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"administration_fees\": 0.25}, {\"fund name\": \"Perpetual Global Innovation Share\", \"share name\": \"Perpetual Global Innovation Share\", \"management_fee_and_costs\": 1, \"management_fee\": 0.99, \"administration_fees\": 0.25, \"performance_fee_costs\": 2.3}]}" ] }, { @@ -422,7 +452,7 @@ "prompts": ["Complex management fee and costs rule:", "If the table with columns:", "\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"", - "The performance_fee is \"Estimated Performance fees\"", + "The performance_fee_costs is \"Estimated Performance fees\"", "The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. 
if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".", "For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".", @@ -435,10 +465,10 @@ "Assume the column sequence number is from 1.", "\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.", "For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ", - "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)", - "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)", + "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", + "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", "Therefore, the output should be:", - "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, 
\"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee\": 0}" + "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, 
\"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}" ] }, { @@ -470,7 +500,7 @@ "The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", "The recoverable_expenses is \"Recoverable expenses\"", "The indirect_costs is \"Estimated other indirect costs\"", - "The performance_fee is \"Peformance fees charged to the Investment Option by underlying managers\".", + "The performance_fee_costs is \"Peformance fees charged to the Investment Option by underlying managers\".", "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", "The buy_spread and sell_spread are \"Buy/sell spreads\".", "---Example 1 Start---", @@ -483,7 +513,7 @@ "The 1st number: 0.62 is the management_fee,", "the 2nd number: 0.18 is the recoverable_expenses,", "the 3rd number: 0.05 is the indirect_costs", - "the 4th number: 0.00 is the performance_fee,", + "the 4th number: 0.00 is the performance_fee_costs,", "the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", "the 6th number: 0.14 is the Transaction costs (% pa).", "the 7th number: 0.08 is the buy_spread, ", @@ -491,7 +521,7 @@ "The management_fee_and_costs is Management fee (i) + Recoverable expenses + Estimated other indirect costs = 0.62 + 0.18 + 0.05= 0.85", "**Attention: Ignore Transaction costs (% pa), the 6th number, DO NOT APPLY ITS VALUE TO CALCULATE management_fee_and_costs!!!**", "The output should be: ", - "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": 
\"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}", + "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": 
\"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}", "---Example 2 Start---", "Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n", "---Example 2 End---", @@ -500,7 +530,7 @@ "b. The algorithm to calculate management_fee_and_costs is same as Example 1.", "c. The difference is **the fund name is after the data row, e.g. 
the fund name of the first data row is: MyNorth Australian Fixed Interest Index**", "The output should be: ", - "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" + "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" ] } ] @@ -594,11 +624,10 @@ "share 2", "share 3" ], - "total_annual_dollar_based_charges_value": [125.00, 95.00, 26.00], + "total_annual_dollar_based_charges_value": [65, 57, 67.6], "management_fee_and_costs_value": [2.63, 1.58, 2.55], "management_fee_value": [0.85, 1.10, 0.23], - "performance_fee_value": [0.03, 0.21, 0.08], - "performance_fee_costs_value": [0.05, 0.25, 0.09], + 
"performance_fee_costs_value": [0.03, 0.21, 0.08], "buy_spread_value": [0.10, 0.15, 0.12], "sell_spread_value": [0.10, 0.10, 0.15], "establishment_fee_value": [0.75, 1.20, 0.25], @@ -623,7 +652,7 @@ "total_annual_dollar_based_charges": "Total annual dollar based charges", "management_fee_and_costs": "Management fee and costs", "management_fee": "Management fee", - "performance_fee": "Performance fee", + "performance_fee_costs": "Performance fee", "buy_spread": "Buy spread", "sell_spread": "Sell spread", "administration_fees": "Administration fee", diff --git a/main.py b/main.py index 9de2959..22894f9 100644 --- a/main.py +++ b/main.py @@ -1526,8 +1526,8 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1560,7 +1560,8 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - # special_doc_id_list = ["411062815"] + special_doc_id_list = ["420339794", "441280757", "454036250", "471206458", "412778803"] + # special_doc_id_list = ["441280757"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/prepare_data.py b/prepare_data.py index 72b9b1b..3b773aa 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -1725,15 +1725,21 @@ def set_provider_to_ground_truth(groud_truth_file: str, with open(groud_truth_file, "wb") as file: ground_truth_df.to_excel(file, index=False) + + +def update_data_by_latest_ground_truth(): + # TODO: update current ground truth data by the latest version + latest_ground_truth_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" if __name__ == "__main__": - set_provider_to_ground_truth( - 
groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx", - ground_truth_sheet="Sheet1", - document_mapping_file=r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx", - document_mapping_sheet="document_mapping" - ) + update_data_by_latest_ground_truth() + # set_provider_to_ground_truth( + # groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx", + # ground_truth_sheet="Sheet1", + # document_mapping_file=r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx", + # document_mapping_sheet="document_mapping" + # ) # set_mapping_to_data_side_documents_data()