In [10]:
import openpyxl
from collections import defaultdict
import pandas as pd
import statistics
import os
import re
from utils.similarity import Similarity


# Human-readable datapoint label -> column name looked up in the compared
# workbooks. Insertion order is the order in which datapoints are reported.
imp_datapoints_mapping = dict([
    ("Management Fee and Costs", "management_fee_and_costs"),
    ("Management Fee", "management_fee"),
    ("Performance fee and cost", "performance_fee_costs"),
    ("Interposed vehicle Performance fee and Costs", "interposed_vehicle_performance_fee_cost"),
    ("Administration Fee and costs", "administration_fees"),
    ("Total Annual Dollar Based Charges", "total_annual_dollar_based_charges"),
    ("Buy Spread", "buy_spread"),
    ("Sell Spread", "sell_spread"),
    ("Performance Fee", "PerformanceFeeCharged"),
    ("Minimum Initial Investment", "minimum_initial_investment"),
    ("Benchmark", "benchmark_name"),
])

# Labels of the datapoints to evaluate, in reporting order (same order as the
# mapping above, so the two can never drift apart).
imp_datapoints = list(imp_datapoints_mapping)


# Input workbooks: ground truth, generated results, and the provider lookup.
path_ground_truth = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
# path_generated_results = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx"
path_generated_results = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318203253_new.xlsx"
provider_mapping_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx"



In [11]:

# Notebook-level list of error records (one dict per mismatching datapoint)
# collected while generated values are compared with the ground truth.
message_list = []
# Notebook-level list reserved for false-negative counts; the driver cell
# re-initialises it before the metrics are printed.
total_fn = []
def load_excel(filepath, header_row_index):
    """Read the active sheet of a workbook and split it into header and data rows.

    Args:
        filepath: Path of the workbook, opened with ``data_only=True``.
        header_row_index: Zero-based index of the row that holds the column
            names. Rows before it are ignored.

    Returns:
        ``(headers, data)`` where ``headers`` is the header row as a list and
        ``data`` is a list of row lists. Every ``None`` cell is replaced by an
        empty string. ``headers`` stays empty if the sheet has no such row.
    """
    workbook = openpyxl.load_workbook(filepath, data_only=True)
    worksheet = workbook.active

    def blank_if_none(cells):
        # Empty cells come back as None; normalise them to "".
        return ["" if cell is None else cell for cell in cells]

    headers, data = [], []
    for position, cells in enumerate(worksheet.iter_rows(values_only=True)):
        if position < header_row_index:
            continue
        if position == header_row_index:
            headers = blank_if_none(cells)
        else:
            data.append(blank_if_none(cells))

    return headers, data

def index_data_by_key(data, key_index, secondary_key_index, header):
    """Index rows as ``{doc_id: {sec_name: {column: value}}}``.

    The key columns are located by header name (``"doc_id"`` and
    ``"sec_name"``). ``key_index`` and ``secondary_key_index`` are kept for
    backward compatibility and are not used.

    Args:
        data: Iterable of row sequences.
        key_index: Unused.
        secondary_key_index: Unused.
        header: Column names, positionally aligned with each row.

    Returns:
        A ``defaultdict(dict)`` keyed by integer doc id, then by share-class
        name (string). Every other column is stored after passing through
        ``convert_if_number``. Rows whose doc id or share-class name is blank
        are skipped.

    Raises:
        ValueError: If a non-blank doc id cannot be converted to ``int``.
    """
    indexed_data = defaultdict(dict)

    for row in data:
        # Reset per row so a key can never leak in from the previous row and
        # a missing key column cannot leave the names unbound.
        primary_key = None
        secondary_key = None
        row_data = {}
        # zip() also tolerates rows that are longer than the header.
        for column, cell in zip(header, row):
            if column == "doc_id":
                primary_key = cell
            elif column == "sec_name":
                # share class should be the comparison level and key
                secondary_key = str(cell)
            else:
                row_data[column] = convert_if_number(cell)

        # Blank rows (e.g. trailing empty spreadsheet rows) carry "" in the
        # key cells; skip them instead of failing on int("").
        doc_id_is_blank = primary_key is None or (
            isinstance(primary_key, str) and not primary_key.strip()
        )
        if doc_id_is_blank or not secondary_key:
            continue

        indexed_data[int(primary_key)][secondary_key] = row_data
    return indexed_data

def convert_if_number(value):
    """Normalise a numeric-looking value; return anything else unchanged.

    Values accepted by ``float()`` are rounded to two decimals and returned
    as ``int`` when the rounded value is whole, otherwise as ``float``.

    Args:
        value: Any object (number, string, ``None``, ...).

    Returns:
        The rounded ``int``/``float``, or ``value`` itself (not a string copy)
        when it cannot be converted to a finite number.
    """
    try:
        float_value = round(float(value), 2)
        # int() raises ValueError for NaN and OverflowError for +/-infinity,
        # so strings such as "nan" or "inf" fall through and stay unchanged.
        int_value = int(float_value)
    except (ValueError, TypeError, OverflowError):
        return value
    return int_value if int_value == float_value else float_value

def compare_values(value1, value2):
    """Return whether two values are equal after numeric normalisation.

    Both sides go through ``convert_if_number`` first, so values that
    normalise to the same number compare equal; anything that is not numeric
    is compared as-is.
    """
    normalised_left = convert_if_number(value1)
    normalised_right = convert_if_number(value2)
    return normalised_left == normalised_right

def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):
    """Score generated values against the ground truth, one share class at a time.

    The ground truth drives the iteration: for every document (optionally
    restricted to ``document_list``) and every share class that also exists in
    ``generated_results``, each column of ``intersection_list`` that is a
    mapped value of ``imp_datapoints`` is classified as:

    * TN - truth and generated are both empty
    * FP - generated has a value but truth is empty or different
    * FN - truth has a value but generated is empty
    * TP - both agree (``benchmark_name`` also accepts a ``compare_text`` match)

    ``SUPPORT`` counts the compared cells whose truth is not empty. Every FP
    and FN is appended as a dict to the module-level ``message_list``.

    Args:
        ground_truth: ``{doc_id: {sec_name: {column: value}}}``.
        generated_results: Same shape as ``ground_truth``.
        headers: Column names; one counter dict is created for each except
            ``"doc_id"``.
        doc_id_index: Unused.
        fund_name_index: Unused.
        intersection_list: Columns present in both workbooks.
        funds_matched: Ignored; counting restarts from zero.
        funds_not_matched: Ignored; counting restarts from zero.
        document_list: Optional collection of doc ids as strings; ``None``
            means every document is evaluated.

    Returns:
        ``(results, message_list, funds_matched, funds_not_matched)``.
    """
    # The incoming counters are overwritten on purpose (original behaviour).
    funds_matched = 0
    funds_not_matched = 0

    # One counter dict per column, except for the document id column.
    results = {
        column: {"TP": 0, "TN": 0, "FP": 0, "FN": 0, "SUPPORT": 0}
        for column in headers
        if column != "doc_id"
    }

    def log_mismatch(data_point, doc_id, sec_name, truth, generated, error):
        # Record one mismatch in the shared notebook-level message list.
        message_list.append({"data_point": data_point, "doc_id": doc_id, "sec_name": sec_name,
                             "truth": truth, "generated": generated, "error": error})

    for doc_id, secs in ground_truth.items():
        if document_list is not None and str(doc_id) not in document_list:
            continue

        if doc_id not in generated_results:
            # If the entire document is not found, count all funds as not matched
            funds_not_matched += len(secs)
            continue

        generated_secs = generated_results[doc_id]
        for sec_name, truth_values in secs.items():
            if sec_name not in generated_secs:
                funds_not_matched += 1
                continue

            generated_values = generated_secs[sec_name]
            for i in intersection_list:
                for keys in imp_datapoints:
                    if i != imp_datapoints_mapping[keys]:
                        continue

                    truth = str(truth_values[i]).strip()
                    generated = str(generated_values[i]).strip()
                    counts = results[i]

                    if truth == "":
                        if generated == "":
                            counts["TN"] += 1
                        else:
                            counts["FP"] += 1
                            log_mismatch(i, doc_id, sec_name, truth, generated,
                                         "Truth is null and generated is not null")
                        continue

                    if truth == generated:
                        counts["TP"] += 1
                    elif generated == "":
                        counts["FN"] += 1
                        log_mismatch(i, doc_id, sec_name, truth, generated,
                                     "Generated is null and truth is not null")
                    elif i == "benchmark_name" and compare_text(truth, generated):
                        # Benchmark names are free text: a fuzzy match counts.
                        counts["TP"] += 1
                    else:
                        counts["FP"] += 1
                        log_mismatch(i, doc_id, sec_name, truth, generated,
                                     "Truth is not equal with generated")
                    # Support is only counted when the truth has a value.
                    counts["SUPPORT"] += 1
            funds_matched += 1

    return results, message_list, funds_matched, funds_not_matched

def clean_text(text: str):
    """Replace every non-word character with a space and collapse whitespace runs.

    ``None`` and the empty string are returned unchanged. The result is not
    stripped, so it may start or end with a single space.
    """
    if text is None or len(text) == 0:
        return text
    without_punctuation = re.sub(r"\W", " ", text)
    return re.sub(r"\s+", " ", without_punctuation)

def compare_text(source_text, target_text):
    """Return True when two free-text values should count as the same.

    Both texts are normalised with ``clean_text``. They match when the
    cleaned texts are equal, when one contains the other, or when the Jaccard
    similarity of their lower-cased word lists is above 0.8.

    Args:
        source_text: First text (may be ``None``).
        target_text: Second text (may be ``None``).

    Returns:
        ``True`` or ``False`` (never ``None``).
    """
    cleaned_source = clean_text(source_text)
    cleaned_target = clean_text(target_text)

    if cleaned_source == cleaned_target:
        return True

    # A text that is missing or blank after cleaning (e.g. "-" or "N/A"
    # reduced to spaces) would be a substring of almost anything; do not let
    # it match a different, real value.
    if not cleaned_source or not cleaned_target:
        return False
    if not cleaned_source.strip() or not cleaned_target.strip():
        return False

    if cleaned_source in cleaned_target or cleaned_target in cleaned_source:
        return True

    # NOTE(review): Similarity is a project helper; jaccard_similarity is
    # assumed to return a numeric score - confirm against utils.similarity.
    similarity = Similarity()
    jacard_score = similarity.jaccard_similarity(
        cleaned_source.lower().split(), cleaned_target.lower().split()
    )
    return bool(jacard_score > 0.8)
 
 
def calculate_metrics(tp, tn, fp, fn):
    """Derive precision, recall, accuracy and F1 from confusion counts.

    Any ratio whose denominator is zero is reported as 0.

    Returns:
        ``(precision, recall, accuracy, f1_score)``.
    """
    def ratio(numerator, denominator):
        # Guarded division: an empty denominator yields 0 instead of raising.
        return numerator / denominator if denominator != 0 else 0

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    accuracy = ratio(tp + tn, tp + tn + fp + fn)
    f1_score = ratio(2 * (precision * recall), precision + recall)
    return precision, recall, accuracy, f1_score

def print_metrics_table(data):
    """Print a per-datapoint metrics table plus a TOTAL row and return the rows.

    Datapoints are taken from ``imp_datapoints`` in order. A datapoint is
    skipped when it (or one of its counters) is missing from ``data`` or when
    its ``SUPPORT`` is 0. The TOTAL row holds the unweighted mean of the
    per-datapoint scores and the sum of the counts.

    All totals are computed from this call's rows only, so repeated calls do
    not influence each other.

    Args:
        data: ``{column: {"TP", "TN", "FP", "FN", "SUPPORT"}}`` as produced by
            ``compare_data``.

    Returns:
        A list of dicts, one per reported datapoint followed by the TOTAL row,
        with the keys ``Datapoint``, ``F1-Score``, ``Precision``, ``Recall``,
        ``Accuracy``, ``SUPPORT``, ``TP``, ``TN``, ``FP`` and ``FN``.
    """
    row_format = "{:<50}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.0f}\t{:<10.0f}\t{:<10.0f}\t{:<10.0f}\t{:<10.0f}"

    # Print table headers
    print("{:<50}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}".format("Metric", "F1-Score", "Precision", "Recall", "Accuracy", "SUPPORT", "TP", "TN", "FP", "FN"))

    # Calculate and print metrics for each item
    metrics_list = []
    for label in imp_datapoints:
        try:
            key = imp_datapoints_mapping[label]
            values = data[key]
            support = values["SUPPORT"]
            tp, tn, fp, fn = values["TP"], values["TN"], values["FP"], values["FN"]
        except KeyError:
            # The datapoint is not among the compared columns; nothing to report.
            continue
        if support == 0:
            continue

        precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)
        metrics_list.append({"Datapoint": key, "F1-Score": f1_score, "Precision": precision, "Recall": recall, "Accuracy": accuracy, "SUPPORT": support, "TP": tp, "TN": tn, "FP": fp, "FN": fn})

        if support > 0 and key > "":
            print(row_format.format(key, f1_score, precision, recall, accuracy, support, tp, tn, fp, fn))

    def mean_of(field):
        # statistics.mean() raises on an empty sequence; report 0 instead.
        observed = [row[field] for row in metrics_list]
        return statistics.mean(observed) if observed else 0

    def sum_of(field):
        return sum(row[field] for row in metrics_list)

    # Build the TOTAL row before it is appended, so it never counts itself.
    total_metrics = {
        "Datapoint": "TOTAL",
        "F1-Score": mean_of("F1-Score"),
        "Precision": mean_of("Precision"),
        "Recall": mean_of("Recall"),
        "Accuracy": mean_of("Accuracy"),
        "SUPPORT": sum_of("SUPPORT"),
        "TP": sum_of("TP"),
        "TN": sum_of("TN"),
        "FP": sum_of("FP"),
        "FN": sum_of("FN"),
    }
    metrics_list.append(total_metrics)
    print(row_format.format("TOTAL", total_metrics["F1-Score"], total_metrics["Precision"], total_metrics["Recall"], total_metrics["Accuracy"], total_metrics["SUPPORT"], total_metrics["TP"], total_metrics["TN"], total_metrics["FP"], total_metrics["FN"]))
    return metrics_list
 
def create_metrics_df(data):
    """Build, print and return a DataFrame of per-datapoint metrics.

    One row is produced for every label in ``imp_datapoints`` whose counters
    exist in ``data`` and whose ``SUPPORT`` is greater than 0. Labels with a
    missing mapping, column or counter are skipped.

    Returns:
        A DataFrame with the columns ``Metric``, ``Precision``, ``Recall``,
        ``Accuracy``, ``F1-Score`` and ``SUPPORT``.
    """
    rows = []

    for key in imp_datapoints:
        try:
            values = data[imp_datapoints_mapping[key]]
            precision, recall, accuracy, f1_score = calculate_metrics(
                values['TP'], values['TN'], values['FP'], values['FN']
            )
            support = values["SUPPORT"]
        except KeyError:
            continue

        # Only add rows where SUPPORT > 0
        if support > 0:
            rows.append({
                "Metric": key,
                "Precision": precision,
                "Recall": recall,
                "Accuracy": accuracy,
                "F1-Score": f1_score,
                "SUPPORT": support,
            })

    # Create a DataFrame from the list of rows with a clean 0..n-1 index.
    df_metrics = pd.DataFrame(rows).reset_index(drop=True)
    print(df_metrics)
    return df_metrics



def get_provider_mapping(file_path):
    """Load the provider workbook and return its (Docid, ProviderName) pairs.

    The rows are grouped by both columns, so each pair appears once in the
    returned frame.

    Returns:
        A DataFrame with exactly the columns ``Docid`` and ``ProviderName``.
    """
    key_columns = ["Docid", "ProviderName"]
    provider_rows = pd.read_excel(file_path)
    unique_pairs = provider_rows.groupby(key_columns).first().reset_index()
    return unique_pairs[key_columns]


def get_provider_names(generated_results_indexed, df_provider_mapping):
    """Group document ids by provider name.

    Args:
        generated_results_indexed: Mapping (or iterable) of doc ids.
        df_provider_mapping: DataFrame with the columns ``Docid`` and
            ``ProviderName``.

    Returns:
        ``{provider_name: [doc_id, ...]}`` in iteration order. A document
        without a row in the mapping is left out. When a document has several
        rows, the first one decides the provider.

    Raises:
        KeyError: If ``df_provider_mapping`` lacks one of the two columns
            (previously this was silently swallowed and produced ``{}``).
    """
    providers_dict = {}
    for doc_id in generated_results_indexed:
        provider_names = df_provider_mapping.loc[
            df_provider_mapping["Docid"] == doc_id, "ProviderName"
        ]
        if provider_names.empty:
            # No provider is known for this document; skip it explicitly.
            continue
        providers_dict.setdefault(provider_names.iloc[0], []).append(doc_id)
    return providers_dict

def get_specified_doc_data(results, doc_list):
    """Return the subset of ``results`` whose keys appear in ``doc_list``.

    Ids that are not in ``results`` are ignored; the returned dict follows the
    order of ``doc_list``.
    """
    return {doc_id: results[doc_id] for doc_id in doc_list if doc_id in results}


In [12]:

"""
Blade's updates
1. Set the secondary key to be the share class name, instead of the fund name
2. Remove the data point which support is 0 to calculate the metrics
3. Add the message list to store the error message
4. Support save metrics/ error message to excel file
5. Support statistics for different document list
6. Set F1-Score to the first column in the metrics table
"""

funds_matched = 0
funds_not_matched = 0

# Load the files
headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)
headers_gen, generated_results_data = load_excel(path_generated_results, 0)

# Assuming doc_id is the first column and fund_name is the second column
doc_id_index = 0
fund_name_index = 1

# Index the data
ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)
generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)

# Columns present in both workbooks; only these can be compared.
intersection = set(headers_gen).intersection(headers_gt)

# Convert the result back to a list (if you need it as a list)
intersection_list = list(intersection)

total_fn = []

# df_provider_mapping = get_provider_mapping(provider_mapping_file_path)

# all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)


# for provider_name in all_provider_dict:
# provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])
# comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)
# print("\n")
# print("\n")
# print("Provider Name - " + provider_name + "\t Number of Docs - " + str(len(all_provider_dict[provider_name])))
# #create_metrics_df(comparison_results)
# print_metrics_table(comparison_results)
# print("Total Funds Matched - " + str(funds_matched) + "\nTotal Funds Not Matched - " + str(funds_not_matched))
# print("Percentage of Funds Matched - " + str((funds_matched/(funds_matched + funds_not_matched))*100))



print("\n")
print("\n")
# None means "evaluate every document"; the other entries are text files with
# one doc id per line.
document_list_file_list = [None,
                           "./sample_documents/aus_prospectus_29_documents_sample.txt",
                           "./sample_documents/aus_prospectus_17_documents_sample.txt"]

output_metrics_folder = r"/data/aus_prospectus/output/metrics_data/"
os.makedirs(output_metrics_folder, exist_ok=True)
generated_file_base_name = os.path.basename(path_generated_results).replace(".xlsx", "")

for document_list_file in document_list_file_list:
    document_list = None
    if document_list_file is not None:
        with open(document_list_file, "r", encoding="utf-8") as f:
            # Blank lines are not documents; keep them out of the list (and
            # out of the document count used in the output file name).
            document_list = [line.strip() for line in f if line.strip()]

    print("All Providers Results: ")
    print("Document List File - ", document_list_file)

    # Start every run with an empty FN accumulator so totals of an earlier
    # document list cannot leak into this one.
    total_fn = []

    # The message list keeps growing across runs; remember where this run
    # starts so that only its own messages are written to its workbook.
    messages_before = len(message_list)
    comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed,
                                                                                      generated_results_indexed,
                                                                                      headers_gt, doc_id_index,
                                                                                      fund_name_index,
                                                                                      intersection_list,
                                                                                      funds_matched,
                                                                                      funds_not_matched,
                                                                                      document_list)
    run_messages = message_list[messages_before:]

    metrics_list = print_metrics_table(comparison_results)
    print("Total Funds Matched - " + str(funds_matched) + "\nTotal Funds Not Matched - " + str(funds_not_matched))
    # Guard against an empty selection (no fund compared at all).
    total_funds = funds_matched + funds_not_matched
    matched_percentage = (funds_matched / total_funds) * 100 if total_funds else 0
    print("Percentage of Funds Matched - " + str(matched_percentage))

    metrics_df = pd.DataFrame(metrics_list)
    message_df = pd.DataFrame(run_messages)

    metrics_file_name = f"metrics_{generated_file_base_name}"
    if document_list_file is not None:
        metrics_file_name = f"{metrics_file_name}_{len(document_list)}_documents.xlsx"
    else:
        metrics_file_name = f"{metrics_file_name}_all_documents.xlsx"
    metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)
    with pd.ExcelWriter(metrics_file_path) as writer:
        metrics_df.to_excel(writer, sheet_name="metrics_data", index=False)
        message_df.to_excel(writer, sheet_name="message_data", index=False)






All Providers Results: 
Document List File - None
Metric 	F1-Score 	Precision 	Recall 	Accuracy 	SUPPORT 	TP 	TN 	FP 	FN 
management_fee_and_costs 	0.9354 	0.8870 	0.9893 	0.8786 	419 	369 	0 	47 	4 
management_fee 	0.9591 	0.9303 	0.9898 	0.9214 	419 	387 	0 	29 	4 
performance_fee_costs 	0.9261 	0.8955 	0.9590 	0.9024 	285 	257 	122 	30 	11 
interposed_vehicle_performance_fee_cost 	0.9863 	0.9730 	1.0000 	0.9952 	73 	72 	346 	2 	0 
administration_fees 	0.9940 	0.9881 	1.0000 	0.9976 	83 	83 	336 	1 	0 
total_annual_dollar_based_charges 	1.0000 	1.0000 	1.0000 	1.0000 	70 	70 	350 	0 	0 
buy_spread 	0.9486 	0.9171 	0.9822 	0.9143 	362 	332 	52 	30 	6 
sell_spread 	0.9516 	0.9227 	0.9824 	0.9190 	362 	334 	52 	28 	6 
minimum_initial_investment 	0.9544 	0.9638 	0.9452 	0.9333 	310 	293 	99 	11 	17 
benchmark_name 	0.8971 	0.8652 	0.9313 	0.9333 	142 	122 	270 	19 	9 
TOTAL 	0.9553 	0.9343 	0.9779 	0.9395 	2525 	2319 	1627 	197 	57 
Total Funds Matched - 420
Total Funds Not Matched -

In [13]:
# Show only the mismatches that were recorded for the performance-fee datapoint.
for message_list_element in message_list:
    if message_list_element["data_point"] != "performance_fee_costs":
        continue
    print(message_list_element)

{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}
{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}
{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}
{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}
{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error'

In [7]:
import pandas as pd


# Convert data to DataFrame
df = pd.DataFrame(message_list)

# Sort DataFrame by 'doc_id'. An empty message list produces a frame without
# that column, in which case there is nothing to sort.
df_sorted = df.sort_values(by=['doc_id']) if 'doc_id' in df.columns else df

# Save DataFrame to Excel file (the folder is derived from the file path so
# the two can never disagree).
output_filename = r"/data/aus_prospectus/output/error_analysis/anomalies_found.xlsx"
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
df_sorted.to_excel(output_filename, index=False)

print(f"Excel file '{output_filename}' has been created successfully.")


Excel file '/data/aus_prospectus/output/error_analysis/anomalies_found.xlsx' has been created successfully.
