{ "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import openpyxl\n", "from collections import defaultdict\n", "import pandas as pd\n", "import statistics\n", "import os\n", "import re\n", "from utils.similarity import Similarity\n", "\n", "\n", "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", " \"Minimum Initial Investment\", \"Benchmark\"]\n", "\n", "\n", "imp_datapoints_mapping = {\n", " \"Management Fee and Costs\": \"management_fee_and_costs\",\n", " \"Management Fee\": \"management_fee\",\n", " \"Performance fee and cost\": \"performance_fee_costs\",\n", " \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", " \"Administration Fee and costs\": \"administration_fees\",\n", " \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", " \"Buy Spread\": \"buy_spread\",\n", " \"Sell Spread\": \"sell_spread\",\n", " \"Performance Fee\": \"PerformanceFeeCharged\",\n", " \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", " \"Benchmark\": \"benchmark_name\"\n", "}\n", "\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n", "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318203253_new.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "\n", "message_list = []\n", "total_fn = []\n", "def load_excel(filepath, header_row_index):\n", " \"\"\"Load an Excel file and use the specified row as the header.\"\"\"\n", " wb = openpyxl.load_workbook(filepath, data_only=True)\n", " sheet = wb.active\n", " headers = []\n", " data = []\n", "\n", " for index, row in enumerate(sheet.iter_rows(values_only=True)):\n", " if index == header_row_index:\n", " headers = [cell if cell is not None else \"\" for cell in row]\n", " elif index > header_row_index:\n", " data.append([cell if cell is not None else \"\" for cell in row])\n", "\n", " return headers, data\n", "\n", "def index_data_by_key(data, key_index, secondary_key_index, header):\n", " \"\"\"Index data by primary and secondary keys (doc_id and sec_name).\"\"\"\n", " indexed_data = defaultdict(dict)\n", " \n", " for row in data:\n", " row_data = {}\n", " # Store the entire row, which will be useful for full row comparison\n", " for i in range(len(row)):\n", " if header[i] == \"doc_id\":\n", " primary_key = int(row[i])\n", " elif header[i] == \"sec_name\":\n", " # share class should be the comparison level and key\n", " secondary_key = str(row[i])\n", " else:\n", " row_data[header[i]] = convert_if_number(row[i])\n", " if secondary_key is None or (isinstance(secondary_key, str) and len(secondary_key) == 0):\n", " continue\n", " indexed_data[primary_key][secondary_key] = row_data\n", " return indexed_data\n", "\n", "def convert_if_number(value):\n", " \"\"\"Attempt to convert value to a float or int, otherwise return as string.\"\"\"\n", " try:\n", " float_value = round(float(value), 2)\n", " int_value = int(float_value)\n", " return int_value if int_value == float_value else float_value\n", " except (ValueError, TypeError):\n", " return value\n", "\n", "def compare_values(value1, value2):\n", " \"\"\"Convert values to numbers if possible and compare, otherwise compare as strings.\"\"\"\n", " value1 = convert_if_number(value1)\n", " value2 = convert_if_number(value2)\n", " return value1 == value2\n", "\n", "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):\n", " \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n", " results = {}\n", " funds_matched, funds_not_matched = 0, 0\n", " # Initialize result dictionaries for each column except 'doc_id'\n", " for keys in headers:\n", " if keys != \"doc_id\":\n", " results[keys] = {}\n", " results[keys][\"TP\"] = 0\n", " results[keys][\"TN\"] = 0\n", " results[keys][\"FP\"] = 0\n", " results[keys][\"FN\"] = 0\n", " results[keys][\"SUPPORT\"] = 0\n", " \n", " # Iterate over the generated results instead of the ground truth\n", " \n", " total = 0\n", " # print(document_list)\n", " for doc_id, secs in ground_truth.items():\n", " if document_list is not None and str(doc_id) not in document_list:\n", " continue\n", " if doc_id in generated_results:\n", " for sec_name, truth_values in secs.items():\n", " if sec_name in generated_results[doc_id]:\n", " generated_values = generated_results[doc_id][sec_name]\n", " # Compare all other columns\n", " for i in intersection_list:\n", " for keys in imp_datapoints:\n", " if i == imp_datapoints_mapping[keys]:\n", " truth = str(truth_values[i]).strip()\n", " generated = str(generated_values[i]).strip()\n", " total = total +1\n", " if truth == \"\":\n", " if truth == generated:\n", " results[i][\"TN\"] = results[i][\"TN\"] + 1\n", " else:\n", " results[i][\"FP\"] = results[i][\"FP\"] + 1\n", " # if \"Performance fee and cost\" in keys:\n", " debug = 0\n", " # print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], sec_name) \n", " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n", " \"truth\": truth, \"generated\": generated, \"error\": \"Truth is null and generated is not null\"}\n", " message_list.append(message) \n", " else:\n", " if truth == generated:\n", " results[i][\"TP\"] = results[i][\"TP\"] + 1\n", " elif generated != \"\":\n", " if i == \"benchmark_name\" and compare_text(truth, generated):\n", " results[i][\"TP\"] = results[i][\"TP\"] + 1\n", " else:\n", " results[i][\"FP\"] = results[i][\"FP\"] + 1\n", " # if \"Performance fee and cost\" in keys:\n", " debug = 0\n", " # print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", sec_name)\n", " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n", " \"truth\": truth, \"generated\": generated, \"error\": \"Truth is not equal with generated\"}\n", " message_list.append(message)\n", " else:\n", " results[i][\"FN\"] = results[i][\"FN\"] + 1\n", " # if \"Performance fee and cost\" in keys:\n", " debug = 0\n", " # print(keys, \" - \" , doc_id, \" generated is null and truth is - \", truth_values[i], sec_name)\n", " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n", " \"truth\": truth, \"generated\": generated, \"error\": \"Generated is null and truth is not null\"}\n", " message_list.append(message)\n", " results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n", " funds_matched += 1\n", " else:\n", " funds_not_matched += 1\n", " else:\n", " # If the entire document is not found, count all funds as not matched\n", " funds_not_matched += len(secs)\n", " return results, message_list, funds_matched, funds_not_matched\n", "\n", "def clean_text(text: str):\n", " if text is None or len(text) == 0:\n", " return text\n", " text = re.sub(r\"\\W\", \" \", text)\n", " text = re.sub(r\"\\s+\", \" \", text)\n", " return text\n", "\n", "def compare_text(source_text, target_text):\n", " source_text = clean_text(source_text)\n", " target_text = clean_text(target_text)\n", " if source_text == target_text or source_text in target_text or target_text in source_text:\n", " return True\n", " similarity = Similarity()\n", " jacard_score = similarity.jaccard_similarity(source_text.lower().split(), target_text.lower().split())\n", " if jacard_score > 0.8:\n", " return True\n", " \n", " \n", "def calculate_metrics(tp, tn, fp, fn):\n", " \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n", " precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n", " recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n", " accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n", " f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n", " return precision, recall, accuracy, f1_score\n", "\n", "def print_metrics_table(data):\n", " # Print table headers\n", " print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"F1-Score\", \"Precision\", \"Recall\", \"Accuracy\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n", " total_precision, total_recall, total_accuracy, total_f1_score, total_support= [],[],[],[],[]\n", " \n", " total_tp = []\n", " total_tn = []\n", " total_fp = []\n", " #total_fn = []\n", " # Calculate and print metrics for each item\n", " metrics_list = []\n", " for keys in imp_datapoints:\n", " try:\n", " key = imp_datapoints_mapping[keys]\n", " values = data[key]\n", " if values[\"SUPPORT\"] == 0:\n", " continue\n", " tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n", " precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n", " metrics = {\"Datapoint\": key, \"F1-Score\": f1_score, \"Precision\": precision, \"Recall\": recall, \"Accuracy\": accuracy, \"SUPPORT\": values[\"SUPPORT\"], \"TP\": tp, \"TN\": tn, \"FP\": fp, \"FN\": fn}\n", " metrics_list.append(metrics)\n", " total_precision.append(precision)\n", " total_recall.append(recall)\n", " total_accuracy.append(accuracy)\n", " total_f1_score.append(f1_score)\n", " total_support.append(values[\"SUPPORT\"])\n", " total_tp.append(tp)\n", " total_tn.append(tn)\n", " total_fp.append(fp)\n", " total_fn.append(fn)\n", "\n", " if values[\"SUPPORT\"] > 0 and key > \"\":\n", " print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(key, f1_score, precision, recall, accuracy, values[\"SUPPORT\"], tp, tn, fp, fn))\n", " except:\n", " pass\n", " total_mean_precision = statistics.mean(total_precision)\n", " total_mean_recall = statistics.mean(total_recall)\n", " total_mean_accuracy = statistics.mean(total_accuracy)\n", " total_mean_f1_score = statistics.mean(total_f1_score)\n", " total_sum_support = sum(total_support)\n", " total_sum_tp = sum(total_tp)\n", " total_sum_tn = sum(total_tn)\n", " total_sum_fp = sum(total_fp)\n", " total_sum_fn = sum(total_fn)\n", " total_metrics = {\"Datapoint\": \"TOTAL\", \"F1-Score\": total_mean_f1_score, \"Precision\": total_mean_precision, \"Recall\": total_mean_recall, \"Accuracy\": total_mean_accuracy, \"SUPPORT\": total_sum_support, \"TP\": total_sum_tp, \"TN\": total_sum_tn, \"FP\": total_sum_fp, \"FN\": total_sum_fn}\n", " metrics_list.append(total_metrics)\n", " print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", total_mean_f1_score, total_mean_precision, total_mean_recall, total_mean_accuracy, total_sum_support, total_sum_tp, total_sum_tn, total_sum_fp, total_sum_fn))\n", " return metrics_list\n", " \n", "def create_metrics_df(data):\n", " # Define a list to hold data for DataFrame\n", " rows = []\n", " \n", " # Iterate through each metric item\n", " for key in imp_datapoints:\n", " try:\n", " mapped_key = imp_datapoints_mapping[key]\n", " values = data[mapped_key]\n", " tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n", " precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n", " \n", " # Only add rows where SUPPORT > 0\n", " if values[\"SUPPORT\"] > 0:\n", " row = {\n", " \"Metric\": key,\n", " \"Precision\": precision,\n", " \"Recall\": recall,\n", " \"Accuracy\": accuracy,\n", " \"F1-Score\": f1_score,\n", " \"SUPPORT\": values[\"SUPPORT\"]\n", " }\n", " rows.append(row)\n", " except KeyError as e:\n", " continue\n", "\n", " # Create a DataFrame from the list of rows\n", " df_metrics = pd.DataFrame(rows)\n", " df_metrics.reset_index(inplace=True)\n", " df_metrics.drop(columns=[\"index\"], inplace=True)\n", " print(df_metrics)\n", " return df_metrics\n", "\n", "\n", "\n", "def get_provider_mapping(file_path):\n", " df = pd.read_excel(file_path)\n", " df = (df.groupby([\"Docid\", \"ProviderName\"]).first())\n", " df.reset_index(inplace = True)\n", " return df[[\"Docid\", \"ProviderName\"]]\n", "\n", "\n", "def get_provider_names(generated_results_indexed, df_provider_mapping):\n", " providers_dict = {}\n", " for doc_id in generated_results_indexed:\n", " try:\n", " provider_name = (df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values)[0]\n", " if provider_name in providers_dict:\n", " providers_dict[provider_name].append(doc_id)\n", " else:\n", " providers_dict[provider_name] = []\n", " providers_dict[provider_name].append(doc_id)\n", "\n", " except:\n", " pass\n", " return providers_dict\n", "\n", "def get_specified_doc_data(results, doc_list):\n", " provider_res = {}\n", " for doc_id in doc_list:\n", " if doc_id in results:\n", " provider_res[doc_id] = results[doc_id]\n", " return provider_res\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n", "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "management_fee_and_costs \t0.9354 \t0.8870 \t0.9893 \t0.8786 \t419 \t369 \t0 \t47 \t4 \n", "management_fee \t0.9591 \t0.9303 \t0.9898 \t0.9214 \t419 \t387 \t0 \t29 \t4 \n", "performance_fee_costs \t0.9261 \t0.8955 \t0.9590 \t0.9024 \t285 \t257 \t122 \t30 \t11 \n", "interposed_vehicle_performance_fee_cost \t0.9863 \t0.9730 \t1.0000 \t0.9952 \t73 \t72 \t346 \t2 \t0 \n", "administration_fees \t0.9940 \t0.9881 \t1.0000 \t0.9976 \t83 \t83 \t336 \t1 \t0 \n", "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t70 \t70 \t350 \t0 \t0 \n", "buy_spread \t0.9486 \t0.9171 \t0.9822 \t0.9143 \t362 \t332 \t52 \t30 \t6 \n", "sell_spread \t0.9516 \t0.9227 \t0.9824 \t0.9190 \t362 \t334 \t52 \t28 \t6 \n", "minimum_initial_investment \t0.9544 \t0.9638 \t0.9452 \t0.9333 \t310 \t293 \t99 \t11 \t17 \n", "benchmark_name \t0.8971 \t0.8652 \t0.9313 \t0.9333 \t142 \t122 \t270 \t19 \t9 \n", "TOTAL \t0.9553 \t0.9343 \t0.9779 \t0.9395 \t2525 \t2319 \t1627 \t197 \t57 \n", "Total Funds Matched - 420\n", "Total Funds Not Matched - 145\n", "Percentage of Funds Matched - 74.33628318584071\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "management_fee_and_costs \t0.9408 \t0.8883 \t1.0000 \t0.8883 \t178 \t159 \t0 \t20 \t0 \n", "management_fee \t0.9742 \t0.9497 \t1.0000 \t0.9497 \t178 \t170 \t0 \t9 \t0 \n", "performance_fee_costs \t0.9082 \t0.8900 \t0.9271 \t0.8994 \t100 \t89 \t72 \t11 \t7 \n", "interposed_vehicle_performance_fee_cost \t0.9905 \t0.9811 \t1.0000 \t0.9944 \t53 \t52 \t126 \t1 \t0 \n", "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t15 \t15 \t164 \t0 \t0 \n", "buy_spread \t0.9799 \t0.9716 \t0.9884 \t0.9609 \t176 \t171 \t1 \t5 \t2 \n", "sell_spread \t0.9829 \t0.9773 \t0.9885 \t0.9665 \t176 \t172 \t1 \t4 \t2 \n", "minimum_initial_investment \t0.9151 \t0.9538 \t0.8794 \t0.8715 \t141 \t124 \t32 \t6 \t17 \n", "benchmark_name \t0.8957 \t0.8488 \t0.9481 \t0.9050 \t85 \t73 \t89 \t13 \t4 \n", "TOTAL \t0.9541 \t0.9401 \t0.9702 \t0.9373 \t1102 \t1025 \t485 \t69 \t89 \n", "Total Funds Matched - 179\n", "Total Funds Not Matched - 17\n", "Percentage of Funds Matched - 91.3265306122449\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "management_fee_and_costs \t0.9313 \t0.8861 \t0.9813 \t0.8714 \t241 \t210 \t0 \t27 \t4 \n", "management_fee \t0.9476 \t0.9156 \t0.9819 \t0.9004 \t241 \t217 \t0 \t20 \t4 \n", "performance_fee_costs \t0.9359 \t0.8984 \t0.9767 \t0.9046 \t185 \t168 \t50 \t19 \t4 \n", "interposed_vehicle_performance_fee_cost \t0.9756 \t0.9524 \t1.0000 \t0.9959 \t20 \t20 \t220 \t1 \t0 \n", "administration_fees \t0.9927 \t0.9855 \t1.0000 \t0.9959 \t68 \t68 \t172 \t1 \t0 \n", "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t70 \t70 \t171 \t0 \t0 \n", "buy_spread \t0.9174 \t0.8656 \t0.9758 \t0.8797 \t186 \t161 \t51 \t25 \t4 \n", "sell_spread \t0.9205 \t0.8710 \t0.9759 \t0.8838 \t186 \t162 \t51 \t24 \t4 \n", "minimum_initial_investment \t0.9854 \t0.9713 \t1.0000 \t0.9793 \t169 \t169 \t67 \t5 \t0 \n", "benchmark_name \t0.8991 \t0.8909 \t0.9074 \t0.9544 \t57 \t49 \t181 \t6 \t5 \n", "TOTAL \t0.9505 \t0.9237 \t0.9799 \t0.9365 \t1423 \t1294 \t963 \t128 \t114 \n", "Total Funds Matched - 241\n", "Total Funds Not Matched - 128\n", "Percentage of Funds Matched - 65.31165311653116\n" ] } ], "source": [ "\n", "\"\"\"\n", "Blade's updates\n", "1. Set the secondary key to be the share class name, instead of the fund name\n", "2. Remove the data point which support is 0 to calculate the metrics\n", "3. Add the message list to store the error message\n", "4. Support save metrics/ error message to excel file\n", "5. Support statistics for different document list\n", "6. Set F1-Score to the first column in the metrics table\n", "\"\"\"\n", "\n", "funds_matched = 0\n", "funds_not_matched = 0\n", "\n", "# Load the files\n", "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n", "headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n", "\n", "# Assuming doc_id is the first column and fund_name is the second column\n", "doc_id_index = 0\n", "fund_name_index = 1\n", "\n", "# Index the data\n", "ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n", "generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n", "\n", "intersection = set(headers_gen).intersection(headers_gt)\n", "\n", "# Convert the result back to a list (if you need it as a list)\n", "intersection_list = list(intersection)\n", "\n", "total_fn = []\n", "\n", "# df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n", "\n", "# all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n", "\n", "\n", "# for provider_name in all_provider_dict:\n", "# provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n", "# comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n", "# print(\"\\n\")\n", "# print(\"\\n\")\n", "# print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n", "# #create_metrics_df(comparison_results)\n", "# print_metrics_table(comparison_results)\n", "# print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n", "# print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n", "\n", "\n", "\n", "print(\"\\n\")\n", "print(\"\\n\")\n", "document_list_file_list = [None, \n", " \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", " \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n", "for document_list_file in document_list_file_list:\n", " document_list = None\n", " if document_list_file is not None:\n", " with open(document_list_file, \"r\", encoding=\"utf-8\") as f:\n", " document_list = f.readlines()\n", " document_list = [doc_id.strip() for doc_id in document_list]\n", " \n", " print(\"All Providers Results: \")\n", " print(\"Document List File - \", document_list_file)\n", " comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, \n", " generated_results_indexed, \n", " headers_gt, doc_id_index, \n", " fund_name_index, \n", " intersection_list,\n", " funds_matched, \n", " funds_not_matched,\n", " document_list)\n", " metrics_list = print_metrics_table(comparison_results)\n", " print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n", " print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n", "\n", " metrics_df = pd.DataFrame(metrics_list)\n", " message_df = pd.DataFrame(message_list)\n", "\n", " output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n", " os.makedirs(output_metrics_folder, exist_ok=True)\n", " if os.path.exists(output_metrics_folder):\n", " generated_file_base_name = os.path.basename(path_generated_results).replace(\".xlsx\", \"\")\n", " metrics_file_name = f\"metrics_{generated_file_base_name}\"\n", " if document_list_file is not None:\n", " metrics_file_name = f\"{metrics_file_name}_{len(document_list)}_documents.xlsx\"\n", " else:\n", " metrics_file_name = f\"{metrics_file_name}_all_documents.xlsx\"\n", " metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)\n", " with pd.ExcelWriter(metrics_file_path) as writer:\n", " metrics_df.to_excel(writer, sheet_name=\"metrics_data\", index=False)\n", " message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Fairview Eq Ptnr Emg Comp', 'truth': '0.56', 'generated': '0.54', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'truth': '0', 'generated': '0.56', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 521606716, 'sec_name': 'CFS Enhanced Index Balanced-Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 544886057, 'sec_name': 'CFS Growth Builder', 'truth': '0.01', 'generated': '0.04', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 553449663, 'sec_name': 'AMP Capital Specialist International Share (Hedged) Fund - Class A', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 521606716, 'sec_name': 'CFS Enhanced Index Balanced-Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 544886057, 'sec_name': 'CFS Growth Builder', 'truth': '0.01', 'generated': '0.04', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 553449663, 'sec_name': 'AMP Capital Specialist International Share (Hedged) Fund - Class A', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Fairview Eq Ptnr Emg Comp', 'truth': '0.56', 'generated': '0.54', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'truth': '0', 'generated': '0.56', 'error': 'Truth is not equal with generated'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n" ] } ], "source": [ "for message_list_element in message_list:\n", " if message_list_element[\"data_point\"] == \"performance_fee_costs\":\n", " print(message_list_element)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Excel file '/data/aus_prospectus/output/error_analysis/anomalies_found.xlsx' has been created successfully.\n" ] } ], "source": [ "import pandas as pd\n", "\n", "\n", "# Convert data to DataFrame\n", "df = pd.DataFrame(message_list)\n", "\n", "# Sort DataFrame by 'doc_id'\n", "df_sorted = df.sort_values(by=['doc_id'])\n", "\n", "# Save DataFrame to Excel file\n", "os.makedirs(\"/data/aus_prospectus/output/error_analysis/\", exist_ok=True)\n", "output_filename = r\"/data/aus_prospectus/output/error_analysis/anomalies_found.xlsx\"\n", "df_sorted.to_excel(output_filename, index=False)\n", "\n", "print(f\"Excel file '{output_filename}' has been created successfully.\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "blade", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }