2025-03-13 12:01:54 +00:00
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-03-13 22:52:06 +00:00
|
|
|
"execution_count": 18,
|
2025-03-13 12:01:54 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
# Display label -> column name used for that datapoint in the workbooks.
# The insertion order below is also the order of `imp_datapoints`.
imp_datapoints_mapping = dict([
    ("Management Fee and Costs", "management_fee_and_costs"),
    ("Management Fee", "management_fee"),
    ("Performance fee and cost", "performance_fee_costs"),
    ("Interposed vehicle Performance fee and Costs", "interposed_vehicle_performance_fee_cost"),
    ("Administration Fee and costs", "administration_fees"),
    ("Total Annual Dollar Based Charges", "total_annual_dollar_based_charges"),
    ("Buy Spread", "buy_spread"),
    ("Sell Spread", "sell_spread"),
    ("Performance Fee", "PerformanceFeeCharged"),
    ("Minimum Initial Investment", "minimum_initial_investment"),
    ("Benchmark", "benchmark_name"),
])

# Ordered display labels of the datapoints that are evaluated.
imp_datapoints = list(imp_datapoints_mapping)
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2025-03-14 06:04:51 +00:00
|
|
|
"execution_count": 33,
|
2025-03-13 12:01:54 +00:00
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"All Providers Results: \n",
|
2025-03-13 16:53:27 +00:00
|
|
|
"Document List File - None\n",
|
2025-03-13 22:52:06 +00:00
|
|
|
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
2025-03-14 06:04:51 +00:00
|
|
|
"management_fee_and_costs \t0.9395 \t0.9002 \t0.9823 \t0.8858 \t437 \t388 \t0 \t43 \t7 \n",
|
|
|
|
|
"management_fee \t0.9496 \t0.9188 \t0.9826 \t0.9041 \t437 \t396 \t0 \t35 \t7 \n",
|
|
|
|
|
"performance_fee_costs \t0.8597 \t0.8755 \t0.8445 \t0.8219 \t298 \t239 \t121 \t34 \t44 \n",
|
|
|
|
|
"interposed_vehicle_performance_fee_cost \t0.9362 \t0.9429 \t0.9296 \t0.9795 \t72 \t66 \t363 \t4 \t5 \n",
|
|
|
|
|
"administration_fees \t0.7862 \t0.9828 \t0.6552 \t0.9292 \t87 \t57 \t350 \t1 \t30 \n",
|
|
|
|
|
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9772 \t82 \t72 \t356 \t10 \t0 \n",
|
|
|
|
|
"buy_spread \t0.9374 \t0.9208 \t0.9547 \t0.8973 \t375 \t337 \t56 \t29 \t16 \n",
|
|
|
|
|
"sell_spread \t0.9418 \t0.9290 \t0.9551 \t0.9041 \t375 \t340 \t56 \t26 \t16 \n",
|
|
|
|
|
"minimum_initial_investment \t0.9518 \t0.9457 \t0.9579 \t0.9315 \t315 \t296 \t112 \t17 \t13 \n",
|
|
|
|
|
"benchmark_name \t0.8553 \t0.8418 \t0.8693 \t0.8973 \t166 \t133 \t260 \t25 \t20 \n",
|
|
|
|
|
"TOTAL \t0.9093 \t0.9135 \t0.9131 \t0.9128 \t2644 \t2324 \t1674 \t224 \t158 \n",
|
|
|
|
|
"Total Funds Matched - 438\n",
|
|
|
|
|
"Total Funds Not Matched - 127\n",
|
|
|
|
|
"Percentage of Funds Matched - 77.5221238938053\n",
|
2025-03-13 16:53:27 +00:00
|
|
|
"All Providers Results: \n",
|
|
|
|
|
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
|
2025-03-13 22:52:06 +00:00
|
|
|
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
2025-03-14 06:04:51 +00:00
|
|
|
"management_fee_and_costs \t0.9419 \t0.9059 \t0.9809 \t0.8902 \t172 \t154 \t0 \t16 \t3 \n",
|
|
|
|
|
"management_fee \t0.9547 \t0.9294 \t0.9814 \t0.9133 \t172 \t158 \t0 \t12 \t3 \n",
|
|
|
|
|
"performance_fee_costs \t0.8315 \t0.9024 \t0.7708 \t0.8266 \t97 \t74 \t69 \t8 \t22 \n",
|
|
|
|
|
"interposed_vehicle_performance_fee_cost \t0.9630 \t0.9286 \t1.0000 \t0.9769 \t53 \t52 \t117 \t4 \t0 \n",
|
|
|
|
|
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t165 \t0 \t0 \n",
|
|
|
|
|
"buy_spread \t0.9699 \t0.9699 \t0.9699 \t0.9422 \t169 \t161 \t2 \t5 \t5 \n",
|
|
|
|
|
"sell_spread \t0.9760 \t0.9819 \t0.9702 \t0.9538 \t169 \t163 \t2 \t3 \t5 \n",
|
|
|
|
|
"minimum_initial_investment \t0.9027 \t0.9062 \t0.8992 \t0.8555 \t135 \t116 \t32 \t12 \t13 \n",
|
|
|
|
|
"benchmark_name \t0.8333 \t0.8025 \t0.8667 \t0.8497 \t85 \t65 \t82 \t16 \t10 \n",
|
|
|
|
|
"TOTAL \t0.9303 \t0.9252 \t0.9377 \t0.9120 \t1060 \t951 \t469 \t76 \t219 \n",
|
|
|
|
|
"Total Funds Matched - 173\n",
|
|
|
|
|
"Total Funds Not Matched - 23\n",
|
|
|
|
|
"Percentage of Funds Matched - 88.26530612244898\n",
|
2025-03-13 16:53:27 +00:00
|
|
|
"All Providers Results: \n",
|
|
|
|
|
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
|
2025-03-13 22:52:06 +00:00
|
|
|
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
2025-03-14 06:04:51 +00:00
|
|
|
"management_fee_and_costs \t0.9379 \t0.8966 \t0.9832 \t0.8830 \t265 \t234 \t0 \t27 \t4 \n",
|
|
|
|
|
"management_fee \t0.9463 \t0.9119 \t0.9835 \t0.8981 \t265 \t238 \t0 \t23 \t4 \n",
|
|
|
|
|
"performance_fee_costs \t0.8730 \t0.8639 \t0.8824 \t0.8189 \t201 \t165 \t52 \t26 \t22 \n",
|
|
|
|
|
"interposed_vehicle_performance_fee_cost \t0.8485 \t1.0000 \t0.7368 \t0.9811 \t19 \t14 \t246 \t0 \t5 \n",
|
|
|
|
|
"administration_fees \t0.7597 \t0.9800 \t0.6203 \t0.8830 \t79 \t49 \t185 \t1 \t30 \n",
|
|
|
|
|
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9623 \t82 \t72 \t183 \t10 \t0 \n",
|
|
|
|
|
"buy_spread \t0.9096 \t0.8800 \t0.9412 \t0.8679 \t206 \t176 \t54 \t24 \t11 \n",
|
|
|
|
|
"sell_spread \t0.9124 \t0.8850 \t0.9415 \t0.8717 \t206 \t177 \t54 \t23 \t11 \n",
|
|
|
|
|
"minimum_initial_investment \t0.9863 \t0.9730 \t1.0000 \t0.9811 \t180 \t180 \t80 \t5 \t0 \n",
|
|
|
|
|
"benchmark_name \t0.8774 \t0.8831 \t0.8718 \t0.9283 \t81 \t68 \t178 \t9 \t10 \n",
|
|
|
|
|
"TOTAL \t0.8986 \t0.9151 \t0.8961 \t0.9075 \t1584 \t1373 \t1032 \t148 \t316 \n",
|
|
|
|
|
"Total Funds Matched - 265\n",
|
|
|
|
|
"Total Funds Not Matched - 104\n",
|
|
|
|
|
"Percentage of Funds Matched - 71.81571815718158\n"
|
2025-03-13 12:01:54 +00:00
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"import openpyxl\n",
|
|
|
|
|
"from collections import defaultdict\n",
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"import statistics\n",
|
2025-03-13 16:53:27 +00:00
|
|
|
"import os\n",
|
|
|
|
|
"import re\n",
|
|
|
|
|
"from utils.similarity import Similarity\n",
|
2025-03-13 12:01:54 +00:00
|
|
|
"\n",
|
2025-03-13 22:52:06 +00:00
|
|
|
"\"\"\"\n",
|
|
|
|
|
"Blade's updates\n",
|
|
|
|
|
"1. Set the secondary key to be the share class name, instead of the fund name\n",
|
|
|
|
|
"2. Remove the data point which support is 0 to calculate the metrics\n",
|
|
|
|
|
"3. Add the message list to store the error message\n",
|
|
|
|
|
"4. Support save metrics/ error message to excel file\n",
|
|
|
|
|
"5. Support statistics for different document list\n",
|
|
|
|
|
"6. Set F1-Score to the first column in the metrics table\n",
|
|
|
|
|
"\"\"\"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
# Input workbooks: ground truth, generated results and the provider lookup.
path_ground_truth = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
path_generated_results = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313224747.xlsx"
provider_mapping_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx"

# Module-level fund counters, both starting at zero.
funds_matched = funds_not_matched = 0
|
|
|
|
|
def load_excel(filepath, header_row_index):
    """Read the active sheet of a workbook into a header list and data rows.

    Rows above ``header_row_index`` are ignored, the row at that index becomes
    the header and every later row becomes a data row. Cells that are ``None``
    are replaced by ``""``.

    Returns:
        ``(headers, data)`` where ``headers`` is a list of cell values and
        ``data`` is a list of row lists. ``headers`` stays ``[]`` when the
        sheet has no row at ``header_row_index``.
    """
    def _blank_none(cells):
        return ["" if cell is None else cell for cell in cells]

    # data_only=True makes openpyxl return cached formula results, not formulas.
    worksheet = openpyxl.load_workbook(filepath, data_only=True).active
    headers, data = [], []
    for position, cells in enumerate(worksheet.iter_rows(values_only=True)):
        if position < header_row_index:
            continue
        if position == header_row_index:
            headers = _blank_none(cells)
        else:
            data.append(_blank_none(cells))
    return headers, data
|
|
|
|
|
"\n",
|
|
|
|
|
def index_data_by_key(data, key_index, secondary_key_index, header):
    """Index rows as ``{doc_id: {sec_name: {column: value}}}``.

    The key columns are located by header name (``"doc_id"`` and
    ``"sec_name"``), not by position. ``key_index`` and
    ``secondary_key_index`` are kept for backward compatibility and are not
    used.

    Args:
        data: iterable of row sequences, aligned with ``header``.
        key_index: unused.
        secondary_key_index: unused.
        header: sequence of column names.

    Returns:
        ``defaultdict(dict)`` keyed by ``int(doc_id)``, then by the share
        class name (``str(sec_name)``). Remaining columns are stored after
        ``convert_if_number``. A later row with the same key pair replaces the
        earlier one.

    Raises:
        ValueError: if there are rows but ``header`` lacks a key column, or a
            non-blank ``doc_id`` cannot be converted to ``int``.
    """
    indexed_data = defaultdict(dict)

    missing = [name for name in ("doc_id", "sec_name") if name not in header]
    if data and missing:
        raise ValueError("header is missing key column(s): " + ", ".join(missing))

    for row in data:
        # Keys are reset for every row so a value can never leak from the
        # previous row.
        primary_key = None
        secondary_key = None
        row_data = {}
        for column, cell in zip(header, row):
            if column == "doc_id":
                primary_key = cell
            elif column == "sec_name":
                # share class should be the comparison level and key
                secondary_key = str(cell)
            else:
                row_data[column] = convert_if_number(cell)

        # Blank key cells (e.g. empty trailing spreadsheet rows, which arrive
        # as "") identify nothing: skip the row instead of failing on int("").
        if primary_key is None or str(primary_key).strip() == "":
            continue
        if not secondary_key:
            continue
        indexed_data[int(primary_key)][secondary_key] = row_data
    return indexed_data


def convert_if_number(value):
    """Return ``value`` as a number when it parses as one, else unchanged.

    Numeric input is rounded to two decimals; a result without a fractional
    part is returned as ``int``, otherwise as ``float``. Anything that cannot
    be converted (non-numeric text, ``None``, NaN, infinity) is returned as
    given.
    """
    try:
        float_value = round(float(value), 2)
        int_value = int(float_value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: unsupported type;
        # OverflowError: int() of +/-infinity (e.g. the text "Infinity").
        return value
    return int_value if int_value == float_value else float_value
|
|
|
|
|
"\n",
|
|
|
|
|
def compare_values(value1, value2):
    """Return True when both values are equal after ``convert_if_number``.

    Each argument is passed through ``convert_if_number`` first, so values
    that parse as numbers are compared numerically and everything else is
    compared as given.
    """
    left, right = (convert_if_number(item) for item in (value1, value2))
    return left == right
|
2025-03-13 16:53:27 +00:00
|
|
|
def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):
    """Score generated values against ground truth, datapoint by datapoint.

    Only documents and share classes present in ``ground_truth`` are visited.
    For every column of ``intersection_list`` that is a mapped value of
    ``imp_datapoints_mapping``, the stripped string forms of both values are
    classified as:

    * truth blank, generated blank      -> TN
    * truth blank, generated not blank  -> FP (message recorded)
    * truth set, generated equal        -> TP
    * truth set, generated blank        -> FN (message recorded)
    * truth set, generated different    -> FP (message recorded), except that
      ``benchmark_name`` counts as TP when ``compare_text`` accepts the pair

    SUPPORT counts the comparisons whose truth value is not blank.

    Args:
        ground_truth / generated_results: ``{doc_id: {sec_name: {column: value}}}``.
        headers: column names; every name except ``"doc_id"`` gets a counter dict.
        doc_id_index, fund_name_index: not used by this function.
        intersection_list: columns to consider.
        funds_matched, funds_not_matched: ignored, counting restarts at zero.
        document_list: optional collection of doc ids as strings; when given,
            documents whose ``str(doc_id)`` is not in it are skipped.

    Returns:
        ``(results, message_list, funds_matched, funds_not_matched)``.
    """
    results = {
        column: {"TP": 0, "TN": 0, "FP": 0, "FN": 0, "SUPPORT": 0}
        for column in headers
        if column != "doc_id"
    }
    # The incoming counter arguments are overwritten on purpose.
    funds_matched = funds_not_matched = 0
    message_list = []

    def _report(column, doc_id, sec_name, truth, generated, error):
        message_list.append({"data_point": column, "doc_id": doc_id, "sec_name": sec_name,
                             "truth": truth, "generated": generated, "error": error})

    for doc_id, secs in ground_truth.items():
        if document_list is not None and str(doc_id) not in document_list:
            continue
        if doc_id not in generated_results:
            # If the entire document is not found, count all funds as not matched
            funds_not_matched += len(secs)
            continue

        generated_doc = generated_results[doc_id]
        for sec_name, truth_values in secs.items():
            if sec_name not in generated_doc:
                funds_not_matched += 1
                continue

            generated_values = generated_doc[sec_name]
            for column in intersection_list:
                for label in imp_datapoints:
                    if column != imp_datapoints_mapping[label]:
                        continue
                    truth = str(truth_values[column]).strip()
                    generated = str(generated_values[column]).strip()
                    counts = results[column]

                    if truth == "":
                        if generated == "":
                            counts["TN"] += 1
                        else:
                            counts["FP"] += 1
                            _report(column, doc_id, sec_name, truth, generated,
                                    "Truth is null and generated is not null")
                        continue

                    if generated == "":
                        counts["FN"] += 1
                        _report(column, doc_id, sec_name, truth, generated,
                                "Generated is null and truth is not null")
                    elif generated == truth or (column == "benchmark_name" and compare_text(truth, generated)):
                        counts["TP"] += 1
                    else:
                        counts["FP"] += 1
                        _report(column, doc_id, sec_name, truth, generated,
                                "Truth is not equal with generated")
                    counts["SUPPORT"] += 1
            funds_matched += 1

    return results, message_list, funds_matched, funds_not_matched
|
|
|
|
|
"\n",
|
|
|
|
|
def clean_text(text: str):
    """Replace every non-word character with a space and collapse whitespace.

    ``None`` and ``""`` are returned unchanged. The result is not stripped, so
    it may start or end with a single space.
    """
    if text is None or len(text) == 0:
        return text
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text


def compare_text(source_text, target_text):
    """Return True when two texts are considered the same after cleaning.

    Both texts go through ``clean_text``. They match when the cleaned texts
    are equal, when one contains the other (case-sensitive), or when the
    Jaccard similarity of their lower-cased word lists, as computed by
    ``Similarity.jaccard_similarity``, is above 0.8.

    A text with nothing left after cleaning (``None``, empty, or punctuation
    only) matches only an identical cleaned text. Without that guard a
    punctuation-only value such as ``"-"`` cleans to ``" "``, which is a
    substring of almost any multi-word text.

    Returns:
        ``True`` or ``False`` (never ``None``).
    """
    source_text = clean_text(source_text)
    target_text = clean_text(target_text)
    if source_text == target_text:
        return True
    if not source_text or not target_text:
        return False
    if not source_text.strip() or not target_text.strip():
        return False
    if source_text in target_text or target_text in source_text:
        return True
    similarity = Similarity()
    jacard_score = similarity.jaccard_similarity(source_text.lower().split(), target_text.lower().split())
    return jacard_score > 0.8
|
2025-03-13 12:01:54 +00:00
|
|
|
"\n",
|
|
|
|
|
# Read both workbooks; row 0 of each sheet holds the column names.
headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)
headers_gen, generated_results_data = load_excel(path_generated_results, 0)

# Assumed positions of doc_id (first column) and fund_name (second column).
doc_id_index, fund_name_index = 0, 1

# Nested lookups: doc_id -> share class -> column values.
ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)
generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)

# Columns that exist in both workbooks, kept both as a set and as a list.
intersection = set(headers_gen).intersection(headers_gt)
intersection_list = list(intersection)

total_fn = []
|
|
|
|
|
def calculate_metrics(tp, tn, fp, fn):
    """Derive precision, recall, accuracy and F1 from confusion counts.

    Any ratio whose denominator is zero is reported as ``0``.

    Returns:
        ``(precision, recall, accuracy, f1_score)``.
    """
    def _ratio(numerator, denominator):
        return numerator / denominator if denominator != 0 else 0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)
    return precision, recall, accuracy, f1_score
|
|
|
|
|
"\n",
|
|
|
|
|
def print_metrics_table(data):
    """Print a metrics table for the evaluated datapoints and return its rows.

    One row is produced for each label of ``imp_datapoints`` whose mapped
    column exists in ``data`` and has a non-zero ``SUPPORT``. A final
    ``TOTAL`` row holds the unweighted mean of the per-datapoint F1-score,
    precision, recall and accuracy, and the sums of SUPPORT, TP, TN, FP and
    FN.

    Every total is computed from the rows of this call only, so calling the
    function repeatedly (once per document list) gives independent tables.

    Args:
        data: ``{column: {"TP", "TN", "FP", "FN", "SUPPORT"}}`` as returned by
            ``compare_data``.

    Returns:
        List of row dicts (``Datapoint``, ``F1-Score``, ``Precision``,
        ``Recall``, ``Accuracy``, ``SUPPORT``, ``TP``, ``TN``, ``FP``,
        ``FN``), ending with the ``TOTAL`` row. When no datapoint qualifies
        the ``TOTAL`` row contains zeros.
    """
    row_format = "{:<50}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.0f}\t{:<10.0f}\t{:<10.0f}\t{:<10.0f}\t{:<10.0f}"

    # Print table headers
    print("{:<50}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}".format("Metric", "F1-Score", "Precision", "Recall", "Accuracy", "SUPPORT", "TP", "TN", "FP", "FN"))

    metrics_list = []
    for label in imp_datapoints:
        try:
            key = imp_datapoints_mapping[label]
            values = data[key]
        except KeyError:
            # The datapoint has no mapping or is not a column of the
            # compared workbooks: nothing to report.
            continue
        if values["SUPPORT"] == 0:
            continue

        tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']
        precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)
        metrics_list.append({"Datapoint": key, "F1-Score": f1_score, "Precision": precision, "Recall": recall, "Accuracy": accuracy, "SUPPORT": values["SUPPORT"], "TP": tp, "TN": tn, "FP": fp, "FN": fn})
        print(row_format.format(key, f1_score, precision, recall, accuracy, values["SUPPORT"], tp, tn, fp, fn))

    def _mean(column):
        # statistics.mean raises StatisticsError on empty data.
        return statistics.mean([row[column] for row in metrics_list]) if metrics_list else 0

    def _sum(column):
        return sum(row[column] for row in metrics_list)

    total_metrics = {"Datapoint": "TOTAL", "F1-Score": _mean("F1-Score"), "Precision": _mean("Precision"), "Recall": _mean("Recall"), "Accuracy": _mean("Accuracy"), "SUPPORT": _sum("SUPPORT"), "TP": _sum("TP"), "TN": _sum("TN"), "FP": _sum("FP"), "FN": _sum("FN")}
    metrics_list.append(total_metrics)
    print(row_format.format("TOTAL", total_metrics["F1-Score"], total_metrics["Precision"], total_metrics["Recall"], total_metrics["Accuracy"], total_metrics["SUPPORT"], total_metrics["TP"], total_metrics["TN"], total_metrics["FP"], total_metrics["FN"]))
    return metrics_list
|
2025-03-13 12:01:54 +00:00
|
|
|
" \n",
|
|
|
|
|
def create_metrics_df(data):
    """Build, print and return a DataFrame of per-datapoint metrics.

    One row is added for each label of ``imp_datapoints`` whose mapped column
    is present in ``data`` and has ``SUPPORT`` above zero. Labels that raise
    ``KeyError`` during lookup are skipped. The ``Metric`` column holds the
    display label, not the mapped column name.
    """
    rows = []
    for key in imp_datapoints:
        try:
            values = data[imp_datapoints_mapping[key]]
            counts = [values[name] for name in ("TP", "TN", "FP", "FN")]
            precision, recall, accuracy, f1_score = calculate_metrics(*counts)
            # Only add rows where SUPPORT > 0
            if values["SUPPORT"] > 0:
                rows.append({
                    "Metric": key,
                    "Precision": precision,
                    "Recall": recall,
                    "Accuracy": accuracy,
                    "F1-Score": f1_score,
                    "SUPPORT": values["SUPPORT"],
                })
        except KeyError:
            continue

    # Fresh 0..n-1 index without keeping the old one as a column.
    df_metrics = pd.DataFrame(rows).reset_index(drop=True)
    print(df_metrics)
    return df_metrics
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
def get_provider_mapping(file_path):
    """Load the provider workbook and return its distinct Docid/ProviderName pairs.

    The sheet is grouped by ``Docid`` and ``ProviderName`` and only those two
    columns are returned, one row per group.
    """
    key_columns = ["Docid", "ProviderName"]
    grouped = pd.read_excel(file_path).groupby(key_columns).first()
    return grouped.reset_index()[key_columns]
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
def get_provider_names(generated_results_indexed, df_provider_mapping):
    """Group document ids by provider name.

    Args:
        generated_results_indexed: iterable of doc ids (iterating a dict
            yields its keys).
        df_provider_mapping: DataFrame with ``Docid`` and ``ProviderName``
            columns. When a ``Docid`` occurs more than once, the first row
            wins.

    Returns:
        ``{provider_name: [doc_id, ...]}`` in iteration order. Documents
        without an entry in the mapping are left out.

    Raises:
        KeyError: if a required column is missing from the mapping.
    """
    # Build the lookup once instead of filtering the whole frame per document.
    provider_by_doc = {}
    doc_ids = df_provider_mapping["Docid"].tolist()
    provider_names = df_provider_mapping["ProviderName"].tolist()
    for mapped_doc_id, provider_name in zip(doc_ids, provider_names):
        provider_by_doc.setdefault(mapped_doc_id, provider_name)

    providers_dict = {}
    for doc_id in generated_results_indexed:
        if doc_id not in provider_by_doc:
            # No provider known for this document: skip it.
            continue
        providers_dict.setdefault(provider_by_doc[doc_id], []).append(doc_id)
    return providers_dict
|
|
|
|
|
"\n",
|
|
|
|
|
def get_specified_doc_data(results, doc_list):
    """Return the entries of ``results`` whose key appears in ``doc_list``.

    Keys follow the order of their first appearance in ``doc_list``; ids that
    are not in ``results`` are ignored.
    """
    return {doc_id: results[doc_id] for doc_id in doc_list if doc_id in results}
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
# Docid/ProviderName pairs read from the provider workbook.
df_provider_mapping = get_provider_mapping(provider_mapping_file_path)

# Provider name -> doc ids of the generated results.
all_provider_dict = get_provider_names(
    generated_results_indexed,
    df_provider_mapping,
)
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# for provider_name in all_provider_dict:\n",
|
|
|
|
|
"# provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
|
|
|
|
|
"# comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
|
|
|
|
|
"# print(\"\\n\")\n",
|
|
|
|
|
"# print(\"\\n\")\n",
|
|
|
|
|
"# print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
|
|
|
|
|
"# #create_metrics_df(comparison_results)\n",
|
|
|
|
|
"# print_metrics_table(comparison_results)\n",
|
|
|
|
|
"# print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
|
|
|
|
|
"# print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
print("\n")
print("\n")

# None means "evaluate every document"; each other entry is a text file with
# one doc id per line.
document_list_file_list = [None,
                           "./sample_documents/aus_prospectus_29_documents_sample.txt",
                           "./sample_documents/aus_prospectus_17_documents_sample.txt"]

# The output location and file-name stem do not depend on the document list.
output_metrics_folder = r"/data/aus_prospectus/output/metrics_data/"
os.makedirs(output_metrics_folder, exist_ok=True)
generated_file_base_name = os.path.basename(path_generated_results).replace(".xlsx", "")

for document_list_file in document_list_file_list:
    document_list = None
    if document_list_file is not None:
        with open(document_list_file, "r", encoding="utf-8") as f:
            # Blank lines are not doc ids; dropping them also keeps the
            # document count in the output file name accurate.
            document_list = [line.strip() for line in f if line.strip()]

    print("All Providers Results: ")
    print("Document List File - ", document_list_file)
    comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed,
                                                                                      generated_results_indexed,
                                                                                      headers_gt, doc_id_index,
                                                                                      fund_name_index,
                                                                                      intersection_list,
                                                                                      funds_matched,
                                                                                      funds_not_matched,
                                                                                      document_list)
    metrics_list = print_metrics_table(comparison_results)
    print("Total Funds Matched - " + str(funds_matched) + "\nTotal Funds Not Matched - " + str(funds_not_matched))
    # No fund at all (e.g. a list that selects no known document) must not
    # abort the run with a division by zero.
    total_funds = funds_matched + funds_not_matched
    matched_percentage = (funds_matched / total_funds) * 100 if total_funds else 0.0
    print("Percentage of Funds Matched - " + str(matched_percentage))

    metrics_df = pd.DataFrame(metrics_list)
    message_df = pd.DataFrame(message_list)

    metrics_file_name = f"metrics_{generated_file_base_name}"
    if document_list_file is not None:
        metrics_file_name = f"{metrics_file_name}_{len(document_list)}_documents.xlsx"
    else:
        metrics_file_name = f"{metrics_file_name}_all_documents.xlsx"
    metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)

    # One workbook per document list: metrics on one sheet, mismatches on the other.
    with pd.ExcelWriter(metrics_file_path) as writer:
        metrics_df.to_excel(writer, sheet_name="metrics_data", index=False)
        message_df.to_excel(writer, sheet_name="message_data", index=False)
|
2025-03-13 12:01:54 +00:00
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "blade",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
2025-03-13 16:53:27 +00:00
|
|
|
"version": "3.12.6"
|
2025-03-13 12:01:54 +00:00
|
|
|
},
|
|
|
|
|
"orig_nbformat": 4
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
}
|