Added code to identify anomaly cases and compute the performance matrix, and updated the PDF downloading code

2025-03-13 17:31:54 +05:30 · 2025-03-13 17:31:54 +05:30 · 97da7e4961
parent 336fd9a24f
commit 97da7e4961
2 changed files with 850 additions and 3 deletions
--- a/performance.ipynb
+++ b/performance.ipynb
@ -0,0 +1,842 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n",
+    "                  \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n",
+    "                  \"Minimum Initial Investment\", \"Benchmark\"]\n",
+    "\n",
+    "\n",
+    "imp_datapoints_mapping = {\n",
+    "    \"Management Fee and Costs\": \"management_fee_and_costs\",\n",
+    "    \"Management Fee\": \"management_fee\",\n",
+    "    \"Performance fee and cost\": \"performance_fee_costs\",\n",
+    "    \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n",
+    "    \"Administration Fee and costs\": \"administration_fees\",\n",
+    "    \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n",
+    "    \"Buy Spread\": \"buy_spread\",\n",
+    "    \"Sell Spread\": \"sell_spread\",\n",
+    "    \"Performance Fee\": \"PerformanceFeeCharged\",\n",
+    "    \"Minimum Initial Investment\": \"minimum_initial_investment\",\n",
+    "    \"Benchmark\": \"benchmark_name\"\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "path_ground_truth = r\"C:\\data\\aus_prospectus\\output\\Performance\\46_documents_ground_truth_with_mapping.xlsx\"\n",
+    "path_generated_results = r\"C:\\data\\aus_prospectus\\output\\Performance\\mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n",
+    "provider_mapping_file_path = r\"C:\\Users\\rmahesh\\OneDrive - MORNINGSTAR INC\\Desktop\\NLP Transitions\\Project\\Exprs\\INO71\\dc-ml-dataextraction-llm-aus-nz-pro-AUS_NZ_EXE_COMBINED_PHASE1_PHASE2\\output_files\\ground_truth\\TopProvidersBiz.xlsx\"\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "All Providers Results: \n",
+      "Metric                                            \tPrecision \tRecall    \tAccuracy  \tF1-Score  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
+      "Management Fee and Costs                          \t0.8790    \t0.9250    \t0.8213    \t0.9014    \t494       \t407       \t2         \t56        \t33        \n",
+      "Management Fee                                    \t0.8985    \t0.9265    \t0.8394    \t0.9123    \t494       \t416       \t2         \t47        \t33        \n",
+      "Performance fee and cost                          \t0.7871    \t0.8472    \t0.7791    \t0.8161    \t327       \t244       \t144       \t66        \t44        \n",
+      "Interposed vehicle Performance fee and Costs      \t0.5000    \t1.0000    \t0.9237    \t0.6667    \t39        \t38        \t422       \t38        \t0         \n",
+      "Administration Fee and costs                      \t0.9787    \t0.9388    \t0.9839    \t0.9583    \t98        \t92        \t398       \t2         \t6         \n",
+      "Total Annual Dollar Based Charges                 \t0.8165    \t1.0000    \t0.9598    \t0.8990    \t90        \t89        \t389       \t20        \t0         \n",
+      "Buy Spread                                        \t0.8957    \t0.8910    \t0.8394    \t0.8933    \t405       \t335       \t83        \t39        \t41        \n",
+      "Sell Spread                                       \t0.9064    \t0.8921    \t0.8474    \t0.8992    \t405       \t339       \t83        \t35        \t41        \n",
+      "Minimum Initial Investment                        \t0.8571    \t0.9671    \t0.8815    \t0.9088    \t310       \t294       \t145       \t49        \t10        \n",
+      "Benchmark                                         \t0.6402    \t0.8582    \t0.8233    \t0.7333    \t173       \t121       \t289       \t68        \t20        \n",
+      "TOTAL                                             \t0.8159    \t0.9246    \t0.8699    \t0.8588    \t2835      \t2375      \t1957      \t420       \t228       \n",
+      "Total Funds Matched - 498\n",
+      "Total Funds Not Matched - 28\n",
+      "Percentage of Funds Matched - 94.67680608365019\n"
+     ]
+    }
+   ],
+   "source": [
+    "import openpyxl\n",
+    "from collections import defaultdict\n",
+    "import pandas as pd\n",
+    "import statistics\n",
+    "\n",
+    "\n",
+    "funds_matched = 0\n",
+    "funds_not_matched = 0\n",
+    "def load_excel(filepath, header_row_index):\n",
+    "    \"\"\"Load an Excel file and use the specified row as the header.\"\"\"\n",
+    "    wb = openpyxl.load_workbook(filepath, data_only=True)\n",
+    "    sheet = wb.active\n",
+    "    headers = []\n",
+    "    data = []\n",
+    "\n",
+    "    for index, row in enumerate(sheet.iter_rows(values_only=True)):\n",
+    "        if index == header_row_index:\n",
+    "            headers = [cell if cell is not None else \"\" for cell in row]\n",
+    "        elif index > header_row_index:\n",
+    "            data.append([cell if cell is not None else \"\" for cell in row])\n",
+    "\n",
+    "    return headers, data\n",
+    "\n",
+    "def index_data_by_key(data, key_index, secondary_key_index, header):\n",
+    "    \"\"\"Index rows as {doc_id: {fund_name: row_data}}, locating both key columns by their header names.\"\"\"\n",
+    "    indexed_data = defaultdict(dict)\n",
+    "    \n",
+    "    for row in data:\n",
+    "        row_data = {}\n",
+    "        # Store every column except doc_id and fund_name (those become the index keys), normalizing values via convert_if_number\n",
+    "        for i in range(len(row)):\n",
+    "            if header[i] == \"doc_id\":\n",
+    "                primary_key = int(row[i])\n",
+    "            elif header[i] == \"fund_name\":\n",
+    "                secondary_key = str(row[i])\n",
+    "            else:\n",
+    "                row_data[header[i]] = convert_if_number(row[i])\n",
+    "        indexed_data[primary_key][secondary_key] = row_data\n",
+    "    return indexed_data\n",
+    "\n",
+    "def convert_if_number(value):\n",
+    "    \"\"\"Round a numeric value to 2 decimals and return it as int when whole, else float; return non-numeric values unchanged.\"\"\"\n",
+    "    try:\n",
+    "        float_value = round(float(value), 2)\n",
+    "        int_value = int(float_value)\n",
+    "        return int_value if int_value == float_value else float_value\n",
+    "    except (ValueError, TypeError):\n",
+    "        return value\n",
+    "\n",
+    "def compare_values(value1, value2):\n",
+    "    \"\"\"Normalize both values with convert_if_number and compare them for equality.\"\"\"\n",
+    "    value1 = convert_if_number(value1)\n",
+    "    value2 = convert_if_number(value2)\n",
+    "    return value1 == value2\n",
+    "\n",
+    "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n",
+    "    \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n",
+    "    results = {}\n",
+    "    funds_matched, funds_not_matched = 0, 0\n",
+    "    # Initialize result dictionaries for each column except 'doc_id'\n",
+    "    for keys in headers:\n",
+    "        if keys != \"doc_id\":\n",
+    "            results[keys] = {}\n",
+    "            results[keys][\"TP\"] = 0\n",
+    "            results[keys][\"TN\"] = 0\n",
+    "            results[keys][\"FP\"] = 0\n",
+    "            results[keys][\"FN\"] = 0\n",
+    "            results[keys][\"SUPPORT\"] = 0\n",
+    "\n",
+    "    # Iterate over the ground truth, looking up each document and fund in the generated results\n",
+    "    for doc_id, funds in ground_truth.items():\n",
+    "        if doc_id in generated_results:\n",
+    "            for fund_name, truth_values in funds.items():\n",
+    "                if fund_name in generated_results[doc_id]:\n",
+    "                    generated_values = generated_results[doc_id][fund_name]\n",
+    "                    # Compare only the shared columns that map to an entry in imp_datapoints\n",
+    "                    for i in intersection_list:\n",
+    "                        for keys in imp_datapoints:\n",
+    "                            if i == imp_datapoints_mapping[keys]:\n",
+    "                                if truth_values[i] == \"\":\n",
+    "                                    if truth_values[i] == generated_values[i]:\n",
+    "                                        results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
+    "                                    else:\n",
+    "                                        results[i][\"FP\"] = results[i][\"FP\"] + 1                                        \n",
+    "                                else:\n",
+    "                                    if truth_values[i] == generated_values[i]:\n",
+    "                                        results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
+    "                                    elif generated_values[i] != \"\":\n",
+    "                                        results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
+    "                                    else:\n",
+    "                                        results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
+    "                                    results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
+    "\n",
+    "\n",
+    "                                # if truth_values[i] == generated_values[i] and truth_values[i] == \"\":\n",
+    "                                #     results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
+    "                                # elif truth_values[i] == generated_values[i]:\n",
+    "                                #     results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
+    "                                # elif truth_values[i] != \"\" and generated_values[i] == \"\":\n",
+    "                                #     results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
+    "                                # elif truth_values[i] == \"\" and generated_values[i] != \"\":\n",
+    "                                #     results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
+    "                                # else:\n",
+    "                                #     results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
+    "                                # if truth_values[i] != \"\":\n",
+    "                                #     results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
+    "                    funds_matched += 1\n",
+    "                else:\n",
+    "                    funds_not_matched += 1\n",
+    "                    # for keys in headers:\n",
+    "                    #     if keys != \"doc_id\":\n",
+    "                    #         results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
+    "        else:\n",
+    "            # If the entire document is not found, count all funds as not matched\n",
+    "            funds_not_matched += len(funds)\n",
+    "            # for fund_name in funds:\n",
+    "            #     for keys in headers:\n",
+    "            #         if keys != \"doc_id\":\n",
+    "            #             results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
+    "\n",
+    "    return results, funds_matched, funds_not_matched\n",
+    "\n",
+    "\n",
+    "# Load the files\n",
+    "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
+    "headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
+    "\n",
+    "# Assuming doc_id is the first column and fund_name is the second column\n",
+    "doc_id_index = 0\n",
+    "fund_name_index = 1\n",
+    "\n",
+    "# Index the data\n",
+    "ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
+    "generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
+    "\n",
+    "intersection = set(headers_gen).intersection(headers_gt)\n",
+    "\n",
+    "# Convert the result back to a list (if you need it as a list)\n",
+    "intersection_list = list(intersection)\n",
+    "\n",
+    "total_fn = []\n",
+    "def calculate_metrics(tp, tn, fp, fn):\n",
+    "    \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n",
+    "    precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n",
+    "    recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n",
+    "    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n",
+    "    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n",
+    "    return precision, recall, accuracy, f1_score\n",
+    "\n",
+    "def print_metrics_table(data):\n",
+    "    # Print table headers\n",
+    "    print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
+    "    total_precision, total_recall, total_accuracy, total_f1_score, total_support= [],[],[],[],[]\n",
+    "    \n",
+    "    total_tp = []\n",
+    "    total_tn = []\n",
+    "    total_fp = []\n",
+    "    #total_fn = []\n",
+    "    # Calculate and print metrics for each item\n",
+    "    for keys in imp_datapoints:\n",
+    "        try:\n",
+    "            key = imp_datapoints_mapping[keys]\n",
+    "            values = data[key]\n",
+    "            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
+    "            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
+    "            total_precision.append(precision)\n",
+    "            total_recall.append(recall)\n",
+    "            total_accuracy.append(accuracy)\n",
+    "            total_f1_score.append(f1_score)\n",
+    "            total_support.append(values[\"SUPPORT\"])\n",
+    "            total_tp.append(tp)\n",
+    "            total_tn.append(tn)\n",
+    "            total_fp.append(fp)\n",
+    "            total_fn.append(fn)\n",
+    "\n",
+    "            if values[\"SUPPORT\"] > 0 and key > \"\":\n",
+    "                print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(keys, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
+    "        except:\n",
+    "            pass\n",
+    "    print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", statistics.mean(total_precision), statistics.mean(total_recall), statistics.mean(total_accuracy), statistics.mean(total_f1_score), sum(total_support), sum(total_tp), sum(total_tn), sum(total_fp), sum(total_fn)))\n",
+    "    \n",
+    "def create_metrics_df(data):\n",
+    "    # Define a list to hold data for DataFrame\n",
+    "    rows = []\n",
+    "    \n",
+    "    # Iterate through each metric item\n",
+    "    for key in imp_datapoints:\n",
+    "        try:\n",
+    "            mapped_key = imp_datapoints_mapping[key]\n",
+    "            values = data[mapped_key]\n",
+    "            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
+    "            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
+    "            \n",
+    "            # Only add rows where SUPPORT > 0\n",
+    "            if values[\"SUPPORT\"] > 0:\n",
+    "                row = {\n",
+    "                    \"Metric\": key,\n",
+    "                    \"Precision\": precision,\n",
+    "                    \"Recall\": recall,\n",
+    "                    \"Accuracy\": accuracy,\n",
+    "                    \"F1-Score\": f1_score,\n",
+    "                    \"SUPPORT\": values[\"SUPPORT\"]\n",
+    "                }\n",
+    "                rows.append(row)\n",
+    "        except KeyError as e:\n",
+    "            continue\n",
+    "\n",
+    "    # Create a DataFrame from the list of rows\n",
+    "    df_metrics = pd.DataFrame(rows)\n",
+    "    df_metrics.reset_index(inplace=True)\n",
+    "    df_metrics.drop(columns=[\"index\"], inplace=True)\n",
+    "    print(df_metrics)\n",
+    "    return df_metrics\n",
+    "\n",
+    "\n",
+    "\n",
+    "def get_provider_mapping(file_path):\n",
+    "    df = pd.read_excel(file_path)\n",
+    "    df = (df.groupby([\"Docid\", \"ProviderName\"]).first())\n",
+    "    df.reset_index(inplace = True)\n",
+    "    return df[[\"Docid\", \"ProviderName\"]]\n",
+    "\n",
+    "\n",
+    "def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
+    "    providers_dict = {}\n",
+    "    for doc_id in generated_results_indexed:\n",
+    "        try:\n",
+    "            provider_name = (df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values)[0]\n",
+    "            if provider_name in providers_dict:\n",
+    "                providers_dict[provider_name].append(doc_id)\n",
+    "            else:\n",
+    "                providers_dict[provider_name] = []\n",
+    "                providers_dict[provider_name].append(doc_id)\n",
+    "\n",
+    "        except:\n",
+    "            pass\n",
+    "    return providers_dict\n",
+    "\n",
+    "def get_specified_doc_data(results, doc_list):\n",
+    "    provider_res = {}\n",
+    "    for doc_id in doc_list:\n",
+    "        if doc_id in results:\n",
+    "            provider_res[doc_id] = results[doc_id]\n",
+    "    return provider_res\n",
+    "\n",
+    "\n",
+    "df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
+    "\n",
+    "all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
+    "\n",
+    "\n",
+    "# for provider_name in all_provider_dict:\n",
+    "#     provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
+    "#     comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
+    "#     print(\"\\n\")\n",
+    "#     print(\"\\n\")\n",
+    "#     print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
+    "#     #create_metrics_df(comparison_results)\n",
+    "#     print_metrics_table(comparison_results)\n",
+    "#     print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
+    "#     print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
+    "\n",
+    "\n",
+    "\n",
+    "print(\"\\n\")\n",
+    "print(\"\\n\")\n",
+    "print(\"All Providers Results: \")\n",
+    "comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
+    "\n",
+    "print_metrics_table(comparison_results)\n",
+    "print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
+    "print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "All Providers Results: \n",
+      "Performance fee and cost  -  377377369  truth is null and generated -  0 SPDR® S&P Emerging Markets Carbon Control Fund\n",
+      "Performance fee and cost  -  397107472  truth is null and generated -  0 AMP Capital Specialist Diversified Fixed Income Fund\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.11   OnePath OneAnswer Frontier Investment Portfolio-OnePath Multi Asset Income Trust\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.07   OA Frontier IP-OnePath Australian Share Trust\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.33   OA Frontier Investment Portfolio- BlackRock Tactical Growth\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.02   OA Frontier Investment Portfolio- Pendal Monthly Income Plus\n",
+      "Performance fee and cost  -  401212184  truth -  0.41  and generated -  0.13   OnePath Alternatives Growth Trust\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.03   OA Frontier IP-Ausbil Australian Emerging Leaders Trust\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.15   OA Frontier IP-Perpetual Balanced Growth\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.03   OA Frontier IP-Perpetual Conservative Growth\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.06   OA Frontier IP-Platinum International\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.15   OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth\n",
+      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.01   ANZ OneAnswer Investment Portfolio - OnePath Balanced Index\n",
+      "Performance fee and cost  -  401212184  generated is null and  truth is -  0 ANZ OneAnswer Investment Portfolio - OnePath Growth Index\n",
+      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard Index Diversified Bond\n",
+      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard International Shares Index\n",
+      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard Investor Short Term Fixed Interest Fund\n",
+      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard Index Hedged International Shares Fund\n",
+      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard LifeStrategy Growth\n",
+      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard LifeStrategy Conservative\n",
+      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard LifeStrategy High Growth\n",
+      "Performance fee and cost  -  411062815  truth is null and generated -  13.98 Perpetual WFP-Perpetual Share Plus L/S\n",
+      "Performance fee and cost  -  411062815  truth -  0  and generated -  0.01   WFP Schroder Fixed Income\n",
+      "Performance fee and cost  -  411062815  truth -  0  and generated -  15.38   Perpetual Ausbil Australian Emerg Ldrs\n",
+      "Performance fee and cost  -  411062815  truth -  0.03  and generated -  0.12   WFP Macquarie Income Opportunities\n",
+      "Performance fee and cost  -  411062815  generated is null and  truth is -  0 WFP Diversified Income\n",
+      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.14 \n",
+      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.67 Telstra Property Pension\n",
+      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.01 Telstra Cash Pension\n",
+      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.01 Telstra Australian shares Pension\n",
+      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.14 Telstra Defensive growth Pension\n",
+      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.01 Telstra International shares Pension\n",
+      "Performance fee and cost  -  414751292  truth -  0.24  and generated -  0   Platinum Global Fund (Long Only)\n",
+      "Performance fee and cost  -  414751292  truth -  0.15  and generated -  0   \n",
+      "Performance fee and cost  -  414751292  truth -  0.03  and generated -  0   Platinum International Brands Fund\n",
+      "Performance fee and cost  -  414751292  truth -  0.86  and generated -  0   Platinum International Healthcare\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 \n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Ausbil Aus. Emrging Leaders\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Investors Mutual Aus. Shre\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Macquarie Inc Opportunities\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Global Share Fund\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPF - Hedged Global Share Fund\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Hedged Global Share Fund\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - IncomeBuilder\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPF - PIMCO Div. Fixed Interest\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPF - PIMCO Global Bond Fund\n",
+      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - PIMCO Global Bond Fund\n",
+      "Performance fee and cost  -  446324179  generated is null and  truth is -  0.28 Lifeplan Investment Bond - Allan Gray Australian Equity Fund\n",
+      "Performance fee and cost  -  446324179  generated is null and  truth is -  0.05 Lifeplan MLC Horizon 2-Capital Stable Open\n",
+      "Performance fee and cost  -  454036250  generated is null and  truth is -    \n",
+      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Global Value Trust -Active ETF\n",
+      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Australia Core Equity Trust - Active ETF\n",
+      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Australian Value Trust - Active ETF\n",
+      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n",
+      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Global Core Equity Tr\n",
+      "Performance fee and cost  -  550769189  truth is null and generated -  0 Acadian Global Managed Volatility Equity - Class A\n",
+      "Performance fee and cost  -  550522985  truth is null and generated -  0 RQI Global Value – Class A\n",
+      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock Australian Fixed Interest Index\n",
+      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock Australian Equity Index\n",
+      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP Generations - Alliance Capital Cash Management\n",
+      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock Property Securities Index\n",
+      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock International Equity Index (Unhedged)\n",
+      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock International Equity Index (Hedged)\n",
+      "Performance fee and cost  -  539241700  truth -  0.08  and generated -  0.05   North Professional Balanced\n",
+      "Performance fee and cost  -  539241700  truth -  0.06  and generated -  0   North Professional High Growth\n",
+      "Performance fee and cost  -  539241700  truth -  0.08  and generated -  0   North Professional Conservative\n",
+      "Performance fee and cost  -  539241700  truth -  0.08  and generated -  0   North Professional Growth\n",
+      "Performance fee and cost  -  539241700  truth -  0.09  and generated -  0   North Professional Moderately Conservative\n",
+      "Performance fee and cost  -  539261734  truth -  0.01  and generated -  0   ipac life choices Income Generator\n",
+      "Performance fee and cost  -  539261734  truth -  0.06  and generated -  0   ipac life choices Active 100\n",
+      "Performance fee and cost  -  539261734  truth -  0.08  and generated -  0   ipac life choices Active 85\n",
+      "Performance fee and cost  -  539261734  truth -  0.01  and generated -  0   ipac life choices Index 50\n",
+      "Performance fee and cost  -  539261734  truth -  0.09  and generated -  0   ipac life choices Active 50\n",
+      "Performance fee and cost  -  539261734  truth -  0.08  and generated -  0   ipac life choices Active 70\n",
+      "Performance fee and cost  -  506913190  generated is null and  truth is -  0.03 FC W Pen-CFS TTR Moderate\n",
+      "Performance fee and cost  -  506913190  generated is null and  truth is -  0.04 FC W Pen-CFS TTR Growth\n",
+      "Performance fee and cost  -  506913190  generated is null and  truth is -  0.47 \n",
+      "Performance fee and cost  -  553449663  truth -  0  and generated -  0.07   AMP Capital Specialist International Share (Hedged) Fund\n",
+      "Performance fee and cost  -  539266874  truth -  0.03  and generated -  0   SUMMIT Select - Active High Growth Units\n",
+      "Performance fee and cost  -  539266874  truth -  0.05  and generated -  0   SUMMIT Select - Active Moderately Defensive\n",
+      "Performance fee and cost  -  539266874  truth -  0.05  and generated -  0   SUMMIT Select - Active Growth Units\n",
+      "Performance fee and cost  -  539266874  truth -  0.05  and generated -  0   SUMMIT Select - Active Balanced\n",
+      "Performance fee and cost  -  539266874  truth -  0.06  and generated -  0   SUMMIT Select - Active Defensive Units\n",
+      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Active High Growth\n",
+      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Active Moderately Defensive\n",
+      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Active Growth\n",
+      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Balanced\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Future Goals BTFM\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BTFM Asian Share\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT International Share BTFM\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Smaller Companies BTFM\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Investment Funds - BT TIME Fund\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT European Share Growth\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT American Share Growth\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Imputation Share BTFM\n",
+      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 \n",
+      "Performance fee and cost  -  521606755  truth is null and generated -  0 CFS Index Diversified\n",
+      "Performance fee and cost  -  557526129  truth is null and generated -  0 Fortlake Real-Income Fund\n",
+      "Performance fee and cost  -  540028470  truth is null and generated -  0 CFS Wholesale Index Australian Share\n",
+      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n",
+      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Australian Value Trust - Active ETF\n",
+      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Global Value Trust -Active ETF\n",
+      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Australia Core Equity Trust - Active ETF\n",
+      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Global Small Company Trust\n",
+      "Performance fee and cost  -  557362553  truth is null and generated -  0 JPMorgan Global Select Equity Fund\n",
+      "Performance fee and cost  -  527969661  truth is null and generated -  0 JPMorgan Global Equity Premium Income (Hedged) Complex ETF\n",
+      "Performance fee and cost  -  384508026  generated is null and  truth is -  0 Mercer Multi-manager High Growth Fund\n",
+      "Performance fee and cost  -  384508026  generated is null and  truth is -  0 Mercer Multi-manager Growth Fund\n",
+      "Performance fee and cost  -  384508026  generated is null and  truth is -  0 \n",
+      "total -  452.72727272727275\n",
+      "Metric                                            \tPrecision \tRecall    \tAccuracy  \tF1-Score  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
+      "Management Fee and Costs                          \t0.8790    \t0.9250    \t0.8213    \t0.9014    \t494       \t407       \t2         \t56        \t33        \n",
+      "Management Fee                                    \t0.8985    \t0.9265    \t0.8394    \t0.9123    \t494       \t416       \t2         \t47        \t33        \n",
+      "Performance fee and cost                          \t0.7871    \t0.8472    \t0.7791    \t0.8161    \t327       \t244       \t144       \t66        \t44        \n",
+      "Interposed vehicle Performance fee and Costs      \t0.5000    \t1.0000    \t0.9237    \t0.6667    \t39        \t38        \t422       \t38        \t0         \n",
+      "Administration Fee and costs                      \t0.9787    \t0.9388    \t0.9839    \t0.9583    \t98        \t92        \t398       \t2         \t6         \n",
+      "Total Annual Dollar Based Charges                 \t0.8165    \t1.0000    \t0.9598    \t0.8990    \t90        \t89        \t389       \t20        \t0         \n",
+      "Buy Spread                                        \t0.8957    \t0.8910    \t0.8394    \t0.8933    \t405       \t335       \t83        \t39        \t41        \n",
+      "Sell Spread                                       \t0.9064    \t0.8921    \t0.8474    \t0.8992    \t405       \t339       \t83        \t35        \t41        \n",
+      "Minimum Initial Investment                        \t0.8571    \t0.9671    \t0.8815    \t0.9088    \t310       \t294       \t145       \t49        \t10        \n",
+      "Benchmark                                         \t0.6402    \t0.8582    \t0.8233    \t0.7333    \t173       \t121       \t289       \t68        \t20        \n",
+      "TOTAL                                             \t0.8159    \t0.9246    \t0.8699    \t0.8588    \t2835      \t2375      \t1957      \t420       \t228       \n",
+      "Total Funds Matched - 498\n",
+      "Total Funds Not Matched - 28\n",
+      "Percentage of Funds Matched - 94.67680608365019\n"
+     ]
+    }
+   ],
+   "source": [
+    "import openpyxl\n",
+    "from collections import defaultdict\n",
+    "import pandas as pd\n",
+    "import statistics\n",
+    "\n",
+    "\n",
+    "funds_matched = 0\n",
+    "funds_not_matched = 0\n",
+    "def load_excel(filepath, header_row_index):\n",
+    "    \"\"\"Read the active sheet of a workbook into a header list and data rows.\n",
+    "\n",
+    "    The row at ``header_row_index`` (0-based) becomes the header, the rows\n",
+    "    after it become data, and the rows before it are ignored. Empty cells\n",
+    "    (None) are replaced by an empty string in both.\n",
+    "    \"\"\"\n",
+    "    workbook = openpyxl.load_workbook(filepath, data_only=True)\n",
+    "    worksheet = workbook.active\n",
+    "\n",
+    "    def blank_if_none(cells):\n",
+    "        return [\"\" if cell is None else cell for cell in cells]\n",
+    "\n",
+    "    headers, data = [], []\n",
+    "    for position, cells in enumerate(worksheet.iter_rows(values_only=True)):\n",
+    "        if position < header_row_index:\n",
+    "            continue\n",
+    "        if position == header_row_index:\n",
+    "            headers = blank_if_none(cells)\n",
+    "        else:\n",
+    "            data.append(blank_if_none(cells))\n",
+    "    return headers, data\n",
+    "\n",
+    "def index_data_by_key(data, key_index, secondary_key_index, header):\n",
+    "    \"\"\"Index rows as ``{doc_id: {fund_name: {column: value}}}``.\n",
+    "\n",
+    "    The key columns are found by the header names \"doc_id\" and \"fund_name\";\n",
+    "    ``key_index`` and ``secondary_key_index`` are kept for backward\n",
+    "    compatibility and are not used. doc_id is stored as int, fund_name as\n",
+    "    str, and every other cell goes through ``convert_if_number``. Rows with\n",
+    "    a blank doc_id are skipped; a repeated (doc_id, fund_name) pair keeps\n",
+    "    the last row.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if there are rows to index but ``header`` has no\n",
+    "            \"doc_id\" or \"fund_name\" column.\n",
+    "    \"\"\"\n",
+    "    indexed_data = defaultdict(dict)\n",
+    "\n",
+    "    if data:\n",
+    "        missing = [name for name in (\"doc_id\", \"fund_name\") if name not in header]\n",
+    "        if missing:\n",
+    "            raise ValueError(\"missing key column(s): \" + \", \".join(missing))\n",
+    "\n",
+    "    for row in data:\n",
+    "        # Reset the keys for every row so that a value can never leak in\n",
+    "        # from the previous row.\n",
+    "        primary_key = None\n",
+    "        secondary_key = None\n",
+    "        row_data = {}\n",
+    "        for column, cell in zip(header, row):\n",
+    "            if column == \"doc_id\":\n",
+    "                # Blank cells arrive as \"\" from load_excel; int(\"\") would raise.\n",
+    "                primary_key = None if cell in (\"\", None) else int(cell)\n",
+    "            elif column == \"fund_name\":\n",
+    "                secondary_key = str(cell)\n",
+    "            else:\n",
+    "                row_data[column] = convert_if_number(cell)\n",
+    "        if primary_key is None or secondary_key is None:\n",
+    "            continue\n",
+    "        indexed_data[primary_key][secondary_key] = row_data\n",
+    "    return indexed_data\n",
+    "\n",
+    "def convert_if_number(value):\n",
+    "    \"\"\"Attempt to convert value to a float or int, otherwise return as string.\"\"\"\n",
+    "    try:\n",
+    "        float_value = round(float(value), 2)\n",
+    "        int_value = int(float_value)\n",
+    "        return int_value if int_value == float_value else float_value\n",
+    "    except (ValueError, TypeError):\n",
+    "        return value\n",
+    "\n",
+    "def compare_values(value1, value2):\n",
+    "    \"\"\"Tell whether two values are equal after numeric normalisation.\n",
+    "\n",
+    "    Both sides go through ``convert_if_number`` first; values it leaves\n",
+    "    untouched are compared as they are.\n",
+    "    \"\"\"\n",
+    "    return convert_if_number(value1) == convert_if_number(value2)\n",
+    "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n",
+    "    \"\"\"Score generated values against the ground truth, fund by fund.\n",
+    "\n",
+    "    Walks every (doc_id, fund_name) of ``ground_truth`` and looks it up in\n",
+    "    ``generated_results``. For each fund found, every column of\n",
+    "    ``intersection_list`` that is the mapped name of an ``imp_datapoints``\n",
+    "    label is classified as:\n",
+    "\n",
+    "    * TN - truth is blank and generated is blank\n",
+    "    * FP - truth is blank but generated is filled, or both are filled and differ\n",
+    "    * TP - truth is filled and generated is equal\n",
+    "    * FN - truth is filled but generated is blank\n",
+    "\n",
+    "    SUPPORT counts the comparisons in which truth is filled. Mismatches of\n",
+    "    labels containing \"Performance fee and cost\" are printed.\n",
+    "\n",
+    "    ``doc_id_index``, ``fund_name_index`` and the incoming ``funds_matched`` /\n",
+    "    ``funds_not_matched`` values are not used: both counters restart from\n",
+    "    zero on every call.\n",
+    "\n",
+    "    Returns:\n",
+    "        (results, funds_matched, funds_not_matched), where ``results`` maps\n",
+    "        every header except \"doc_id\" to its counter dict.\n",
+    "    \"\"\"\n",
+    "    funds_matched, funds_not_matched = 0, 0\n",
+    "    results = {\n",
+    "        column: {\"TP\": 0, \"TN\": 0, \"FP\": 0, \"FN\": 0, \"SUPPORT\": 0}\n",
+    "        for column in headers\n",
+    "        if column != \"doc_id\"\n",
+    "    }\n",
+    "\n",
+    "    # Resolve the (column, label) pairs once: they do not depend on the fund.\n",
+    "    scored_columns = [\n",
+    "        (column, label)\n",
+    "        for column in intersection_list\n",
+    "        for label in imp_datapoints\n",
+    "        if column == imp_datapoints_mapping[label]\n",
+    "    ]\n",
+    "\n",
+    "    # Iterate over the ground truth and look each fund up in the generated results.\n",
+    "    for doc_id, funds in ground_truth.items():\n",
+    "        if doc_id not in generated_results:\n",
+    "            # The whole document is missing, so none of its funds can match.\n",
+    "            funds_not_matched += len(funds)\n",
+    "            continue\n",
+    "        generated_funds = generated_results[doc_id]\n",
+    "        for fund_name, truth_values in funds.items():\n",
+    "            if fund_name not in generated_funds:\n",
+    "                funds_not_matched += 1\n",
+    "                continue\n",
+    "            generated_values = generated_funds[fund_name]\n",
+    "            for column, label in scored_columns:\n",
+    "                counts = results[column]\n",
+    "                truth = truth_values[column]\n",
+    "                generated = generated_values[column]\n",
+    "                report = \"Performance fee and cost\" in label\n",
+    "                if truth == \"\":\n",
+    "                    if truth == generated:\n",
+    "                        counts[\"TN\"] += 1\n",
+    "                    else:\n",
+    "                        counts[\"FP\"] += 1\n",
+    "                        if report:\n",
+    "                            print(label, \" - \" , doc_id, \" truth is null and generated - \", generated, fund_name)\n",
+    "                    continue\n",
+    "                counts[\"SUPPORT\"] += 1\n",
+    "                if truth == generated:\n",
+    "                    counts[\"TP\"] += 1\n",
+    "                elif generated != \"\":\n",
+    "                    counts[\"FP\"] += 1\n",
+    "                    if report:\n",
+    "                        print(label, \" - \" , doc_id, \" truth - \", truth, \" and generated - \", generated, \" \", fund_name)\n",
+    "                else:\n",
+    "                    counts[\"FN\"] += 1\n",
+    "                    if report:\n",
+    "                        print(label, \" - \" , doc_id, \" generated is null and  truth is - \", truth, fund_name)\n",
+    "            funds_matched += 1\n",
+    "    return results, funds_matched, funds_not_matched\n",
+    "\n",
+    "\n",
+    "# Read both workbooks; row 0 of each sheet holds the column names.\n",
+    "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
+    "headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
+    "\n",
+    "# doc_id is expected in the first column and fund_name in the second one.\n",
+    "doc_id_index, fund_name_index = 0, 1\n",
+    "\n",
+    "# Build the {doc_id: {fund_name: row}} lookups for both sides.\n",
+    "ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
+    "generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
+    "\n",
+    "# Columns present in both files, as a set and as a list.\n",
+    "intersection = set(headers_gen).intersection(headers_gt)\n",
+    "intersection_list = list(intersection)\n",
+    "\n",
+    "total_fn = []\n",
+    "def calculate_metrics(tp, tn, fp, fn):\n",
+    "    \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n",
+    "    precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n",
+    "    recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n",
+    "    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n",
+    "    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n",
+    "    return precision, recall, accuracy, f1_score\n",
+    "\n",
+    "def print_metrics_table(data):\n",
+    "    \"\"\"Print one metrics row per datapoint, followed by a TOTAL row.\n",
+    "\n",
+    "    ``data`` maps a column name to a dict holding the \"TP\", \"TN\", \"FP\", \"FN\"\n",
+    "    and \"SUPPORT\" counts. Labels of ``imp_datapoints`` whose mapped column is\n",
+    "    missing from ``data`` are skipped. A row is printed only when SUPPORT is\n",
+    "    positive, but every datapoint that was found contributes to TOTAL:\n",
+    "    precision, recall, accuracy and F1 are unweighted means over the\n",
+    "    datapoints, the counts are sums.\n",
+    "    \"\"\"\n",
+    "    header_format = \"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\"\n",
+    "    row_format = \"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\"\n",
+    "    print(header_format.format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
+    "\n",
+    "    # Every accumulator is local so that a second call starts from zero\n",
+    "    # instead of adding to the totals of the previous call.\n",
+    "    total_precision, total_recall, total_accuracy, total_f1_score = [], [], [], []\n",
+    "    total_support, total_tp, total_tn, total_fp, total_fn = 0, 0, 0, 0, 0\n",
+    "\n",
+    "    for label in imp_datapoints:\n",
+    "        try:\n",
+    "            key = imp_datapoints_mapping[label]\n",
+    "            values = data[key]\n",
+    "            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
+    "            support = values[\"SUPPORT\"]\n",
+    "        except KeyError:\n",
+    "            # The datapoint is unmapped or absent from the results: skip it.\n",
+    "            continue\n",
+    "\n",
+    "        precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
+    "        total_precision.append(precision)\n",
+    "        total_recall.append(recall)\n",
+    "        total_accuracy.append(accuracy)\n",
+    "        total_f1_score.append(f1_score)\n",
+    "        total_support += support\n",
+    "        total_tp += tp\n",
+    "        total_tn += tn\n",
+    "        total_fp += fp\n",
+    "        total_fn += fn\n",
+    "\n",
+    "        if support > 0 and key:\n",
+    "            print(row_format.format(label, precision, recall, accuracy, f1_score, support, tp, tn, fp, fn))\n",
+    "\n",
+    "    def average(scores):\n",
+    "        # statistics.mean raises StatisticsError on an empty list.\n",
+    "        return statistics.mean(scores) if scores else 0\n",
+    "\n",
+    "    print(row_format.format(\"TOTAL\", average(total_precision), average(total_recall), average(total_accuracy), average(total_f1_score), total_support, total_tp, total_tn, total_fp, total_fn))\n",
+    "    \n",
+    "def create_metrics_df(data):\n",
+    "    \"\"\"Build, print and return a DataFrame of metrics per datapoint.\n",
+    "\n",
+    "    Only datapoints found in ``data`` with a positive SUPPORT get a row.\n",
+    "    The columns are Metric, Precision, Recall, Accuracy, F1-Score and SUPPORT.\n",
+    "    \"\"\"\n",
+    "    records = []\n",
+    "\n",
+    "    for label in imp_datapoints:\n",
+    "        try:\n",
+    "            counts = data[imp_datapoints_mapping[label]]\n",
+    "            tp, tn, fp, fn = counts['TP'], counts['TN'], counts['FP'], counts['FN']\n",
+    "            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
+    "\n",
+    "            # Datapoints without any ground-truth value are left out.\n",
+    "            if counts[\"SUPPORT\"] > 0:\n",
+    "                records.append({\n",
+    "                    \"Metric\": label,\n",
+    "                    \"Precision\": precision,\n",
+    "                    \"Recall\": recall,\n",
+    "                    \"Accuracy\": accuracy,\n",
+    "                    \"F1-Score\": f1_score,\n",
+    "                    \"SUPPORT\": counts[\"SUPPORT\"]\n",
+    "                })\n",
+    "        except KeyError:\n",
+    "            continue\n",
+    "\n",
+    "    # One row per record, with a fresh 0..n-1 index.\n",
+    "    df_metrics = pd.DataFrame(records)\n",
+    "    df_metrics.reset_index(drop=True, inplace=True)\n",
+    "    print(df_metrics)\n",
+    "    return df_metrics\n",
+    "\n",
+    "\n",
+    "\n",
+    "def get_provider_mapping(file_path):\n",
+    "    \"\"\"Load the distinct (Docid, ProviderName) pairs from an Excel file.\"\"\"\n",
+    "    mapping = pd.read_excel(file_path)\n",
+    "    # Grouping on both columns leaves one row per distinct pair.\n",
+    "    pairs = mapping.groupby([\"Docid\", \"ProviderName\"]).first().reset_index()\n",
+    "    return pairs[[\"Docid\", \"ProviderName\"]]\n",
+    "\n",
+    "\n",
+    "def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
+    "    providers_dict = {}\n",
+    "    for doc_id in generated_results_indexed:\n",
+    "        try:\n",
+    "            provider_name = (df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values)[0]\n",
+    "            if provider_name in providers_dict:\n",
+    "                providers_dict[provider_name].append(doc_id)\n",
+    "            else:\n",
+    "                providers_dict[provider_name] = []\n",
+    "                providers_dict[provider_name].append(doc_id)\n",
+    "\n",
+    "        except:\n",
+    "            pass\n",
+    "    return providers_dict\n",
+    "\n",
+    "def get_specified_doc_data(results, doc_list):\n",
+    "    provider_res = {}\n",
+    "    for doc_id in doc_list:\n",
+    "        if doc_id in results:\n",
+    "            provider_res[doc_id] = results[doc_id]\n",
+    "    return provider_res\n",
+    "\n",
+    "\n",
+    "df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
+    "\n",
+    "all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
+    "\n",
+    "\n",
+    "# for provider_name in all_provider_dict:\n",
+    "#     provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
+    "#     comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
+    "#     print(\"\\n\")\n",
+    "#     print(\"\\n\")\n",
+    "#     print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
+    "#     #create_metrics_df(comparison_results)\n",
+    "#     print_metrics_table(comparison_results)\n",
+    "#     print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
+    "#     print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
+    "\n",
+    "\n",
+    "\n",
+    "print(\"\\n\")\n",
+    "print(\"\\n\")\n",
+    "print(\"All Providers Results: \")\n",
+    "comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
+    "\n",
+    "print_metrics_table(comparison_results)\n",
+    "print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
+    "# Guard the ratio: with no fund compared at all the denominator is zero.\n",
+    "total_funds = funds_matched + funds_not_matched\n",
+    "matched_percentage = (funds_matched/total_funds)*100 if total_funds else 0\n",
+    "print(\"Percentage of Funds Matched - \" + str(matched_percentage))\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "blade",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/utils/pdf_download.py
+++ b/utils/pdf_download.py
@ -4,6 +4,7 @@ import os
 import platform
 from utils.logger import logger
 import dotenv
+import certifi
 # loads .env file with your OPENAI_API_KEY
 dotenv.load_dotenv()
  
@ -38,9 +39,13 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
    else:
        if os_name == "windows":
            ACCESS_KEY = os.getenv('ACCESS_KEY')
-            SECRET_KEY = os.getenv('SECRET_KEY')  
-            session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
-            s3 = session.client('s3')
+            SECRET_KEY = os.getenv('SECRET_KEY')
+            AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
+            s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), 
+                            aws_access_key_id=ACCESS_KEY,
+                            aws_secret_access_key=SECRET_KEY,
+                            aws_session_token=AWS_SESSION_TOKEN
+                            )
        else:
            s3 = boto3.client('s3')