{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n",
"                  \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n",
"                  \"Minimum Initial Investment\", \"Benchmark\"]\n",
"\n",
"imp_datapoints_mapping = {\n",
"    \"Management Fee and Costs\": \"management_fee_and_costs\",\n",
"    \"Management Fee\": \"management_fee\",\n",
"    \"Performance fee and cost\": \"performance_fee_costs\",\n",
"    \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n",
"    \"Administration Fee and costs\": \"administration_fees\",\n",
"    \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n",
"    \"Buy Spread\": \"buy_spread\",\n",
"    \"Sell Spread\": \"sell_spread\",\n",
"    \"Performance Fee\": \"PerformanceFeeCharged\",\n",
"    \"Minimum Initial Investment\": \"minimum_initial_investment\",\n",
"    \"Benchmark\": \"benchmark_name\"\n",
"}\n"
]
},
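{
"cell_type": "markdown",
"metadata": {},
"source": [
"`imp_datapoints` holds the display labels of the datapoints we score, and `imp_datapoints_mapping` translates each label to the column name used in the ground-truth and generated Excel files. A minimal lookup sketch (illustrative only):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: translate a display label to its spreadsheet column name\n",
"label = \"Buy Spread\"\n",
"column = imp_datapoints_mapping[label]\n",
"print(label, \"->\", column)  # Buy Spread -> buy_spread"
]
},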
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n",
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n"
]
},
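{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check, assuming the paths above are reachable from this kernel: confirm the files exist and peek at the ground-truth columns before running the comparison."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (assumes the paths above are reachable)\n",
"import os\n",
"import pandas as pd\n",
"\n",
"for path in [path_ground_truth, path_generated_results, provider_mapping_file_path]:\n",
"    print(os.path.exists(path), \"-\", path)\n",
"\n",
"# Peek at the ground-truth header row\n",
"print(pd.read_excel(path_ground_truth, nrows=2).columns.tolist())"
]
},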
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"\n",
"All Providers Results: \n",
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
"Management Fee and Costs \t0.8790 \t0.9250 \t0.8213 \t0.9014 \t494 \t407 \t2 \t56 \t33 \n",
"Management Fee \t0.8985 \t0.9265 \t0.8394 \t0.9123 \t494 \t416 \t2 \t47 \t33 \n",
"Performance fee and cost \t0.7871 \t0.8472 \t0.7791 \t0.8161 \t327 \t244 \t144 \t66 \t44 \n",
"Interposed vehicle Performance fee and Costs \t0.5000 \t1.0000 \t0.9237 \t0.6667 \t39 \t38 \t422 \t38 \t0 \n",
"Administration Fee and costs \t0.9787 \t0.9388 \t0.9839 \t0.9583 \t98 \t92 \t398 \t2 \t6 \n",
"Total Annual Dollar Based Charges \t0.8165 \t1.0000 \t0.9598 \t0.8990 \t90 \t89 \t389 \t20 \t0 \n",
"Buy Spread \t0.8957 \t0.8910 \t0.8394 \t0.8933 \t405 \t335 \t83 \t39 \t41 \n",
"Sell Spread \t0.9064 \t0.8921 \t0.8474 \t0.8992 \t405 \t339 \t83 \t35 \t41 \n",
"Minimum Initial Investment \t0.8571 \t0.9671 \t0.8815 \t0.9088 \t310 \t294 \t145 \t49 \t10 \n",
"Benchmark \t0.6402 \t0.8582 \t0.8233 \t0.7333 \t173 \t121 \t289 \t68 \t20 \n",
"TOTAL \t0.8159 \t0.9246 \t0.8699 \t0.8588 \t2835 \t2375 \t1957 \t420 \t228 \n",
"Total Funds Matched - 498\n",
"Total Funds Not Matched - 28\n",
"Percentage of Funds Matched - 94.67680608365019\n"
]
}
],
"source": [
"import openpyxl\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import statistics\n",
"\n",
"funds_matched = 0\n",
"funds_not_matched = 0\n",
"\n",
"def load_excel(filepath, header_row_index):\n",
"    \"\"\"Load an Excel file and use the specified row as the header.\"\"\"\n",
"    wb = openpyxl.load_workbook(filepath, data_only=True)\n",
"    sheet = wb.active\n",
"    headers = []\n",
"    data = []\n",
"\n",
"    for index, row in enumerate(sheet.iter_rows(values_only=True)):\n",
"        if index == header_row_index:\n",
"            headers = [cell if cell is not None else \"\" for cell in row]\n",
"        elif index > header_row_index:\n",
"            data.append([cell if cell is not None else \"\" for cell in row])\n",
"\n",
"    return headers, data\n",
"\n",
"def index_data_by_key(data, key_index, secondary_key_index, header):\n",
"    \"\"\"Index data by primary and secondary keys (doc_id and fund_name).\"\"\"\n",
"    indexed_data = defaultdict(dict)\n",
"\n",
"    for row in data:\n",
"        row_data = {}\n",
"        # Store the entire row, which will be useful for full row comparison\n",
"        for i in range(len(row)):\n",
"            if header[i] == \"doc_id\":\n",
"                primary_key = int(row[i])\n",
"            elif header[i] == \"fund_name\":\n",
"                secondary_key = str(row[i])\n",
"            else:\n",
"                row_data[header[i]] = convert_if_number(row[i])\n",
"        indexed_data[primary_key][secondary_key] = row_data\n",
"    return indexed_data\n",
"\n",
"def convert_if_number(value):\n",
"    \"\"\"Attempt to convert value to a float or int, otherwise return as string.\"\"\"\n",
"    try:\n",
"        float_value = round(float(value), 2)\n",
"        int_value = int(float_value)\n",
"        return int_value if int_value == float_value else float_value\n",
"    except (ValueError, TypeError):\n",
"        return value\n",
"\n",
"def compare_values(value1, value2):\n",
"    \"\"\"Convert values to numbers if possible and compare, otherwise compare as strings.\"\"\"\n",
"    value1 = convert_if_number(value1)\n",
"    value2 = convert_if_number(value2)\n",
"    return value1 == value2\n",
"\n",
"def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n",
"    \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n",
"    results = {}\n",
"    funds_matched, funds_not_matched = 0, 0\n",
"    # Initialize result dictionaries for each column except 'doc_id'\n",
"    for keys in headers:\n",
"        if keys != \"doc_id\":\n",
"            results[keys] = {\"TP\": 0, \"TN\": 0, \"FP\": 0, \"FN\": 0, \"SUPPORT\": 0}\n",
"\n",
"    for doc_id, funds in ground_truth.items():\n",
"        if doc_id in generated_results:\n",
"            for fund_name, truth_values in funds.items():\n",
"                if fund_name in generated_results[doc_id]:\n",
"                    generated_values = generated_results[doc_id][fund_name]\n",
"                    # Compare every shared column that is one of the important datapoints\n",
"                    for i in intersection_list:\n",
"                        for keys in imp_datapoints:\n",
"                            if i == imp_datapoints_mapping[keys]:\n",
"                                if truth_values[i] == \"\":\n",
"                                    # Empty ground truth: a matching empty prediction is a TN,\n",
"                                    # a non-empty one is a spurious extraction (FP)\n",
"                                    if truth_values[i] == generated_values[i]:\n",
"                                        results[i][\"TN\"] += 1\n",
"                                    else:\n",
"                                        results[i][\"FP\"] += 1\n",
"                                else:\n",
"                                    # Non-empty ground truth: exact match is a TP, a wrong\n",
"                                    # non-empty prediction is an FP, a missing one is an FN;\n",
"                                    # SUPPORT counts only comparisons with non-empty truth\n",
"                                    if truth_values[i] == generated_values[i]:\n",
"                                        results[i][\"TP\"] += 1\n",
"                                    elif generated_values[i] != \"\":\n",
"                                        results[i][\"FP\"] += 1\n",
"                                    else:\n",
"                                        results[i][\"FN\"] += 1\n",
"                                    results[i][\"SUPPORT\"] += 1\n",
"                    funds_matched += 1\n",
"                else:\n",
"                    funds_not_matched += 1\n",
"        else:\n",
"            # If the entire document is not found, count all funds as not matched\n",
"            funds_not_matched += len(funds)\n",
"\n",
"    return results, funds_matched, funds_not_matched\n",
"\n",
"\n",
"# Load the files\n",
"headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
"headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
"\n",
"# Assuming doc_id is the first column and fund_name is the second column\n",
"doc_id_index = 0\n",
"fund_name_index = 1\n",
"\n",
"# Index the data\n",
"ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
"generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
"\n",
"# Score only the columns present in both files\n",
"intersection = set(headers_gen).intersection(headers_gt)\n",
"\n",
"# Convert the result back to a list (if you need it as a list)\n",
"intersection_list = list(intersection)\n",
"\n",
"def calculate_metrics(tp, tn, fp, fn):\n",
"    \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n",
"    precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n",
"    recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n",
"    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n",
"    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n",
"    return precision, recall, accuracy, f1_score\n",
"\n",
"def print_metrics_table(data):\n",
"    # Print table headers\n",
"    print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
"    total_precision, total_recall, total_accuracy, total_f1_score, total_support = [], [], [], [], []\n",
"\n",
"    total_tp = []\n",
"    total_tn = []\n",
"    total_fp = []\n",
"    total_fn = []\n",
"    # Calculate and print metrics for each item\n",
"    for keys in imp_datapoints:\n",
"        try:\n",
"            key = imp_datapoints_mapping[keys]\n",
"            values = data[key]\n",
"            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
"            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
"            total_precision.append(precision)\n",
"            total_recall.append(recall)\n",
"            total_accuracy.append(accuracy)\n",
"            total_f1_score.append(f1_score)\n",
"            total_support.append(values[\"SUPPORT\"])\n",
"            total_tp.append(tp)\n",
"            total_tn.append(tn)\n",
"            total_fp.append(fp)\n",
"            total_fn.append(fn)\n",
"\n",
"            if values[\"SUPPORT\"] > 0 and key != \"\":\n",
"                print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(keys, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
"        except KeyError:\n",
"            # Datapoint missing from the results; skip it\n",
"            pass\n",
"    print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", statistics.mean(total_precision), statistics.mean(total_recall), statistics.mean(total_accuracy), statistics.mean(total_f1_score), sum(total_support), sum(total_tp), sum(total_tn), sum(total_fp), sum(total_fn)))\n",
"\n",
"def create_metrics_df(data):\n",
"    # Define a list to hold data for DataFrame\n",
"    rows = []\n",
"\n",
"    # Iterate through each metric item\n",
"    for key in imp_datapoints:\n",
"        try:\n",
"            mapped_key = imp_datapoints_mapping[key]\n",
"            values = data[mapped_key]\n",
"            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
"            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
"\n",
"            # Only add rows where SUPPORT > 0\n",
"            if values[\"SUPPORT\"] > 0:\n",
"                row = {\n",
"                    \"Metric\": key,\n",
"                    \"Precision\": precision,\n",
"                    \"Recall\": recall,\n",
"                    \"Accuracy\": accuracy,\n",
"                    \"F1-Score\": f1_score,\n",
"                    \"SUPPORT\": values[\"SUPPORT\"]\n",
"                }\n",
"                rows.append(row)\n",
"        except KeyError:\n",
"            continue\n",
"\n",
"    # Create a DataFrame from the list of rows\n",
"    df_metrics = pd.DataFrame(rows)\n",
"    df_metrics.reset_index(drop=True, inplace=True)\n",
"    print(df_metrics)\n",
"    return df_metrics\n",
"\n",
"\n",
"def get_provider_mapping(file_path):\n",
"    \"\"\"Return one (Docid, ProviderName) row per document.\"\"\"\n",
"    df = pd.read_excel(file_path)\n",
"    df = df.groupby([\"Docid\", \"ProviderName\"]).first()\n",
"    df.reset_index(inplace=True)\n",
"    return df[[\"Docid\", \"ProviderName\"]]\n",
"\n",
"\n",
"def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
"    \"\"\"Group the generated doc_ids by provider name.\"\"\"\n",
"    providers_dict = {}\n",
"    for doc_id in generated_results_indexed:\n",
"        try:\n",
"            provider_name = df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values[0]\n",
"            if provider_name in providers_dict:\n",
"                providers_dict[provider_name].append(doc_id)\n",
"            else:\n",
"                providers_dict[provider_name] = [doc_id]\n",
"        except IndexError:\n",
"            # doc_id has no entry in the provider mapping; skip it\n",
"            pass\n",
"    return providers_dict\n",
"\n",
"def get_specified_doc_data(results, doc_list):\n",
"    \"\"\"Keep only the documents listed in doc_list.\"\"\"\n",
"    provider_res = {}\n",
"    for doc_id in doc_list:\n",
"        if doc_id in results:\n",
"            provider_res[doc_id] = results[doc_id]\n",
"    return provider_res\n",
"\n",
"\n",
"df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
"\n",
"all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
"\n",
"\n",
"# Per-provider breakdown (optional):\n",
"# for provider_name in all_provider_dict:\n",
"#     provider_wise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
"#     comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_wise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched)\n",
"#     print(\"\\n\")\n",
"#     print(\"\\n\")\n",
"#     print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
"#     # create_metrics_df(comparison_results)\n",
"#     print_metrics_table(comparison_results)\n",
"#     print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"#     print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
"\n",
"\n",
"print(\"\\n\")\n",
"print(\"\\n\")\n",
"print(\"All Providers Results: \")\n",
"comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched)\n",
"\n",
"print_metrics_table(comparison_results)\n",
"print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n"
]
},
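{
"cell_type": "markdown",
"metadata": {},
"source": [
"The table above follows the usual confusion-matrix definitions, computed per datapoint; the TOTAL row averages the per-datapoint metrics and sums the raw counts.\n",
"\n",
"$$\\mathrm{Precision} = \\frac{TP}{TP + FP}, \\qquad \\mathrm{Recall} = \\frac{TP}{TP + FN}$$\n",
"\n",
"$$\\mathrm{Accuracy} = \\frac{TP + TN}{TP + TN + FP + FN}, \\qquad F_1 = \\frac{2 \\cdot \\mathrm{Precision} \\cdot \\mathrm{Recall}}{\\mathrm{Precision} + \\mathrm{Recall}}$$\n",
"\n",
"As a quick check, the Benchmark row above (TP=121, TN=289, FP=68, FN=20) reproduces the printed values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Worked example: recompute the Benchmark row from the table above\n",
"precision, recall, accuracy, f1_score = calculate_metrics(tp=121, tn=289, fp=68, fn=20)\n",
"print(f\"{precision:.4f} {recall:.4f} {accuracy:.4f} {f1_score:.4f}\")\n",
"# Expected: 0.6402 0.8582 0.8233 0.7333"
]
},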
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"\n",
"All Providers Results: \n",
"Document List File - None\n",
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.8907 \t0.9513 \t0.8525 \t0.9200 \t457 \t391 \t2 \t48 \t20 \n",
"management_fee \t0.9043 \t0.9520 \t0.8655 \t0.9276 \t457 \t397 \t2 \t42 \t20 \n",
"performance_fee_costs \t0.8408 \t0.8556 \t0.8113 \t0.8482 \t303 \t243 \t131 \t46 \t41 \n",
"interposed_vehicle_performance_fee_cost \t0.6316 \t1.0000 \t0.9393 \t0.7742 \t49 \t48 \t385 \t28 \t0 \n",
"administration_fees \t0.9767 \t0.9655 \t0.9892 \t0.9711 \t87 \t84 \t372 \t2 \t3 \n",
"total_annual_dollar_based_charges \t0.8350 \t1.0000 \t0.9631 \t0.9101 \t87 \t86 \t358 \t17 \t0 \n",
"buy_spread \t0.9059 \t0.9258 \t0.8655 \t0.9158 \t391 \t337 \t62 \t35 \t27 \n",
"sell_spread \t0.9113 \t0.9262 \t0.8698 \t0.9187 \t391 \t339 \t62 \t33 \t27 \n",
"minimum_initial_investment \t0.9463 \t0.9814 \t0.9479 \t0.9635 \t329 \t317 \t120 \t18 \t6 \n",
"benchmark_name \t0.7444 \t0.8701 \t0.8568 \t0.8024 \t172 \t134 \t261 \t46 \t20 \n",
"TOTAL \t0.8587 \t0.9428 \t0.8961 \t0.8951 \t2723 \t2376 \t1755 \t315 \t164 \n",
"Total Funds Matched - 461\n",
"Total Funds Not Matched - 125\n",
"Percentage of Funds Matched - 78.66894197952219\n",
"All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.8960 \t0.9451 \t0.8516 \t0.9199 \t180 \t155 \t0 \t18 \t9 \n",
"management_fee \t0.9017 \t0.9455 \t0.8571 \t0.9231 \t180 \t156 \t0 \t17 \t9 \n",
"performance_fee_costs \t0.8000 \t0.8261 \t0.8077 \t0.8128 \t94 \t76 \t71 \t19 \t16 \n",
"interposed_vehicle_performance_fee_cost \t0.5273 \t1.0000 \t0.8571 \t0.6905 \t30 \t29 \t127 \t26 \t0 \n",
"administration_fees \t1.0000 \t0.3333 \t0.9890 \t0.5000 \t3 \t1 \t179 \t0 \t2 \n",
"buy_spread \t0.9643 \t0.9419 \t0.9121 \t0.9529 \t176 \t162 \t4 \t6 \t10 \n",
"sell_spread \t0.9702 \t0.9422 \t0.9176 \t0.9560 \t176 \t163 \t4 \t5 \t10 \n",
"minimum_initial_investment \t0.9137 \t0.9549 \t0.9011 \t0.9338 \t139 \t127 \t37 \t12 \t6 \n",
"benchmark_name \t0.7188 \t0.8734 \t0.7967 \t0.7886 \t91 \t69 \t76 \t27 \t10 \n",
"TOTAL \t0.7692 \t0.7762 \t0.8885 \t0.7478 \t1069 \t938 \t679 \t131 \t236 \n",
"Total Funds Matched - 182\n",
"Total Funds Not Matched - 24\n",
"Percentage of Funds Matched - 88.3495145631068\n",
"All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.8872 \t0.9555 \t0.8530 \t0.9201 \t277 \t236 \t2 \t30 \t11 \n",
"management_fee \t0.9060 \t0.9563 \t0.8710 \t0.9305 \t277 \t241 \t2 \t25 \t11 \n",
"performance_fee_costs \t0.8608 \t0.8698 \t0.8136 \t0.8653 \t209 \t167 \t60 \t27 \t25 \n",
"interposed_vehicle_performance_fee_cost \t0.9048 \t1.0000 \t0.9928 \t0.9500 \t19 \t19 \t258 \t2 \t0 \n",
"administration_fees \t0.9765 \t0.9881 \t0.9892 \t0.9822 \t84 \t83 \t193 \t2 \t1 \n",
"total_annual_dollar_based_charges \t0.8431 \t1.0000 \t0.9427 \t0.9149 \t87 \t86 \t177 \t16 \t0 \n",
"buy_spread \t0.8578 \t0.9115 \t0.8351 \t0.8838 \t215 \t175 \t58 \t29 \t17 \n",
"sell_spread \t0.8627 \t0.9119 \t0.8387 \t0.8866 \t215 \t176 \t58 \t28 \t17 \n",
"minimum_initial_investment \t0.9694 \t1.0000 \t0.9785 \t0.9845 \t190 \t190 \t83 \t6 \t0 \n",
"benchmark_name \t0.7738 \t0.8667 \t0.8961 \t0.8176 \t81 \t65 \t185 \t19 \t10 \n",
"TOTAL \t0.8842 \t0.9460 \t0.9011 \t0.9136 \t1654 \t1438 \t1076 \t184 \t328 \n",
"Total Funds Matched - 279\n",
"Total Funds Not Matched - 101\n",
"Percentage of Funds Matched - 73.42105263157895\n"
]
}
],
"source": [
"import openpyxl\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import statistics\n",
"import os\n",
"import re\n",
"from utils.similarity import Similarity\n",
"\n",
"funds_matched = 0\n",
"funds_not_matched = 0\n",
"\n",
"def load_excel(filepath, header_row_index):\n",
"    \"\"\"Load an Excel file and use the specified row as the header.\"\"\"\n",
"    wb = openpyxl.load_workbook(filepath, data_only=True)\n",
"    sheet = wb.active\n",
"    headers = []\n",
"    data = []\n",
"\n",
"    for index, row in enumerate(sheet.iter_rows(values_only=True)):\n",
"        if index == header_row_index:\n",
"            headers = [cell if cell is not None else \"\" for cell in row]\n",
"        elif index > header_row_index:\n",
"            data.append([cell if cell is not None else \"\" for cell in row])\n",
"\n",
"    return headers, data\n",
"\n",
"def index_data_by_key(data, key_index, secondary_key_index, header):\n",
"    \"\"\"Index data by primary and secondary keys (doc_id and sec_name).\"\"\"\n",
"    indexed_data = defaultdict(dict)\n",
"\n",
"    for row in data:\n",
"        row_data = {}\n",
"        # Store the entire row, which will be useful for full row comparison\n",
"        for i in range(len(row)):\n",
"            if header[i] == \"doc_id\":\n",
"                primary_key = int(row[i])\n",
"            elif header[i] == \"sec_name\":\n",
"                # The share class is the comparison level and key\n",
"                secondary_key = str(row[i])\n",
"            else:\n",
"                row_data[header[i]] = convert_if_number(row[i])\n",
"        indexed_data[primary_key][secondary_key] = row_data\n",
"    return indexed_data\n",
"\n",
"def convert_if_number(value):\n",
"    \"\"\"Attempt to convert value to a float or int, otherwise return as string.\"\"\"\n",
"    try:\n",
"        float_value = round(float(value), 2)\n",
"        int_value = int(float_value)\n",
"        return int_value if int_value == float_value else float_value\n",
"    except (ValueError, TypeError):\n",
"        return value\n",
"\n",
"def compare_values(value1, value2):\n",
"    \"\"\"Convert values to numbers if possible and compare, otherwise compare as strings.\"\"\"\n",
"    value1 = convert_if_number(value1)\n",
"    value2 = convert_if_number(value2)\n",
"    return value1 == value2\n",
"\n",
"def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):\n",
"    \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n",
"    results = {}\n",
"    funds_matched, funds_not_matched = 0, 0\n",
"    # Initialize result dictionaries for each column except 'doc_id'\n",
"    for keys in headers:\n",
"        if keys != \"doc_id\":\n",
"            results[keys] = {\"TP\": 0, \"TN\": 0, \"FP\": 0, \"FN\": 0, \"SUPPORT\": 0}\n",
"\n",
"    total = 0\n",
"    message_list = []\n",
"    for doc_id, secs in ground_truth.items():\n",
"        # If a document filter is given, skip documents outside it\n",
"        if document_list is not None and str(doc_id) not in document_list:\n",
"            continue\n",
"        if doc_id in generated_results:\n",
"            for sec_name, truth_values in secs.items():\n",
"                if sec_name in generated_results[doc_id]:\n",
"                    generated_values = generated_results[doc_id][sec_name]\n",
"                    # Compare every shared column that is one of the important datapoints\n",
"                    for i in intersection_list:\n",
"                        for keys in imp_datapoints:\n",
"                            if i == imp_datapoints_mapping[keys]:\n",
"                                total += 1\n",
"                                if truth_values[i] == \"\":\n",
"                                    if truth_values[i] == generated_values[i]:\n",
"                                        results[i][\"TN\"] += 1\n",
"                                    else:\n",
"                                        results[i][\"FP\"] += 1\n",
"                                        message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is null and generated is not null\"}\n",
"                                        message_list.append(message)\n",
"                                else:\n",
"                                    if truth_values[i] == generated_values[i]:\n",
"                                        results[i][\"TP\"] += 1\n",
"                                    elif generated_values[i] != \"\":\n",
"                                        # Benchmark names are compared fuzzily rather than exactly\n",
"                                        if i == \"benchmark_name\" and compare_text(truth_values[i], generated_values[i]):\n",
"                                            results[i][\"TP\"] += 1\n",
"                                        else:\n",
"                                            results[i][\"FP\"] += 1\n",
"                                            message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is not equal with generated\"}\n",
"                                            message_list.append(message)\n",
"                                    else:\n",
"                                        results[i][\"FN\"] += 1\n",
"                                        message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Generated is null and truth is not null\"}\n",
"                                        message_list.append(message)\n",
"                                    # SUPPORT counts only comparisons with non-empty truth\n",
"                                    results[i][\"SUPPORT\"] += 1\n",
"                    funds_matched += 1\n",
"                else:\n",
"                    funds_not_matched += 1\n",
"        else:\n",
"            # If the entire document is not found, count all funds as not matched\n",
"            funds_not_matched += len(secs)\n",
"    return results, message_list, funds_matched, funds_not_matched\n",
"\n",
"def clean_text(text: str):\n",
"    \"\"\"Replace non-word characters with spaces and collapse runs of whitespace.\"\"\"\n",
"    if text is None or len(text) == 0:\n",
"        return text\n",
"    text = re.sub(r\"\\W\", \" \", text)\n",
"    text = re.sub(r\"\\s+\", \" \", text)\n",
"    return text\n",
"\n",
"def compare_text(source_text, target_text):\n",
"    \"\"\"Fuzzy comparison: substring containment, then token-level Jaccard similarity.\"\"\"\n",
"    source_text = clean_text(source_text)\n",
"    target_text = clean_text(target_text)\n",
"    if source_text == target_text or source_text in target_text or target_text in source_text:\n",
"        return True\n",
"    similarity = Similarity()\n",
"    jaccard_score = similarity.jaccard_similarity(source_text.lower().split(), target_text.lower().split())\n",
"    if jaccard_score > 0.8:\n",
"        return True\n",
"    return False\n",
"\n",
"# Load the files\n",
"headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
"headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
"\n",
"# Assuming doc_id is the first column and sec_name is the second column\n",
"doc_id_index = 0\n",
"fund_name_index = 1\n",
"\n",
"# Index the data\n",
"ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
"generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
"\n",
"# Score only the columns present in both files\n",
"intersection = set(headers_gen).intersection(headers_gt)\n",
"\n",
"# Convert the result back to a list (if you need it as a list)\n",
"intersection_list = list(intersection)\n",
"\n",
"def calculate_metrics(tp, tn, fp, fn):\n",
"    \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n",
"    precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n",
"    recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n",
"    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n",
"    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n",
"    return precision, recall, accuracy, f1_score\n",
"\n",
"def print_metrics_table(data):\n",
"    # Print table headers\n",
"    print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
"    total_precision, total_recall, total_accuracy, total_f1_score, total_support = [], [], [], [], []\n",
"\n",
"    total_tp = []\n",
"    total_tn = []\n",
"    total_fp = []\n",
"    total_fn = []\n",
"    # Calculate and print metrics for each item\n",
"    metrics_list = []\n",
"    for keys in imp_datapoints:\n",
"        try:\n",
"            key = imp_datapoints_mapping[keys]\n",
"            values = data[key]\n",
"            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
"            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
"            metrics = {\"Datapoint\": keys, \"F1-Score\": f1_score, \"Precision\": precision, \"Recall\": recall, \"Accuracy\": accuracy, \"SUPPORT\": values[\"SUPPORT\"], \"TP\": tp, \"TN\": tn, \"FP\": fp, \"FN\": fn}\n",
"            metrics_list.append(metrics)\n",
"            total_precision.append(precision)\n",
"            total_recall.append(recall)\n",
"            total_accuracy.append(accuracy)\n",
"            total_f1_score.append(f1_score)\n",
"            total_support.append(values[\"SUPPORT\"])\n",
"            total_tp.append(tp)\n",
"            total_tn.append(tn)\n",
"            total_fp.append(fp)\n",
"            total_fn.append(fn)\n",
"\n",
"            if values[\"SUPPORT\"] > 0 and key != \"\":\n",
"                print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(key, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
"        except KeyError:\n",
"            # Datapoint missing from the results; skip it\n",
"            pass\n",
"    total_mean_precision = statistics.mean(total_precision)\n",
"    total_mean_recall = statistics.mean(total_recall)\n",
"    total_mean_accuracy = statistics.mean(total_accuracy)\n",
"    total_mean_f1_score = statistics.mean(total_f1_score)\n",
"    total_sum_support = sum(total_support)\n",
"    total_sum_tp = sum(total_tp)\n",
"    total_sum_tn = sum(total_tn)\n",
"    total_sum_fp = sum(total_fp)\n",
"    total_sum_fn = sum(total_fn)\n",
"    total_metrics = {\"Datapoint\": \"TOTAL\", \"F1-Score\": total_mean_f1_score, \"Precision\": total_mean_precision, \"Recall\": total_mean_recall, \"Accuracy\": total_mean_accuracy, \"SUPPORT\": total_sum_support, \"TP\": total_sum_tp, \"TN\": total_sum_tn, \"FP\": total_sum_fp, \"FN\": total_sum_fn}\n",
"    metrics_list.append(total_metrics)\n",
"    print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", total_mean_precision, total_mean_recall, total_mean_accuracy, total_mean_f1_score, total_sum_support, total_sum_tp, total_sum_tn, total_sum_fp, total_sum_fn))\n",
"    return metrics_list\n",
"\n",
"def create_metrics_df(data):\n",
"    # Define a list to hold data for DataFrame\n",
"    rows = []\n",
"\n",
"    # Iterate through each metric item\n",
"    for key in imp_datapoints:\n",
"        try:\n",
"            mapped_key = imp_datapoints_mapping[key]\n",
"            values = data[mapped_key]\n",
"            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
"            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
"\n",
"            # Only add rows where SUPPORT > 0\n",
"            if values[\"SUPPORT\"] > 0:\n",
"                row = {\n",
"                    \"Metric\": key,\n",
"                    \"Precision\": precision,\n",
"                    \"Recall\": recall,\n",
"                    \"Accuracy\": accuracy,\n",
"                    \"F1-Score\": f1_score,\n",
"                    \"SUPPORT\": values[\"SUPPORT\"]\n",
"                }\n",
"                rows.append(row)\n",
"        except KeyError:\n",
"            continue\n",
"\n",
"    # Create a DataFrame from the list of rows\n",
"    df_metrics = pd.DataFrame(rows)\n",
"    df_metrics.reset_index(drop=True, inplace=True)\n",
"    print(df_metrics)\n",
"    return df_metrics\n",
"\n",
"\n",
"def get_provider_mapping(file_path):\n",
"    \"\"\"Return one (Docid, ProviderName) row per document.\"\"\"\n",
"    df = pd.read_excel(file_path)\n",
"    df = df.groupby([\"Docid\", \"ProviderName\"]).first()\n",
"    df.reset_index(inplace=True)\n",
"    return df[[\"Docid\", \"ProviderName\"]]\n",
"\n",
"\n",
"def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
"    \"\"\"Group the generated doc_ids by provider name.\"\"\"\n",
"    providers_dict = {}\n",
"    for doc_id in generated_results_indexed:\n",
"        try:\n",
"            provider_name = df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values[0]\n",
"            if provider_name in providers_dict:\n",
"                providers_dict[provider_name].append(doc_id)\n",
"            else:\n",
"                providers_dict[provider_name] = [doc_id]\n",
"        except IndexError:\n",
"            # doc_id has no entry in the provider mapping; skip it\n",
"            pass\n",
"    return providers_dict\n",
"\n",
"def get_specified_doc_data(results, doc_list):\n",
"    \"\"\"Keep only the documents listed in doc_list.\"\"\"\n",
"    provider_res = {}\n",
"    for doc_id in doc_list:\n",
"        if doc_id in results:\n",
"            provider_res[doc_id] = results[doc_id]\n",
"    return provider_res\n",
"\n",
"\n",
"df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
"\n",
"all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
"\n",
"\n",
"# Per-provider breakdown (optional):\n",
"# for provider_name in all_provider_dict:\n",
"#     provider_wise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
"#     comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_wise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, None)\n",
"#     print(\"\\n\")\n",
"#     print(\"\\n\")\n",
"#     print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
"#     # create_metrics_df(comparison_results)\n",
"#     print_metrics_table(comparison_results)\n",
"#     print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"#     print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
"\n",
"\n",
"print(\"\\n\")\n",
"print(\"\\n\")\n",
"document_list_file_list = [None,\n",
"                           \"./sample_documents/aus_prospectus_29_documents_sample.txt\",\n",
"                           \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n",
"for document_list_file in document_list_file_list:\n",
"    document_list = None\n",
"    if document_list_file is not None:\n",
"        with open(document_list_file, \"r\", encoding=\"utf-8\") as f:\n",
"            document_list = f.readlines()\n",
"        document_list = [doc_id.strip() for doc_id in document_list]\n",
"\n",
"    print(\"All Providers Results: \")\n",
"    print(\"Document List File - \", document_list_file)\n",
"    comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed,\n",
"                                                                                     generated_results_indexed,\n",
"                                                                                     headers_gt, doc_id_index,\n",
"                                                                                     fund_name_index,\n",
"                                                                                     intersection_list,\n",
"                                                                                     funds_matched,\n",
"                                                                                     funds_not_matched,\n",
"                                                                                     document_list)\n",
"    metrics_list = print_metrics_table(comparison_results)\n",
"    print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"    print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
"\n",
"    # Persist the per-datapoint metrics and the per-field mismatch messages\n",
"    metrics_df = pd.DataFrame(metrics_list)\n",
"    message_df = pd.DataFrame(message_list)\n",
"\n",
"    output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n",
"    if os.path.exists(output_metrics_folder):\n",
"        generated_file_base_name = os.path.basename(path_generated_results).replace(\".xlsx\", \"\")\n",
"        metrics_file_name = f\"metrics_{generated_file_base_name}\"\n",
"        if document_list_file is not None:\n",
"            metrics_file_name = f\"{metrics_file_name}_{len(document_list)}_documents.xlsx\"\n",
"        else:\n",
"            metrics_file_name = f\"{metrics_file_name}_all_documents.xlsx\"\n",
"        metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)\n",
"        with pd.ExcelWriter(metrics_file_path) as writer:\n",
"            metrics_df.to_excel(writer, sheet_name=\"metrics_data\", index=False)\n",
"            message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n"
]
},
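{
"cell_type": "markdown",
"metadata": {},
"source": [
"For benchmark names, `compare_text` falls back to a token-level Jaccard similarity from `utils.similarity.Similarity` when neither cleaned string contains the other. Below is a self-contained sketch of that comparison, assuming `jaccard_similarity` is the standard intersection-over-union of the two token sets (the benchmark strings are made-up examples):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Standalone sketch; assumes Similarity.jaccard_similarity is plain set\n",
"# intersection-over-union. This is not the utils.similarity implementation itself.\n",
"def jaccard_similarity_sketch(tokens_a, tokens_b):\n",
"    set_a, set_b = set(tokens_a), set(tokens_b)\n",
"    if not set_a and not set_b:\n",
"        return 1.0\n",
"    return len(set_a & set_b) / len(set_a | set_b)\n",
"\n",
"a = \"msci world ex australia index net\".split()\n",
"b = \"msci world ex australia index gross\".split()\n",
"print(round(jaccard_similarity_sketch(a, b), 4))  # 5 shared / 7 total = 0.7143, below the 0.8 threshold"
]
}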
],
"metadata": {
"kernelspec": {
"display_name": "blade",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}