{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openpyxl\n",
    "from collections import defaultdict\n",
    "import pandas as pd\n",
    "import statistics\n",
    "import os\n",
    "import re\n",
    "from utils.similarity import Similarity\n",
    "\n",
    "\n",
    "# Display label -> column name. Datapoints are matched on the column name\n",
    "# (a header shared by the ground-truth and generated workbooks) and are\n",
    "# reported in the insertion order of this dict.\n",
    "imp_datapoints_mapping = {\n",
    "    \"Management Fee and Costs\": \"management_fee_and_costs\",\n",
    "    \"Management Fee\": \"management_fee\",\n",
    "    \"Performance fee and cost\": \"performance_fee_costs\",\n",
    "    \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n",
    "    \"Administration Fee and costs\": \"administration_fees\",\n",
    "    \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n",
    "    \"Buy Spread\": \"buy_spread\",\n",
    "    \"Sell Spread\": \"sell_spread\",\n",
    "    \"Performance Fee\": \"PerformanceFeeCharged\",\n",
    "    \"Minimum Initial Investment\": \"minimum_initial_investment\",\n",
    "    \"Benchmark\": \"benchmark_name\",\n",
    "}\n",
    "\n",
    "# Labels in reporting order, derived from the mapping so the two cannot drift apart.\n",
    "imp_datapoints = list(imp_datapoints_mapping)\n",
    "\n",
    "# Input workbooks: the ground truth and the generated results under evaluation.\n",
    "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
    "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n",
    "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318203253_new.xlsx\"\n",
    "# Only referenced by commented-out code in the evaluation cell.\n",
    "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Notebook-level lists. The evaluation cell rebinds both names: `message_list`\n",
    "# from the value returned by compare_data(), and `total_fn` to a fresh list\n",
    "# before its loop over the document lists.\n",
    "message_list = []\n",
    "total_fn = []\n",
    "def load_excel(filepath, header_row_index):\n",
    "    \"\"\"Read the active worksheet of a workbook into a header list and data rows.\n",
    "\n",
    "    Args:\n",
    "        filepath: Path of the Excel workbook (opened with ``data_only=True``).\n",
    "        header_row_index: Zero-based index of the row holding the column names.\n",
    "\n",
    "    Returns:\n",
    "        ``(headers, data)``. ``headers`` is the header row as a list and ``data``\n",
    "        is a list of lists, one per row below the header. Empty cells (``None``)\n",
    "        become ``\"\"`` in both. Rows above the header are dropped, and ``headers``\n",
    "        stays ``[]`` when the sheet has no row at ``header_row_index``.\n",
    "    \"\"\"\n",
    "    def blank_if_none(cells):\n",
    "        return [\"\" if cell is None else cell for cell in cells]\n",
    "\n",
    "    workbook = openpyxl.load_workbook(filepath, data_only=True)\n",
    "    headers, data = [], []\n",
    "    for position, cells in enumerate(workbook.active.iter_rows(values_only=True)):\n",
    "        if position < header_row_index:\n",
    "            continue\n",
    "        if position == header_row_index:\n",
    "            headers = blank_if_none(cells)\n",
    "        else:\n",
    "            data.append(blank_if_none(cells))\n",
    "    return headers, data\n",
    "\n",
    "def index_data_by_key(data, key_index, secondary_key_index, header):\n",
    "    \"\"\"Index rows by document id and share-class name.\n",
    "\n",
    "    The key columns are located by header name (``\"doc_id\"`` and ``\"sec_name\"``),\n",
    "    not by position.\n",
    "\n",
    "    Args:\n",
    "        data: Iterable of rows, each a sequence of cell values aligned with ``header``.\n",
    "        key_index: Unused; kept so existing callers keep working.\n",
    "        secondary_key_index: Unused; kept so existing callers keep working.\n",
    "        header: Column names for the cells of every row.\n",
    "\n",
    "    Returns:\n",
    "        A ``defaultdict(dict)`` mapping ``int(doc_id)`` -> ``str(sec_name)`` -> dict of\n",
    "        the remaining columns, each value passed through ``convert_if_number``.\n",
    "        When a (doc_id, sec_name) pair occurs more than once, the last row wins.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a non-blank ``doc_id`` cannot be converted to ``int``.\n",
    "\n",
    "    Rows with a blank ``doc_id`` or a blank ``sec_name`` are skipped. This covers\n",
    "    fully empty rows, whose ``None`` cells ``load_excel`` turns into ``\"\"``.\n",
    "    \"\"\"\n",
    "    indexed_data = defaultdict(dict)\n",
    "\n",
    "    for row in data:\n",
    "        # Reset per row so a row can never inherit a key from the previous one.\n",
    "        primary_key = None\n",
    "        secondary_key = None\n",
    "        row_data = {}\n",
    "        for column, cell in zip(header, row):\n",
    "            if column == \"doc_id\":\n",
    "                primary_key = cell\n",
    "            elif column == \"sec_name\":\n",
    "                # share class should be the comparison level and key\n",
    "                secondary_key = cell\n",
    "            else:\n",
    "                row_data[column] = convert_if_number(cell)\n",
    "        if primary_key is None or str(primary_key).strip() == \"\":\n",
    "            continue\n",
    "        if secondary_key is None or len(str(secondary_key)) == 0:\n",
    "            continue\n",
    "        indexed_data[int(primary_key)][str(secondary_key)] = row_data\n",
    "    return indexed_data\n",
    "\n",
    "def convert_if_number(value):\n",
    "    \"\"\"Normalise a cell value to a number rounded to 2 decimals when possible.\n",
    "\n",
    "    Args:\n",
    "        value: Any cell value (number, string, ``None``, ...).\n",
    "\n",
    "    Returns:\n",
    "        An ``int`` when the value rounded to 2 decimals is a whole number, a\n",
    "        ``float`` rounded to 2 decimals otherwise, or ``value`` unchanged when\n",
    "        it is not a finite number (non-numeric text, ``None``, NaN, infinity).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        rounded = round(float(value), 2)\n",
    "        truncated = int(rounded)\n",
    "    except (ValueError, TypeError, OverflowError):\n",
    "        # ValueError: non-numeric text or NaN; TypeError: None / unsupported type;\n",
    "        # OverflowError: int() of +/- infinity.\n",
    "        return value\n",
    "    return truncated if truncated == rounded else rounded\n",
    "\n",
    "def compare_values(value1, value2):\n",
    "    \"\"\"Return True when both values are equal after ``convert_if_number`` normalisation.\n",
    "\n",
    "    Numeric-looking values are compared as numbers rounded to 2 decimals;\n",
    "    anything else is compared unchanged.\n",
    "    \"\"\"\n",
    "    return convert_if_number(value1) == convert_if_number(value2)\n",
    "\n",
    "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):\n",
    "    \"\"\"Score generated values against the ground truth, datapoint by datapoint.\n",
    "\n",
    "    Iteration is driven by the ground truth: every (doc_id, sec_name) pair in\n",
    "    ``ground_truth`` is looked up in ``generated_results``. A column is compared\n",
    "    when it is in ``intersection_list`` and is one of the column names in\n",
    "    ``imp_datapoints_mapping``. Values are compared as stripped strings:\n",
    "\n",
    "    * truth blank, generated blank          -> TN\n",
    "    * truth blank, generated not blank      -> FP\n",
    "    * truth equals generated                -> TP\n",
    "    * both non-blank but different          -> FP, except ``benchmark_name``, which\n",
    "      counts as TP when ``compare_text`` accepts the pair\n",
    "    * truth not blank, generated blank      -> FN\n",
    "\n",
    "    SUPPORT counts the cases where the truth is not blank.\n",
    "\n",
    "    Args:\n",
    "        ground_truth: ``{doc_id: {sec_name: {column: value}}}`` for the ground truth.\n",
    "        generated_results: Same structure for the generated results.\n",
    "        headers: Column names; every one except ``\"doc_id\"`` gets a counter dict.\n",
    "        doc_id_index: Unused; kept so existing callers keep working.\n",
    "        fund_name_index: Unused; kept so existing callers keep working.\n",
    "        intersection_list: Columns present in both workbooks.\n",
    "        funds_matched: Ignored; counting always starts from 0.\n",
    "        funds_not_matched: Ignored; counting always starts from 0.\n",
    "        document_list: Optional collection of doc ids (as strings) to restrict the\n",
    "            comparison to; ``None`` means all documents.\n",
    "\n",
    "    Returns:\n",
    "        ``(results, messages, funds_matched, funds_not_matched)`` where ``results``\n",
    "        maps column -> ``{\"TP\", \"TN\", \"FP\", \"FN\", \"SUPPORT\"}`` counters and\n",
    "        ``messages`` is a list of dicts describing every FP / FN found by THIS call.\n",
    "    \"\"\"\n",
    "    results = {\n",
    "        column: {\"TP\": 0, \"TN\": 0, \"FP\": 0, \"FN\": 0, \"SUPPORT\": 0}\n",
    "        for column in headers\n",
    "        if column != \"doc_id\"\n",
    "    }\n",
    "    # A fresh list per call: appending to a shared module-level list made every\n",
    "    # later call (and every exported message sheet) include earlier runs' errors.\n",
    "    messages = []\n",
    "    funds_matched, funds_not_matched = 0, 0\n",
    "\n",
    "    tracked_columns = {imp_datapoints_mapping[label] for label in imp_datapoints}\n",
    "    columns_to_compare = [column for column in intersection_list if column in tracked_columns]\n",
    "    allowed_docs = None if document_list is None else set(document_list)\n",
    "\n",
    "    def record(column, doc_id, sec_name, truth, generated, error):\n",
    "        messages.append({\"data_point\": column, \"doc_id\": doc_id, \"sec_name\": sec_name,\n",
    "                         \"truth\": truth, \"generated\": generated, \"error\": error})\n",
    "\n",
    "    for doc_id, secs in ground_truth.items():\n",
    "        if allowed_docs is not None and str(doc_id) not in allowed_docs:\n",
    "            continue\n",
    "        if doc_id not in generated_results:\n",
    "            # If the entire document is not found, count all funds as not matched\n",
    "            funds_not_matched += len(secs)\n",
    "            continue\n",
    "        generated_doc = generated_results[doc_id]\n",
    "        for sec_name, truth_values in secs.items():\n",
    "            if sec_name not in generated_doc:\n",
    "                funds_not_matched += 1\n",
    "                continue\n",
    "            funds_matched += 1\n",
    "            generated_values = generated_doc[sec_name]\n",
    "            for column in columns_to_compare:\n",
    "                truth = str(truth_values[column]).strip()\n",
    "                generated = str(generated_values[column]).strip()\n",
    "                counts = results[column]\n",
    "                if truth == \"\":\n",
    "                    if generated == \"\":\n",
    "                        counts[\"TN\"] += 1\n",
    "                    else:\n",
    "                        counts[\"FP\"] += 1\n",
    "                        record(column, doc_id, sec_name, truth, generated,\n",
    "                               \"Truth is null and generated is not null\")\n",
    "                    continue\n",
    "                counts[\"SUPPORT\"] += 1\n",
    "                if truth == generated:\n",
    "                    counts[\"TP\"] += 1\n",
    "                elif generated == \"\":\n",
    "                    counts[\"FN\"] += 1\n",
    "                    record(column, doc_id, sec_name, truth, generated,\n",
    "                           \"Generated is null and truth is not null\")\n",
    "                elif column == \"benchmark_name\" and compare_text(truth, generated):\n",
    "                    # Benchmark names are free text, so a fuzzy match is accepted.\n",
    "                    counts[\"TP\"] += 1\n",
    "                else:\n",
    "                    counts[\"FP\"] += 1\n",
    "                    record(column, doc_id, sec_name, truth, generated,\n",
    "                           \"Truth is not equal with generated\")\n",
    "    return results, messages, funds_matched, funds_not_matched\n",
    "\n",
    "def clean_text(text: str):\n",
    "    \"\"\"Replace every non-word character with a space and collapse whitespace runs.\n",
    "\n",
    "    ``None`` and empty input are returned unchanged. Leading and trailing\n",
    "    whitespace is collapsed to a single space, not removed.\n",
    "    \"\"\"\n",
    "    if text is not None and len(text) > 0:\n",
    "        return re.sub(r\"\\s+\", \" \", re.sub(r\"\\W\", \" \", text))\n",
    "    return text\n",
    "\n",
    "def compare_text(source_text, target_text):\n",
    "    \"\"\"Fuzzy-compare two free-text values; always returns a bool.\n",
    "\n",
    "    Both texts are passed through ``clean_text`` and stripped. They match when\n",
    "    they are equal, when one contains the other, or when the Jaccard similarity\n",
    "    of their lower-cased tokens (``Similarity.jaccard_similarity``) is above 0.8.\n",
    "\n",
    "    A text with no word characters left after cleaning (e.g. ``\"-\"``) only\n",
    "    matches another such text. Without this guard the single space it cleans\n",
    "    to is a substring of any multi-word text and matches everything.\n",
    "    \"\"\"\n",
    "    source_text = clean_text(source_text)\n",
    "    target_text = clean_text(target_text)\n",
    "    if source_text is None or target_text is None:\n",
    "        # clean_text passes None through; only None == None counts as equal.\n",
    "        return source_text is None and target_text is None\n",
    "    # Drop the edge whitespace left behind by leading/trailing punctuation.\n",
    "    source_text = source_text.strip()\n",
    "    target_text = target_text.strip()\n",
    "    if source_text == target_text:\n",
    "        return True\n",
    "    if not source_text or not target_text:\n",
    "        return False\n",
    "    if source_text in target_text or target_text in source_text:\n",
    "        return True\n",
    "    similarity = Similarity()\n",
    "    jaccard_score = similarity.jaccard_similarity(source_text.lower().split(), target_text.lower().split())\n",
    "    return jaccard_score > 0.8\n",
    "    \n",
    "    \n",
    "def calculate_metrics(tp, tn, fp, fn):\n",
    "    \"\"\"Derive precision, recall, accuracy and F1 from confusion-matrix counts.\n",
    "\n",
    "    Returns:\n",
    "        ``(precision, recall, accuracy, f1_score)``. Any ratio whose denominator\n",
    "        is zero is reported as ``0``.\n",
    "    \"\"\"\n",
    "    def safe_ratio(numerator, denominator):\n",
    "        return numerator / denominator if denominator != 0 else 0\n",
    "\n",
    "    precision = safe_ratio(tp, tp + fp)\n",
    "    recall = safe_ratio(tp, tp + fn)\n",
    "    accuracy = safe_ratio(tp + tn, tp + tn + fp + fn)\n",
    "    f1_score = safe_ratio(2 * (precision * recall), precision + recall)\n",
    "    return precision, recall, accuracy, f1_score\n",
    "\n",
    "def print_metrics_table(data):\n",
    "    \"\"\"Print per-datapoint metrics plus a TOTAL row, and return them as dicts.\n",
    "\n",
    "    Args:\n",
    "        data: ``{column: {\"TP\", \"TN\", \"FP\", \"FN\", \"SUPPORT\"}}`` as produced by\n",
    "            ``compare_data``.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dict per reported datapoint (keys ``Datapoint``,\n",
    "        ``F1-Score``, ``Precision``, ``Recall``, ``Accuracy``, ``SUPPORT``, ``TP``,\n",
    "        ``TN``, ``FP``, ``FN``) followed by a ``TOTAL`` dict. In the TOTAL row the\n",
    "        four scores are unweighted means over the reported datapoints and the\n",
    "        counts are sums; all are 0 when no datapoint is reported.\n",
    "\n",
    "    Datapoints missing from ``data`` or with ``SUPPORT == 0`` are left out. All\n",
    "    totals are accumulated locally, so repeated calls do not influence each\n",
    "    other (the FN total used to accumulate in a module-level list).\n",
    "    \"\"\"\n",
    "    header_format = \"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\"\n",
    "    row_format = \"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\"\n",
    "    score_fields = (\"F1-Score\", \"Precision\", \"Recall\", \"Accuracy\")\n",
    "    count_fields = (\"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\")\n",
    "\n",
    "    # Print table headers\n",
    "    print(header_format.format(\"Metric\", \"F1-Score\", \"Precision\", \"Recall\", \"Accuracy\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
    "\n",
    "    # Calculate and print metrics for each item\n",
    "    metrics_list = []\n",
    "    for label in imp_datapoints:\n",
    "        try:\n",
    "            key = imp_datapoints_mapping[label]\n",
    "            values = data[key]\n",
    "            support = values[\"SUPPORT\"]\n",
    "            tp, tn, fp, fn = values[\"TP\"], values[\"TN\"], values[\"FP\"], values[\"FN\"]\n",
    "        except KeyError:\n",
    "            # The datapoint is not among the compared columns; leave it out.\n",
    "            continue\n",
    "        if support == 0:\n",
    "            continue\n",
    "        precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
    "        metrics_list.append({\"Datapoint\": key, \"F1-Score\": f1_score, \"Precision\": precision, \"Recall\": recall,\n",
    "                             \"Accuracy\": accuracy, \"SUPPORT\": support, \"TP\": tp, \"TN\": tn, \"FP\": fp, \"FN\": fn})\n",
    "        if support > 0 and key > \"\":\n",
    "            print(row_format.format(key, f1_score, precision, recall, accuracy, support, tp, tn, fp, fn))\n",
    "\n",
    "    total_metrics = {\"Datapoint\": \"TOTAL\"}\n",
    "    for field in score_fields:\n",
    "        scores = [row[field] for row in metrics_list]\n",
    "        # statistics.mean raises StatisticsError on an empty list.\n",
    "        total_metrics[field] = statistics.mean(scores) if scores else 0\n",
    "    for field in count_fields:\n",
    "        total_metrics[field] = sum(row[field] for row in metrics_list)\n",
    "    metrics_list.append(total_metrics)\n",
    "    print(row_format.format(\"TOTAL\", *(total_metrics[field] for field in score_fields + count_fields)))\n",
    "    return metrics_list\n",
    "    \n",
    "def create_metrics_df(data):\n",
    "    \"\"\"Build, print and return a DataFrame of metrics for datapoints with SUPPORT > 0.\n",
    "\n",
    "    Args:\n",
    "        data: ``{column: {\"TP\", \"TN\", \"FP\", \"FN\", \"SUPPORT\"}}`` counters.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns ``Metric`` (the display label), ``Precision``,\n",
    "        ``Recall``, ``Accuracy``, ``F1-Score`` and ``SUPPORT``. Datapoints that are\n",
    "        missing from ``data`` are skipped.\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    for label in imp_datapoints:\n",
    "        try:\n",
    "            counts = data[imp_datapoints_mapping[label]]\n",
    "            precision, recall, accuracy, f1_score = calculate_metrics(\n",
    "                counts['TP'], counts['TN'], counts['FP'], counts['FN'])\n",
    "            support = counts[\"SUPPORT\"]\n",
    "        except KeyError:\n",
    "            continue\n",
    "        # Only add rows where SUPPORT > 0\n",
    "        if support > 0:\n",
    "            rows.append({\n",
    "                \"Metric\": label,\n",
    "                \"Precision\": precision,\n",
    "                \"Recall\": recall,\n",
    "                \"Accuracy\": accuracy,\n",
    "                \"F1-Score\": f1_score,\n",
    "                \"SUPPORT\": support,\n",
    "            })\n",
    "\n",
    "    # Same two steps as before (materialise the index, then drop it), chained\n",
    "    # instead of mutating in place.\n",
    "    df_metrics = pd.DataFrame(rows).reset_index().drop(columns=[\"index\"])\n",
    "    print(df_metrics)\n",
    "    return df_metrics\n",
    "\n",
    "\n",
    "\n",
    "def get_provider_mapping(file_path):\n",
    "    \"\"\"Read an Excel file and return its distinct (Docid, ProviderName) pairs.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with exactly the columns ``Docid`` and ``ProviderName``, one row\n",
    "        per distinct pair, as produced by grouping on both columns.\n",
    "    \"\"\"\n",
    "    key_columns = [\"Docid\", \"ProviderName\"]\n",
    "    provider_rows = pd.read_excel(file_path)\n",
    "    unique_pairs = provider_rows.groupby(key_columns).first().reset_index()\n",
    "    return unique_pairs[key_columns]\n",
    "\n",
    "\n",
    "def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
    "    \"\"\"Group document ids by provider name.\n",
    "\n",
    "    Args:\n",
    "        generated_results_indexed: Mapping (or iterable) whose keys are doc ids.\n",
    "        df_provider_mapping: DataFrame with ``Docid`` and ``ProviderName`` columns.\n",
    "\n",
    "    Returns:\n",
    "        ``{provider_name: [doc_id, ...]}`` in iteration order. When a doc id has\n",
    "        several mapping rows the first one is used; doc ids with no mapping row\n",
    "        are skipped.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If ``df_provider_mapping`` lacks one of the two columns. The\n",
    "            previous bare ``except`` hid this and returned an empty dict.\n",
    "    \"\"\"\n",
    "    providers_dict = {}\n",
    "    for doc_id in generated_results_indexed:\n",
    "        matches = df_provider_mapping.loc[df_provider_mapping[\"Docid\"] == doc_id, \"ProviderName\"]\n",
    "        if matches.empty:\n",
    "            # No provider known for this document.\n",
    "            continue\n",
    "        providers_dict.setdefault(matches.iloc[0], []).append(doc_id)\n",
    "    return providers_dict\n",
    "\n",
    "def get_specified_doc_data(results, doc_list):\n",
    "    \"\"\"Return the subset of ``results`` whose keys appear in ``doc_list``.\n",
    "\n",
    "    Keys follow the order of ``doc_list``; ids absent from ``results`` are ignored.\n",
    "    \"\"\"\n",
    "    return {doc_id: results[doc_id] for doc_id in doc_list if doc_id in results}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "\n",
      "All Providers Results: \n",
      "Document List File -  None\n",
      "Metric                                            \tF1-Score  \tPrecision \tRecall    \tAccuracy  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
      "management_fee_and_costs                          \t0.9354    \t0.8870    \t0.9893    \t0.8786    \t419       \t369       \t0         \t47        \t4         \n",
      "management_fee                                    \t0.9591    \t0.9303    \t0.9898    \t0.9214    \t419       \t387       \t0         \t29        \t4         \n",
      "performance_fee_costs                             \t0.9261    \t0.8955    \t0.9590    \t0.9024    \t285       \t257       \t122       \t30        \t11        \n",
      "interposed_vehicle_performance_fee_cost           \t0.9863    \t0.9730    \t1.0000    \t0.9952    \t73        \t72        \t346       \t2         \t0         \n",
      "administration_fees                               \t0.9940    \t0.9881    \t1.0000    \t0.9976    \t83        \t83        \t336       \t1         \t0         \n",
      "total_annual_dollar_based_charges                 \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t70        \t70        \t350       \t0         \t0         \n",
      "buy_spread                                        \t0.9486    \t0.9171    \t0.9822    \t0.9143    \t362       \t332       \t52        \t30        \t6         \n",
      "sell_spread                                       \t0.9516    \t0.9227    \t0.9824    \t0.9190    \t362       \t334       \t52        \t28        \t6         \n",
      "minimum_initial_investment                        \t0.9544    \t0.9638    \t0.9452    \t0.9333    \t310       \t293       \t99        \t11        \t17        \n",
      "benchmark_name                                    \t0.8971    \t0.8652    \t0.9313    \t0.9333    \t142       \t122       \t270       \t19        \t9         \n",
      "TOTAL                                             \t0.9553    \t0.9343    \t0.9779    \t0.9395    \t2525      \t2319      \t1627      \t197       \t57        \n",
      "Total Funds Matched - 420\n",
      "Total Funds Not Matched - 145\n",
      "Percentage of Funds Matched - 74.33628318584071\n",
      "All Providers Results: \n",
      "Document List File -  ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
      "Metric                                            \tF1-Score  \tPrecision \tRecall    \tAccuracy  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
      "management_fee_and_costs                          \t0.9408    \t0.8883    \t1.0000    \t0.8883    \t178       \t159       \t0         \t20        \t0         \n",
      "management_fee                                    \t0.9742    \t0.9497    \t1.0000    \t0.9497    \t178       \t170       \t0         \t9         \t0         \n",
      "performance_fee_costs                             \t0.9082    \t0.8900    \t0.9271    \t0.8994    \t100       \t89        \t72        \t11        \t7         \n",
      "interposed_vehicle_performance_fee_cost           \t0.9905    \t0.9811    \t1.0000    \t0.9944    \t53        \t52        \t126       \t1         \t0         \n",
      "administration_fees                               \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t15        \t15        \t164       \t0         \t0         \n",
      "buy_spread                                        \t0.9799    \t0.9716    \t0.9884    \t0.9609    \t176       \t171       \t1         \t5         \t2         \n",
      "sell_spread                                       \t0.9829    \t0.9773    \t0.9885    \t0.9665    \t176       \t172       \t1         \t4         \t2         \n",
      "minimum_initial_investment                        \t0.9151    \t0.9538    \t0.8794    \t0.8715    \t141       \t124       \t32        \t6         \t17        \n",
      "benchmark_name                                    \t0.8957    \t0.8488    \t0.9481    \t0.9050    \t85        \t73        \t89        \t13        \t4         \n",
      "TOTAL                                             \t0.9541    \t0.9401    \t0.9702    \t0.9373    \t1102      \t1025      \t485       \t69        \t89        \n",
      "Total Funds Matched - 179\n",
      "Total Funds Not Matched - 17\n",
      "Percentage of Funds Matched - 91.3265306122449\n",
      "All Providers Results: \n",
      "Document List File -  ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
      "Metric                                            \tF1-Score  \tPrecision \tRecall    \tAccuracy  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
      "management_fee_and_costs                          \t0.9313    \t0.8861    \t0.9813    \t0.8714    \t241       \t210       \t0         \t27        \t4         \n",
      "management_fee                                    \t0.9476    \t0.9156    \t0.9819    \t0.9004    \t241       \t217       \t0         \t20        \t4         \n",
      "performance_fee_costs                             \t0.9359    \t0.8984    \t0.9767    \t0.9046    \t185       \t168       \t50        \t19        \t4         \n",
      "interposed_vehicle_performance_fee_cost           \t0.9756    \t0.9524    \t1.0000    \t0.9959    \t20        \t20        \t220       \t1         \t0         \n",
      "administration_fees                               \t0.9927    \t0.9855    \t1.0000    \t0.9959    \t68        \t68        \t172       \t1         \t0         \n",
      "total_annual_dollar_based_charges                 \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t70        \t70        \t171       \t0         \t0         \n",
      "buy_spread                                        \t0.9174    \t0.8656    \t0.9758    \t0.8797    \t186       \t161       \t51        \t25        \t4         \n",
      "sell_spread                                       \t0.9205    \t0.8710    \t0.9759    \t0.8838    \t186       \t162       \t51        \t24        \t4         \n",
      "minimum_initial_investment                        \t0.9854    \t0.9713    \t1.0000    \t0.9793    \t169       \t169       \t67        \t5         \t0         \n",
      "benchmark_name                                    \t0.8991    \t0.8909    \t0.9074    \t0.9544    \t57        \t49        \t181       \t6         \t5         \n",
      "TOTAL                                             \t0.9505    \t0.9237    \t0.9799    \t0.9365    \t1423      \t1294      \t963       \t128       \t114       \n",
      "Total Funds Matched - 241\n",
      "Total Funds Not Matched - 128\n",
      "Percentage of Funds Matched - 65.31165311653116\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\"\"\"\n",
    "Blade's updates\n",
    "1. Use the share class name, instead of the fund name, as the secondary key\n",
    "2. Exclude data points whose support is 0 when calculating the metrics\n",
    "3. Add a message list to store error messages\n",
    "4. Support saving metrics and error messages to an Excel file\n",
    "5. Support statistics for different document lists\n",
    "6. Make F1-Score the first column in the metrics table\n",
    "\"\"\"\n",
    "\n",
    "# Passed to compare_data() below, which restarts both counters from 0 on every call.\n",
    "funds_matched = 0\n",
    "funds_not_matched = 0\n",
    "\n",
    "# Load both workbooks, using the first row (index 0) as the header row.\n",
    "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
    "headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
    "\n",
    "# Positional indices handed to index_data_by_key() / compare_data(). Both functions\n",
    "# locate the key columns by header name (\"doc_id\", \"sec_name\") and do not read these.\n",
    "doc_id_index = 0\n",
    "fund_name_index = 1\n",
    "\n",
    "# Index the data as {doc_id: {sec_name: {column: value}}}\n",
    "ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
    "generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
    "\n",
    "# Columns present in both workbooks; only these can be compared.\n",
    "intersection = set(headers_gen).intersection(headers_gt)\n",
    "\n",
    "# List form of the shared columns (order is arbitrary because it comes from a set).\n",
    "intersection_list = list(intersection)\n",
    "\n",
    "# Start from an empty notebook-level total_fn list before the evaluation loop below.\n",
    "total_fn = []\n",
    "\n",
    "# df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
    "\n",
    "# all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
    "\n",
    "\n",
    "# for provider_name in all_provider_dict:\n",
    "#     provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
    "#     comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
    "#     print(\"\\n\")\n",
    "#     print(\"\\n\")\n",
    "#     print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
    "#     #create_metrics_df(comparison_results)\n",
    "#     print_metrics_table(comparison_results)\n",
    "#     print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
    "#     print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
    "\n",
    "\n",
    "\n",
    "print(\"\\n\")\n",
    "print(\"\\n\")\n",
    "# None evaluates every document; each file restricts the run to the doc ids it lists (one per line).\n",
    "document_list_file_list = [None, \n",
    "                           \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n",
    "                           \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n",
    "\n",
    "# Loop-invariant output location and file-name stem, prepared once.\n",
    "output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n",
    "os.makedirs(output_metrics_folder, exist_ok=True)\n",
    "generated_file_base_name = os.path.basename(path_generated_results).replace(\".xlsx\", \"\")\n",
    "\n",
    "for document_list_file in document_list_file_list:\n",
    "    document_list = None\n",
    "    if document_list_file is not None:\n",
    "        with open(document_list_file, \"r\", encoding=\"utf-8\") as f:\n",
    "            # Skip blank lines so they are not treated as doc ids and do not\n",
    "            # inflate the document count used in the output file name.\n",
    "            document_list = [line.strip() for line in f if line.strip()]\n",
    "    \n",
    "    print(\"All Providers Results: \")\n",
    "    print(\"Document List File - \", document_list_file)\n",
    "    comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, \n",
    "                                                                                      generated_results_indexed, \n",
    "                                                                                      headers_gt, doc_id_index, \n",
    "                                                                                      fund_name_index, \n",
    "                                                                                      intersection_list,\n",
    "                                                                                      funds_matched, \n",
    "                                                                                      funds_not_matched,\n",
    "                                                                                      document_list)\n",
    "    metrics_list = print_metrics_table(comparison_results)\n",
    "    print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
    "    # Guard the ratio: a document list with no doc id in the ground truth yields zero funds.\n",
    "    total_funds = funds_matched + funds_not_matched\n",
    "    matched_percentage = (funds_matched/total_funds)*100 if total_funds else 0.0\n",
    "    print(\"Percentage of Funds Matched - \" + str(matched_percentage))\n",
    "\n",
    "    metrics_df = pd.DataFrame(metrics_list)\n",
    "    message_df = pd.DataFrame(message_list)\n",
    "\n",
    "    # One workbook per document list: metrics on one sheet, error records on the other.\n",
    "    metrics_file_name = f\"metrics_{generated_file_base_name}\"\n",
    "    if document_list_file is not None:\n",
    "        metrics_file_name = f\"{metrics_file_name}_{len(document_list)}_documents.xlsx\"\n",
    "    else:\n",
    "        metrics_file_name = f\"{metrics_file_name}_all_documents.xlsx\"\n",
    "    metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)\n",
    "    with pd.ExcelWriter(metrics_file_path) as writer:\n",
    "        metrics_df.to_excel(writer, sheet_name=\"metrics_data\", index=False)\n",
    "        message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Fairview Eq Ptnr Emg Comp', 'truth': '0.56', 'generated': '0.54', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'truth': '0', 'generated': '0.56', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 521606716, 'sec_name': 'CFS Enhanced Index Balanced-Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 544886057, 'sec_name': 'CFS Growth Builder', 'truth': '0.01', 'generated': '0.04', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 553449663, 'sec_name': 'AMP Capital Specialist International Share (Hedged) Fund - Class A', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 521606716, 'sec_name': 'CFS Enhanced Index Balanced-Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 544886057, 'sec_name': 'CFS Growth Builder', 'truth': '0.01', 'generated': '0.04', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 553449663, 'sec_name': 'AMP Capital Specialist International Share (Hedged) Fund - Class A', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Fairview Eq Ptnr Emg Comp', 'truth': '0.56', 'generated': '0.54', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'truth': '0', 'generated': '0.56', 'error': 'Truth is not equal with generated'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n",
      "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n"
     ]
    }
   ],
   "source": [
    "for message_list_element in message_list:\n",
    "    if message_list_element[\"data_point\"] == \"performance_fee_costs\":\n",
    "        print(message_list_element)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Excel file '/data/aus_prospectus/output/error_analysis/anomalies_found.xlsx' has been created successfully.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Convert data to DataFrame\n",
    "df = pd.DataFrame(message_list)\n",
    "\n",
    "# Sort DataFrame by 'doc_id'\n",
    "df_sorted = df.sort_values(by=['doc_id'])\n",
    "\n",
    "# Save DataFrame to Excel file\n",
    "os.makedirs(\"/data/aus_prospectus/output/error_analysis/\", exist_ok=True)\n",
    "output_filename = r\"/data/aus_prospectus/output/error_analysis/anomalies_found.xlsx\"\n",
    "df_sorted.to_excel(output_filename, index=False)\n",
    "\n",
    "print(f\"Excel file '{output_filename}' has been created successfully.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "blade",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}