Added code to identify anomaly cases and compute the performance matrix, and updated the PDF downloading code

This commit is contained in:
Ravi Maheshwari 2025-03-13 17:31:54 +05:30
parent 336fd9a24f
commit 97da7e4961
2 changed files with 850 additions and 3 deletions

842
performance.ipynb Normal file
View File

@ -0,0 +1,842 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n",
" \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n",
" \"Minimum Initial Investment\", \"Benchmark\"]\n",
"\n",
"\n",
"imp_datapoints_mapping = {\n",
" \"Management Fee and Costs\": \"management_fee_and_costs\",\n",
" \"Management Fee\": \"management_fee\",\n",
" \"Performance fee and cost\": \"performance_fee_costs\",\n",
" \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n",
" \"Administration Fee and costs\": \"administration_fees\",\n",
" \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n",
" \"Buy Spread\": \"buy_spread\",\n",
" \"Sell Spread\": \"sell_spread\",\n",
" \"Performance Fee\": \"PerformanceFeeCharged\",\n",
" \"Minimum Initial Investment\": \"minimum_initial_investment\",\n",
" \"Benchmark\": \"benchmark_name\"\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"\n",
"path_ground_truth = r\"C:\\data\\aus_prospectus\\output\\Performance\\46_documents_ground_truth_with_mapping.xlsx\"\n",
"path_generated_results = r\"C:\\data\\aus_prospectus\\output\\Performance\\mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n",
"provider_mapping_file_path = r\"C:\\Users\\rmahesh\\OneDrive - MORNINGSTAR INC\\Desktop\\NLP Transitions\\Project\\Exprs\\INO71\\dc-ml-dataextraction-llm-aus-nz-pro-AUS_NZ_EXE_COMBINED_PHASE1_PHASE2\\output_files\\ground_truth\\TopProvidersBiz.xlsx\"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"\n",
"All Providers Results: \n",
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
"Management Fee and Costs \t0.8790 \t0.9250 \t0.8213 \t0.9014 \t494 \t407 \t2 \t56 \t33 \n",
"Management Fee \t0.8985 \t0.9265 \t0.8394 \t0.9123 \t494 \t416 \t2 \t47 \t33 \n",
"Performance fee and cost \t0.7871 \t0.8472 \t0.7791 \t0.8161 \t327 \t244 \t144 \t66 \t44 \n",
"Interposed vehicle Performance fee and Costs \t0.5000 \t1.0000 \t0.9237 \t0.6667 \t39 \t38 \t422 \t38 \t0 \n",
"Administration Fee and costs \t0.9787 \t0.9388 \t0.9839 \t0.9583 \t98 \t92 \t398 \t2 \t6 \n",
"Total Annual Dollar Based Charges \t0.8165 \t1.0000 \t0.9598 \t0.8990 \t90 \t89 \t389 \t20 \t0 \n",
"Buy Spread \t0.8957 \t0.8910 \t0.8394 \t0.8933 \t405 \t335 \t83 \t39 \t41 \n",
"Sell Spread \t0.9064 \t0.8921 \t0.8474 \t0.8992 \t405 \t339 \t83 \t35 \t41 \n",
"Minimum Initial Investment \t0.8571 \t0.9671 \t0.8815 \t0.9088 \t310 \t294 \t145 \t49 \t10 \n",
"Benchmark \t0.6402 \t0.8582 \t0.8233 \t0.7333 \t173 \t121 \t289 \t68 \t20 \n",
"TOTAL \t0.8159 \t0.9246 \t0.8699 \t0.8588 \t2835 \t2375 \t1957 \t420 \t228 \n",
"Total Funds Matched - 498\n",
"Total Funds Not Matched - 28\n",
"Percentage of Funds Matched - 94.67680608365019\n"
]
}
],
"source": [
"import openpyxl\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import statistics\n",
"\n",
"\n",
"funds_matched = 0\n",
"funds_not_matched = 0\n",
"def load_excel(filepath, header_row_index):\n",
" \"\"\"Load an Excel file and use the specified row as the header.\"\"\"\n",
" wb = openpyxl.load_workbook(filepath, data_only=True)\n",
" sheet = wb.active\n",
" headers = []\n",
" data = []\n",
"\n",
" for index, row in enumerate(sheet.iter_rows(values_only=True)):\n",
" if index == header_row_index:\n",
" headers = [cell if cell is not None else \"\" for cell in row]\n",
" elif index > header_row_index:\n",
" data.append([cell if cell is not None else \"\" for cell in row])\n",
"\n",
" return headers, data\n",
"\n",
"def index_data_by_key(data, key_index, secondary_key_index, header):\n",
" \"\"\"Index data by primary and secondary keys (doc_id and fund_name).\"\"\"\n",
" indexed_data = defaultdict(dict)\n",
" \n",
" for row in data:\n",
" row_data = {}\n",
" # Store the entire row, which will be useful for full row comparison\n",
" for i in range(len(row)):\n",
" if header[i] == \"doc_id\":\n",
" primary_key = int(row[i])\n",
" elif header[i] == \"fund_name\":\n",
" secondary_key = str(row[i])\n",
" else:\n",
" row_data[header[i]] = convert_if_number(row[i])\n",
" indexed_data[primary_key][secondary_key] = row_data\n",
" return indexed_data\n",
"\n",
"def convert_if_number(value):\n",
" \"\"\"Attempt to convert value to a float or int, otherwise return as string.\"\"\"\n",
" try:\n",
" float_value = round(float(value), 2)\n",
" int_value = int(float_value)\n",
" return int_value if int_value == float_value else float_value\n",
" except (ValueError, TypeError):\n",
" return value\n",
"\n",
"def compare_values(value1, value2):\n",
" \"\"\"Convert values to numbers if possible and compare, otherwise compare as strings.\"\"\"\n",
" value1 = convert_if_number(value1)\n",
" value2 = convert_if_number(value2)\n",
" return value1 == value2\n",
"\n",
"def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n",
" \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n",
" results = {}\n",
" funds_matched, funds_not_matched = 0, 0\n",
" # Initialize result dictionaries for each column except 'doc_id'\n",
" for keys in headers:\n",
" if keys != \"doc_id\":\n",
" results[keys] = {}\n",
" results[keys][\"TP\"] = 0\n",
" results[keys][\"TN\"] = 0\n",
" results[keys][\"FP\"] = 0\n",
" results[keys][\"FN\"] = 0\n",
" results[keys][\"SUPPORT\"] = 0\n",
"\n",
" # Iterate over the generated results instead of the ground truth\n",
" for doc_id, funds in ground_truth.items():\n",
" if doc_id in generated_results:\n",
" for fund_name, truth_values in funds.items():\n",
" if fund_name in generated_results[doc_id]:\n",
" generated_values = generated_results[doc_id][fund_name]\n",
" # Compare all other columns\n",
" for i in intersection_list:\n",
" for keys in imp_datapoints:\n",
" if i == imp_datapoints_mapping[keys]:\n",
" if truth_values[i] == \"\":\n",
" if truth_values[i] == generated_values[i]:\n",
" results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
" else:\n",
" results[i][\"FP\"] = results[i][\"FP\"] + 1 \n",
" else:\n",
" if truth_values[i] == generated_values[i]:\n",
" results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
" elif generated_values[i] != \"\":\n",
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
" else:\n",
" results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
" results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
"\n",
"\n",
" # if truth_values[i] == generated_values[i] and truth_values[i] == \"\":\n",
" # results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
" # elif truth_values[i] == generated_values[i]:\n",
" # results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
" # elif truth_values[i] != \"\" and generated_values[i] == \"\":\n",
" # results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
" # elif truth_values[i] == \"\" and generated_values[i] != \"\":\n",
" # results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
" # else:\n",
" # results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
" # if truth_values[i] != \"\":\n",
" # results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
" funds_matched += 1\n",
" else:\n",
" funds_not_matched += 1\n",
" # for keys in headers:\n",
" # if keys != \"doc_id\":\n",
" # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
" else:\n",
" # If the entire document is not found, count all funds as not matched\n",
" funds_not_matched += len(funds)\n",
" # for fund_name in funds:\n",
" # for keys in headers:\n",
" # if keys != \"doc_id\":\n",
" # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
"\n",
" return results, funds_matched, funds_not_matched\n",
"\n",
"\n",
"# Load the files\n",
"headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
"headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
"\n",
"# Assuming doc_id is the first column and fund_name is the second column\n",
"doc_id_index = 0\n",
"fund_name_index = 1\n",
"\n",
"# Index the data\n",
"ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
"generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
"\n",
"intersection = set(headers_gen).intersection(headers_gt)\n",
"\n",
"# Convert the result back to a list (if you need it as a list)\n",
"intersection_list = list(intersection)\n",
"\n",
"total_fn = []\n",
"def calculate_metrics(tp, tn, fp, fn):\n",
" \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n",
" precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n",
" recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n",
" accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n",
" f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n",
" return precision, recall, accuracy, f1_score\n",
"\n",
"def print_metrics_table(data):\n",
" # Print table headers\n",
" print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
" total_precision, total_recall, total_accuracy, total_f1_score, total_support= [],[],[],[],[]\n",
" \n",
" total_tp = []\n",
" total_tn = []\n",
" total_fp = []\n",
" #total_fn = []\n",
" # Calculate and print metrics for each item\n",
" for keys in imp_datapoints:\n",
" try:\n",
" key = imp_datapoints_mapping[keys]\n",
" values = data[key]\n",
" tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
" precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
" total_precision.append(precision)\n",
" total_recall.append(recall)\n",
" total_accuracy.append(accuracy)\n",
" total_f1_score.append(f1_score)\n",
" total_support.append(values[\"SUPPORT\"])\n",
" total_tp.append(tp)\n",
" total_tn.append(tn)\n",
" total_fp.append(fp)\n",
" total_fn.append(fn)\n",
"\n",
" if values[\"SUPPORT\"] > 0 and key > \"\":\n",
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(keys, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
" except:\n",
" pass\n",
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", statistics.mean(total_precision), statistics.mean(total_recall), statistics.mean(total_accuracy), statistics.mean(total_f1_score), sum(total_support), sum(total_tp), sum(total_tn), sum(total_fp), sum(total_fn)))\n",
" \n",
"def create_metrics_df(data):\n",
" # Define a list to hold data for DataFrame\n",
" rows = []\n",
" \n",
" # Iterate through each metric item\n",
" for key in imp_datapoints:\n",
" try:\n",
" mapped_key = imp_datapoints_mapping[key]\n",
" values = data[mapped_key]\n",
" tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
" precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
" \n",
" # Only add rows where SUPPORT > 0\n",
" if values[\"SUPPORT\"] > 0:\n",
" row = {\n",
" \"Metric\": key,\n",
" \"Precision\": precision,\n",
" \"Recall\": recall,\n",
" \"Accuracy\": accuracy,\n",
" \"F1-Score\": f1_score,\n",
" \"SUPPORT\": values[\"SUPPORT\"]\n",
" }\n",
" rows.append(row)\n",
" except KeyError as e:\n",
" continue\n",
"\n",
" # Create a DataFrame from the list of rows\n",
" df_metrics = pd.DataFrame(rows)\n",
" df_metrics.reset_index(inplace=True)\n",
" df_metrics.drop(columns=[\"index\"], inplace=True)\n",
" print(df_metrics)\n",
" return df_metrics\n",
"\n",
"\n",
"\n",
"def get_provider_mapping(file_path):\n",
" df = pd.read_excel(file_path)\n",
" df = (df.groupby([\"Docid\", \"ProviderName\"]).first())\n",
" df.reset_index(inplace = True)\n",
" return df[[\"Docid\", \"ProviderName\"]]\n",
"\n",
"\n",
"def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
" providers_dict = {}\n",
" for doc_id in generated_results_indexed:\n",
" try:\n",
" provider_name = (df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values)[0]\n",
" if provider_name in providers_dict:\n",
" providers_dict[provider_name].append(doc_id)\n",
" else:\n",
" providers_dict[provider_name] = []\n",
" providers_dict[provider_name].append(doc_id)\n",
"\n",
" except:\n",
" pass\n",
" return providers_dict\n",
"\n",
"def get_specified_doc_data(results, doc_list):\n",
" provider_res = {}\n",
" for doc_id in doc_list:\n",
" if doc_id in results:\n",
" provider_res[doc_id] = results[doc_id]\n",
" return provider_res\n",
"\n",
"\n",
"df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
"\n",
"all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
"\n",
"\n",
"# for provider_name in all_provider_dict:\n",
"# provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
"# comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
"# print(\"\\n\")\n",
"# print(\"\\n\")\n",
"# print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
"# #create_metrics_df(comparison_results)\n",
"# print_metrics_table(comparison_results)\n",
"# print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"# print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
"\n",
"\n",
"\n",
"print(\"\\n\")\n",
"print(\"\\n\")\n",
"print(\"All Providers Results: \")\n",
"comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
"\n",
"print_metrics_table(comparison_results)\n",
"print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"\n",
"All Providers Results: \n",
"Performance fee and cost - 377377369 truth is null and generated - 0 SPDR® S&P Emerging Markets Carbon Control Fund\n",
"Performance fee and cost - 397107472 truth is null and generated - 0 AMP Capital Specialist Diversified Fixed Income Fund\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.11 OnePath OneAnswer Frontier Investment Portfolio-OnePath Multi Asset Income Trust\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.07 OA Frontier IP-OnePath Australian Share Trust\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.33 OA Frontier Investment Portfolio- BlackRock Tactical Growth\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.02 OA Frontier Investment Portfolio- Pendal Monthly Income Plus\n",
"Performance fee and cost - 401212184 truth - 0.41 and generated - 0.13 OnePath Alternatives Growth Trust\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.03 OA Frontier IP-Ausbil Australian Emerging Leaders Trust\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.15 OA Frontier IP-Perpetual Balanced Growth\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.03 OA Frontier IP-Perpetual Conservative Growth\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.06 OA Frontier IP-Platinum International\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.15 OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth\n",
"Performance fee and cost - 401212184 truth - 0 and generated - 0.01 ANZ OneAnswer Investment Portfolio - OnePath Balanced Index\n",
"Performance fee and cost - 401212184 generated is null and truth is - 0 ANZ OneAnswer Investment Portfolio - OnePath Growth Index\n",
"Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard Index Diversified Bond\n",
"Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard International Shares Index\n",
"Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard Investor Short Term Fixed Interest Fund\n",
"Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard Index Hedged International Shares Fund\n",
"Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard LifeStrategy Growth\n",
"Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard LifeStrategy Conservative\n",
"Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard LifeStrategy High Growth\n",
"Performance fee and cost - 411062815 truth is null and generated - 13.98 Perpetual WFP-Perpetual Share Plus L/S\n",
"Performance fee and cost - 411062815 truth - 0 and generated - 0.01 WFP Schroder Fixed Income\n",
"Performance fee and cost - 411062815 truth - 0 and generated - 15.38 Perpetual Ausbil Australian Emerg Ldrs\n",
"Performance fee and cost - 411062815 truth - 0.03 and generated - 0.12 WFP Macquarie Income Opportunities\n",
"Performance fee and cost - 411062815 generated is null and truth is - 0 WFP Diversified Income\n",
"Performance fee and cost - 412778803 generated is null and truth is - 0.14 \n",
"Performance fee and cost - 412778803 generated is null and truth is - 0.67 Telstra Property Pension\n",
"Performance fee and cost - 412778803 generated is null and truth is - 0.01 Telstra Cash Pension\n",
"Performance fee and cost - 412778803 generated is null and truth is - 0.01 Telstra Australian shares Pension\n",
"Performance fee and cost - 412778803 generated is null and truth is - 0.14 Telstra Defensive growth Pension\n",
"Performance fee and cost - 412778803 generated is null and truth is - 0.01 Telstra International shares Pension\n",
"Performance fee and cost - 414751292 truth - 0.24 and generated - 0 Platinum Global Fund (Long Only)\n",
"Performance fee and cost - 414751292 truth - 0.15 and generated - 0 \n",
"Performance fee and cost - 414751292 truth - 0.03 and generated - 0 Platinum International Brands Fund\n",
"Performance fee and cost - 414751292 truth - 0.86 and generated - 0 Platinum International Healthcare\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 \n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Ausbil Aus. Emrging Leaders\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Investors Mutual Aus. Shre\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Macquarie Inc Opportunities\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Global Share Fund\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPF - Hedged Global Share Fund\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Hedged Global Share Fund\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - IncomeBuilder\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPF - PIMCO Div. Fixed Interest\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPF - PIMCO Global Bond Fund\n",
"Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - PIMCO Global Bond Fund\n",
"Performance fee and cost - 446324179 generated is null and truth is - 0.28 Lifeplan Investment Bond - Allan Gray Australian Equity Fund\n",
"Performance fee and cost - 446324179 generated is null and truth is - 0.05 Lifeplan MLC Horizon 2-Capital Stable Open\n",
"Performance fee and cost - 454036250 generated is null and truth is -   \n",
"Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Global Value Trust -Active ETF\n",
"Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Australia Core Equity Trust - Active ETF\n",
"Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Australian Value Trust - Active ETF\n",
"Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n",
"Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Global Core Equity Tr\n",
"Performance fee and cost - 550769189 truth is null and generated - 0 Acadian Global Managed Volatility Equity - Class A\n",
"Performance fee and cost - 550522985 truth is null and generated - 0 RQI Global Value Class A\n",
"Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock Australian Fixed Interest Index\n",
"Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock Australian Equity Index\n",
"Performance fee and cost - 539266893 generated is null and truth is - AMP Generations - Alliance Capital Cash Management\n",
"Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock Property Securities Index\n",
"Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock International Equity Index (Unhedged)\n",
"Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock International Equity Index (Hedged)\n",
"Performance fee and cost - 539241700 truth - 0.08 and generated - 0.05 North Professional Balanced\n",
"Performance fee and cost - 539241700 truth - 0.06 and generated - 0 North Professional High Growth\n",
"Performance fee and cost - 539241700 truth - 0.08 and generated - 0 North Professional Conservative\n",
"Performance fee and cost - 539241700 truth - 0.08 and generated - 0 North Professional Growth\n",
"Performance fee and cost - 539241700 truth - 0.09 and generated - 0 North Professional Moderately Conservative\n",
"Performance fee and cost - 539261734 truth - 0.01 and generated - 0 ipac life choices Income Generator\n",
"Performance fee and cost - 539261734 truth - 0.06 and generated - 0 ipac life choices Active 100\n",
"Performance fee and cost - 539261734 truth - 0.08 and generated - 0 ipac life choices Active 85\n",
"Performance fee and cost - 539261734 truth - 0.01 and generated - 0 ipac life choices Index 50\n",
"Performance fee and cost - 539261734 truth - 0.09 and generated - 0 ipac life choices Active 50\n",
"Performance fee and cost - 539261734 truth - 0.08 and generated - 0 ipac life choices Active 70\n",
"Performance fee and cost - 506913190 generated is null and truth is - 0.03 FC W Pen-CFS TTR Moderate\n",
"Performance fee and cost - 506913190 generated is null and truth is - 0.04 FC W Pen-CFS TTR Growth\n",
"Performance fee and cost - 506913190 generated is null and truth is - 0.47 \n",
"Performance fee and cost - 553449663 truth - 0 and generated - 0.07 AMP Capital Specialist International Share (Hedged) Fund\n",
"Performance fee and cost - 539266874 truth - 0.03 and generated - 0 SUMMIT Select - Active High Growth Units\n",
"Performance fee and cost - 539266874 truth - 0.05 and generated - 0 SUMMIT Select - Active Moderately Defensive\n",
"Performance fee and cost - 539266874 truth - 0.05 and generated - 0 SUMMIT Select - Active Growth Units\n",
"Performance fee and cost - 539266874 truth - 0.05 and generated - 0 SUMMIT Select - Active Balanced\n",
"Performance fee and cost - 539266874 truth - 0.06 and generated - 0 SUMMIT Select - Active Defensive Units\n",
"Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Active High Growth\n",
"Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Active Moderately Defensive\n",
"Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Active Growth\n",
"Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Balanced\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BT Future Goals BTFM\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BTFM Asian Share\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BT International Share BTFM\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BT Smaller Companies BTFM\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BT Investment Funds - BT TIME Fund\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BT European Share Growth\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BT American Share Growth\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 BT Imputation Share BTFM\n",
"Performance fee and cost - 526200514 generated is null and truth is - 0 \n",
"Performance fee and cost - 521606755 truth is null and generated - 0 CFS Index Diversified\n",
"Performance fee and cost - 557526129 truth is null and generated - 0 Fortlake Real-Income Fund\n",
"Performance fee and cost - 540028470 truth is null and generated - 0 CFS Wholesale Index Australian Share\n",
"Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n",
"Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Australian Value Trust - Active ETF\n",
"Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Global Value Trust -Active ETF\n",
"Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Australia Core Equity Trust - Active ETF\n",
"Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Global Small Company Trust\n",
"Performance fee and cost - 557362553 truth is null and generated - 0 JPMorgan Global Select Equity Fund\n",
"Performance fee and cost - 527969661 truth is null and generated - 0 JPMorgan Global Equity Premium Income (Hedged) Complex ETF\n",
"Performance fee and cost - 384508026 generated is null and truth is - 0 Mercer Multi-manager High Growth Fund\n",
"Performance fee and cost - 384508026 generated is null and truth is - 0 Mercer Multi-manager Growth Fund\n",
"Performance fee and cost - 384508026 generated is null and truth is - 0 \n",
"total - 452.72727272727275\n",
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
"Management Fee and Costs \t0.8790 \t0.9250 \t0.8213 \t0.9014 \t494 \t407 \t2 \t56 \t33 \n",
"Management Fee \t0.8985 \t0.9265 \t0.8394 \t0.9123 \t494 \t416 \t2 \t47 \t33 \n",
"Performance fee and cost \t0.7871 \t0.8472 \t0.7791 \t0.8161 \t327 \t244 \t144 \t66 \t44 \n",
"Interposed vehicle Performance fee and Costs \t0.5000 \t1.0000 \t0.9237 \t0.6667 \t39 \t38 \t422 \t38 \t0 \n",
"Administration Fee and costs \t0.9787 \t0.9388 \t0.9839 \t0.9583 \t98 \t92 \t398 \t2 \t6 \n",
"Total Annual Dollar Based Charges \t0.8165 \t1.0000 \t0.9598 \t0.8990 \t90 \t89 \t389 \t20 \t0 \n",
"Buy Spread \t0.8957 \t0.8910 \t0.8394 \t0.8933 \t405 \t335 \t83 \t39 \t41 \n",
"Sell Spread \t0.9064 \t0.8921 \t0.8474 \t0.8992 \t405 \t339 \t83 \t35 \t41 \n",
"Minimum Initial Investment \t0.8571 \t0.9671 \t0.8815 \t0.9088 \t310 \t294 \t145 \t49 \t10 \n",
"Benchmark \t0.6402 \t0.8582 \t0.8233 \t0.7333 \t173 \t121 \t289 \t68 \t20 \n",
"TOTAL \t0.8159 \t0.9246 \t0.8699 \t0.8588 \t2835 \t2375 \t1957 \t420 \t228 \n",
"Total Funds Matched - 498\n",
"Total Funds Not Matched - 28\n",
"Percentage of Funds Matched - 94.67680608365019\n"
]
}
],
"source": [
"import openpyxl\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import statistics\n",
"\n",
"\n",
"funds_matched = 0\n",
"funds_not_matched = 0\n",
"def load_excel(filepath, header_row_index):\n",
" \"\"\"Load an Excel file and use the specified row as the header.\"\"\"\n",
" wb = openpyxl.load_workbook(filepath, data_only=True)\n",
" sheet = wb.active\n",
" headers = []\n",
" data = []\n",
"\n",
" for index, row in enumerate(sheet.iter_rows(values_only=True)):\n",
" if index == header_row_index:\n",
" headers = [cell if cell is not None else \"\" for cell in row]\n",
" elif index > header_row_index:\n",
" data.append([cell if cell is not None else \"\" for cell in row])\n",
"\n",
" return headers, data\n",
"\n",
"def index_data_by_key(data, key_index, secondary_key_index, header):\n",
" \"\"\"Index data by primary and secondary keys (doc_id and fund_name).\"\"\"\n",
" indexed_data = defaultdict(dict)\n",
" \n",
" for row in data:\n",
" row_data = {}\n",
" # Store the entire row, which will be useful for full row comparison\n",
" for i in range(len(row)):\n",
" if header[i] == \"doc_id\":\n",
" primary_key = int(row[i])\n",
" elif header[i] == \"fund_name\":\n",
" secondary_key = str(row[i])\n",
" else:\n",
" row_data[header[i]] = convert_if_number(row[i])\n",
" indexed_data[primary_key][secondary_key] = row_data\n",
" return indexed_data\n",
"\n",
"def convert_if_number(value):\n",
" \"\"\"Attempt to convert value to a float or int, otherwise return as string.\"\"\"\n",
" try:\n",
" float_value = round(float(value), 2)\n",
" int_value = int(float_value)\n",
" return int_value if int_value == float_value else float_value\n",
" except (ValueError, TypeError):\n",
" return value\n",
"\n",
"def compare_values(value1, value2):\n",
" \"\"\"Convert values to numbers if possible and compare, otherwise compare as strings.\"\"\"\n",
" value1 = convert_if_number(value1)\n",
" value2 = convert_if_number(value2)\n",
" return value1 == value2\n",
"def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n",
" \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n",
" results = {}\n",
" funds_matched, funds_not_matched = 0, 0\n",
" # Initialize result dictionaries for each column except 'doc_id'\n",
" for keys in headers:\n",
" if keys != \"doc_id\":\n",
" results[keys] = {}\n",
" results[keys][\"TP\"] = 0\n",
" results[keys][\"TN\"] = 0\n",
" results[keys][\"FP\"] = 0\n",
" results[keys][\"FN\"] = 0\n",
" results[keys][\"SUPPORT\"] = 0\n",
" \n",
" # Iterate over the generated results instead of the ground truth\n",
" \n",
" total = 0\n",
" for doc_id, funds in ground_truth.items():\n",
" if doc_id in generated_results:\n",
" for fund_name, truth_values in funds.items():\n",
" if fund_name in generated_results[doc_id]:\n",
" generated_values = generated_results[doc_id][fund_name]\n",
" # Compare all other columns\n",
" for i in intersection_list:\n",
" for keys in imp_datapoints:\n",
" if i == imp_datapoints_mapping[keys]:\n",
" total = total +1\n",
" if truth_values[i] == \"\":\n",
" if truth_values[i] == generated_values[i]:\n",
" results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
" else:\n",
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
" if \"Performance fee and cost\" in keys:\n",
" debug = 0\n",
" print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], fund_name) \n",
" else:\n",
" if truth_values[i] == generated_values[i]:\n",
" results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
" elif generated_values[i] != \"\":\n",
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
" if \"Performance fee and cost\" in keys:\n",
" debug = 0\n",
" print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", fund_name)\n",
" else:\n",
" results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
" if \"Performance fee and cost\" in keys:\n",
" debug = 0\n",
" print(keys, \" - \" , doc_id, \" generated is null and truth is - \", truth_values[i], fund_name)\n",
" results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
"\n",
"\n",
" # if truth_values[i] == generated_values[i] and truth_values[i] == \"\":\n",
" # results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
" # elif truth_values[i] == generated_values[i]:\n",
" # results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
" # elif truth_values[i] != \"\" and generated_values[i] == \"\":\n",
" # results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
" # elif truth_values[i] == \"\" and generated_values[i] != \"\":\n",
" # results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
" # else:\n",
" # results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
" # if truth_values[i] != \"\":\n",
" # results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
" funds_matched += 1\n",
" else:\n",
" funds_not_matched += 1\n",
" # for keys in headers:\n",
" # if keys != \"doc_id\":\n",
" # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
" else:\n",
" # If the entire document is not found, count all funds as not matched\n",
" funds_not_matched += len(funds)\n",
" # for fund_name in funds:\n",
" # for keys in headers:\n",
" # if keys != \"doc_id\":\n",
" # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
" return results, funds_matched, funds_not_matched\n",
"\n",
"\n",
"# Load the files\n",
"headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
"headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
"\n",
"# Assuming doc_id is the first column and fund_name is the second column\n",
"doc_id_index = 0\n",
"fund_name_index = 1\n",
"\n",
"# Index the data\n",
"ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
"generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
"\n",
"intersection = set(headers_gen).intersection(headers_gt)\n",
"\n",
"# Convert the result back to a list (if you need it as a list)\n",
"intersection_list = list(intersection)\n",
"\n",
"total_fn = []\n",
"def calculate_metrics(tp, tn, fp, fn):\n",
" \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n",
" precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n",
" recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n",
" accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n",
" f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n",
" return precision, recall, accuracy, f1_score\n",
"\n",
"def print_metrics_table(data):\n",
" # Print table headers\n",
" print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
" total_precision, total_recall, total_accuracy, total_f1_score, total_support= [],[],[],[],[]\n",
" \n",
" total_tp = []\n",
" total_tn = []\n",
" total_fp = []\n",
" #total_fn = []\n",
" # Calculate and print metrics for each item\n",
" for keys in imp_datapoints:\n",
" try:\n",
" key = imp_datapoints_mapping[keys]\n",
" values = data[key]\n",
" tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
" precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
" total_precision.append(precision)\n",
" total_recall.append(recall)\n",
" total_accuracy.append(accuracy)\n",
" total_f1_score.append(f1_score)\n",
" total_support.append(values[\"SUPPORT\"])\n",
" total_tp.append(tp)\n",
" total_tn.append(tn)\n",
" total_fp.append(fp)\n",
" total_fn.append(fn)\n",
"\n",
" if values[\"SUPPORT\"] > 0 and key > \"\":\n",
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(keys, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
" except:\n",
" pass\n",
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", statistics.mean(total_precision), statistics.mean(total_recall), statistics.mean(total_accuracy), statistics.mean(total_f1_score), sum(total_support), sum(total_tp), sum(total_tn), sum(total_fp), sum(total_fn)))\n",
" \n",
"def create_metrics_df(data):\n",
" # Define a list to hold data for DataFrame\n",
" rows = []\n",
" \n",
" # Iterate through each metric item\n",
" for key in imp_datapoints:\n",
" try:\n",
" mapped_key = imp_datapoints_mapping[key]\n",
" values = data[mapped_key]\n",
" tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
" precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
" \n",
" # Only add rows where SUPPORT > 0\n",
" if values[\"SUPPORT\"] > 0:\n",
" row = {\n",
" \"Metric\": key,\n",
" \"Precision\": precision,\n",
" \"Recall\": recall,\n",
" \"Accuracy\": accuracy,\n",
" \"F1-Score\": f1_score,\n",
" \"SUPPORT\": values[\"SUPPORT\"]\n",
" }\n",
" rows.append(row)\n",
" except KeyError as e:\n",
" continue\n",
"\n",
" # Create a DataFrame from the list of rows\n",
" df_metrics = pd.DataFrame(rows)\n",
" df_metrics.reset_index(inplace=True)\n",
" df_metrics.drop(columns=[\"index\"], inplace=True)\n",
" print(df_metrics)\n",
" return df_metrics\n",
"\n",
"\n",
"\n",
"def get_provider_mapping(file_path):\n",
" df = pd.read_excel(file_path)\n",
" df = (df.groupby([\"Docid\", \"ProviderName\"]).first())\n",
" df.reset_index(inplace = True)\n",
" return df[[\"Docid\", \"ProviderName\"]]\n",
"\n",
"\n",
"def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
" providers_dict = {}\n",
" for doc_id in generated_results_indexed:\n",
" try:\n",
" provider_name = (df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values)[0]\n",
" if provider_name in providers_dict:\n",
" providers_dict[provider_name].append(doc_id)\n",
" else:\n",
" providers_dict[provider_name] = []\n",
" providers_dict[provider_name].append(doc_id)\n",
"\n",
" except:\n",
" pass\n",
" return providers_dict\n",
"\n",
"def get_specified_doc_data(results, doc_list):\n",
" provider_res = {}\n",
" for doc_id in doc_list:\n",
" if doc_id in results:\n",
" provider_res[doc_id] = results[doc_id]\n",
" return provider_res\n",
"\n",
"\n",
"df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
"\n",
"all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
"\n",
"\n",
"# for provider_name in all_provider_dict:\n",
"# provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
"# comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
"# print(\"\\n\")\n",
"# print(\"\\n\")\n",
"# print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
"# #create_metrics_df(comparison_results)\n",
"# print_metrics_table(comparison_results)\n",
"# print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"# print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
"\n",
"\n",
"\n",
"print(\"\\n\")\n",
"print(\"\\n\")\n",
"print(\"All Providers Results: \")\n",
"comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
"\n",
"print_metrics_table(comparison_results)\n",
"print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
"print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "blade",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -4,6 +4,7 @@ import os
import platform
from utils.logger import logger
import dotenv
import certifi
# loads .env file with your OPENAI_API_KEY
dotenv.load_dotenv()
@ -38,9 +39,13 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
else:
if os_name == "windows":
ACCESS_KEY = os.getenv('ACCESS_KEY')
SECRET_KEY = os.getenv('SECRET_KEY')
session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.client('s3')
SECRET_KEY = os.getenv('SECRET_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(),
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY,
aws_session_token=AWS_SESSION_TOKEN
)
else:
s3 = boto3.client('s3')