1. The metrics key should be the share class name: sec_name

2. Support outputting metrics data as an Excel file
3. Optimize instructions for performance_fee_costs
2025-03-13 11:53:27 -05:00 · 2025-03-13 11:53:27 -05:00 · a090b5cc9e
parent 1f6b781b12
commit a090b5cc9e
5 changed files with 183 additions and 196 deletions
--- a/calc_metrics.py
+++ b/calc_metrics.py
@ -567,11 +567,11 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
    verify_data_df = pd.DataFrame()
    
    audit_fields = [
-        "DocumentId",
-        "FundLegalName",
-        "FundId",
-        "FundClassLegalName",
-        "FundClassId",
+        "doc_id",
+        "fund_name",
+        "fund_id",
+        "sec_name",
+        "sec_id",
        "management_fee_and_costs",
        "management_fee",
        "administration_fees",
@ -590,11 +590,11 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
    audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet)
    audit_data_df = audit_data_df[audit_fields]
    audit_data_df = audit_data_df.drop_duplicates()
-    audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id", 
-                                                  "FundLegalName": "fund_name", 
-                                                  "FundId": "fund_id", 
-                                                  "FundClassLegalName": "sec_name", 
-                                                  "FundClassId": "sec_id"})
+    # audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id", 
+    #                                               "FundLegalName": "fund_name", 
+    #                                               "FundId": "fund_id", 
+    #                                               "FundClassLegalName": "sec_name", 
+    #                                               "FundClassId": "sec_id"})
    audit_data_df.fillna("", inplace=True)
    audit_data_df.reset_index(drop=True, inplace=True)
    
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@ -403,14 +403,15 @@
 				"---Example 1 End---",
 				"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
 				"{\"data\": []}",
-				"B If with pure performance fee in table, please extract relevant values",
+				"B. If with pure performance fee in table, please extract relevant values",
 				"---Example Start---",
 				"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
 				"---Example End---",
 				"a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.",
 				"b. This example mentioned share classes, please output according to share class.",
 				"The output should be",
-				"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}"
+				"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}",
+				"C. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0"

 			],
 			"minimum_initial_investment": [
@ -518,6 +519,7 @@
 					"For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ",
 					"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
 					"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
+					"Identify the value of the column \"Estimated Performance fees\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0",
 					"Therefore, the output should be:",
 					"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}"
 					]
@ -597,6 +599,7 @@
 					"---Example Start---",
 					"Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross) \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.04 \n0.10 / 0.10 \n0.09 \n",
 					"---Example End---",
+					"Identify the value of the 1st column \"Performance fee\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0",
 					"Please ignore the 3rd column: \"Equals investment fees and costs\" values!!",
 					"Please read context carefully, don't miss any data row!!",
 					"The output should be:",
--- a/main.py
+++ b/main.py
@ -1560,7 +1560,7 @@ if __name__ == "__main__":
        #                             "544886057",
        #                             "550769189",
        #                             "553449663"]
-        # special_doc_id_list = ["446324179"]
+        special_doc_id_list = ["420339794", "401212184"]
        # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
--- a/performance.ipynb
+++ b/performance.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -30,16 +30,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
-    "path_ground_truth = r\"C:\\data\\aus_prospectus\\output\\Performance\\46_documents_ground_truth_with_mapping.xlsx\"\n",
-    "path_generated_results = r\"C:\\data\\aus_prospectus\\output\\Performance\\mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n",
-    "provider_mapping_file_path = r\"C:\\Users\\rmahesh\\OneDrive - MORNINGSTAR INC\\Desktop\\NLP Transitions\\Project\\Exprs\\INO71\\dc-ml-dataextraction-llm-aus-nz-pro-AUS_NZ_EXE_COMBINED_PHASE1_PHASE2\\output_files\\ground_truth\\TopProvidersBiz.xlsx\"\n",
-    "\n",
-    "\n"
+    "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
+    "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n",
+    "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n"
   ]
  },
  {
@ -353,7 +351,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
@ -365,132 +363,55 @@
      "\n",
      "\n",
      "All Providers Results: \n",
-      "Performance fee and cost  -  377377369  truth is null and generated -  0 SPDR® S&P Emerging Markets Carbon Control Fund\n",
-      "Performance fee and cost  -  397107472  truth is null and generated -  0 AMP Capital Specialist Diversified Fixed Income Fund\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.11   OnePath OneAnswer Frontier Investment Portfolio-OnePath Multi Asset Income Trust\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.07   OA Frontier IP-OnePath Australian Share Trust\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.33   OA Frontier Investment Portfolio- BlackRock Tactical Growth\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.02   OA Frontier Investment Portfolio- Pendal Monthly Income Plus\n",
-      "Performance fee and cost  -  401212184  truth -  0.41  and generated -  0.13   OnePath Alternatives Growth Trust\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.03   OA Frontier IP-Ausbil Australian Emerging Leaders Trust\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.15   OA Frontier IP-Perpetual Balanced Growth\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.03   OA Frontier IP-Perpetual Conservative Growth\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.06   OA Frontier IP-Platinum International\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.15   OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth\n",
-      "Performance fee and cost  -  401212184  truth -  0  and generated -  0.01   ANZ OneAnswer Investment Portfolio - OnePath Balanced Index\n",
-      "Performance fee and cost  -  401212184  generated is null and  truth is -  0 ANZ OneAnswer Investment Portfolio - OnePath Growth Index\n",
-      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard Index Diversified Bond\n",
-      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard International Shares Index\n",
-      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard Investor Short Term Fixed Interest Fund\n",
-      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard Index Hedged International Shares Fund\n",
-      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard LifeStrategy Growth\n",
-      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard LifeStrategy Conservative\n",
-      "Performance fee and cost  -  409723592  truth is null and generated -  0 Vanguard LifeStrategy High Growth\n",
-      "Performance fee and cost  -  411062815  truth is null and generated -  13.98 Perpetual WFP-Perpetual Share Plus L/S\n",
-      "Performance fee and cost  -  411062815  truth -  0  and generated -  0.01   WFP Schroder Fixed Income\n",
-      "Performance fee and cost  -  411062815  truth -  0  and generated -  15.38   Perpetual Ausbil Australian Emerg Ldrs\n",
-      "Performance fee and cost  -  411062815  truth -  0.03  and generated -  0.12   WFP Macquarie Income Opportunities\n",
-      "Performance fee and cost  -  411062815  generated is null and  truth is -  0 WFP Diversified Income\n",
-      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.14 \n",
-      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.67 Telstra Property Pension\n",
-      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.01 Telstra Cash Pension\n",
-      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.01 Telstra Australian shares Pension\n",
-      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.14 Telstra Defensive growth Pension\n",
-      "Performance fee and cost  -  412778803  generated is null and  truth is -  0.01 Telstra International shares Pension\n",
-      "Performance fee and cost  -  414751292  truth -  0.24  and generated -  0   Platinum Global Fund (Long Only)\n",
-      "Performance fee and cost  -  414751292  truth -  0.15  and generated -  0   \n",
-      "Performance fee and cost  -  414751292  truth -  0.03  and generated -  0   Platinum International Brands Fund\n",
-      "Performance fee and cost  -  414751292  truth -  0.86  and generated -  0   Platinum International Healthcare\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 \n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Ausbil Aus. Emrging Leaders\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Investors Mutual Aus. Shre\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Macquarie Inc Opportunities\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Global Share Fund\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPF - Hedged Global Share Fund\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - Hedged Global Share Fund\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - IncomeBuilder\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPF - PIMCO Div. Fixed Interest\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPF - PIMCO Global Bond Fund\n",
-      "Performance fee and cost  -  420339794  generated is null and  truth is -  0 MLC MKPFPR - PIMCO Global Bond Fund\n",
-      "Performance fee and cost  -  446324179  generated is null and  truth is -  0.28 Lifeplan Investment Bond - Allan Gray Australian Equity Fund\n",
-      "Performance fee and cost  -  446324179  generated is null and  truth is -  0.05 Lifeplan MLC Horizon 2-Capital Stable Open\n",
-      "Performance fee and cost  -  454036250  generated is null and  truth is -    \n",
-      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Global Value Trust -Active ETF\n",
-      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Australia Core Equity Trust - Active ETF\n",
-      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Australian Value Trust - Active ETF\n",
-      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n",
-      "Performance fee and cost  -  530101994  truth is null and generated -  0 Dimensional Global Core Equity Tr\n",
-      "Performance fee and cost  -  550769189  truth is null and generated -  0 Acadian Global Managed Volatility Equity - Class A\n",
-      "Performance fee and cost  -  550522985  truth is null and generated -  0 RQI Global Value – Class A\n",
-      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock Australian Fixed Interest Index\n",
-      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock Australian Equity Index\n",
-      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP Generations - Alliance Capital Cash Management\n",
-      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock Property Securities Index\n",
-      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock International Equity Index (Unhedged)\n",
-      "Performance fee and cost  -  539266893  generated is null and  truth is -    AMP - Generations - BlackRock International Equity Index (Hedged)\n",
-      "Performance fee and cost  -  539241700  truth -  0.08  and generated -  0.05   North Professional Balanced\n",
-      "Performance fee and cost  -  539241700  truth -  0.06  and generated -  0   North Professional High Growth\n",
-      "Performance fee and cost  -  539241700  truth -  0.08  and generated -  0   North Professional Conservative\n",
-      "Performance fee and cost  -  539241700  truth -  0.08  and generated -  0   North Professional Growth\n",
-      "Performance fee and cost  -  539241700  truth -  0.09  and generated -  0   North Professional Moderately Conservative\n",
-      "Performance fee and cost  -  539261734  truth -  0.01  and generated -  0   ipac life choices Income Generator\n",
-      "Performance fee and cost  -  539261734  truth -  0.06  and generated -  0   ipac life choices Active 100\n",
-      "Performance fee and cost  -  539261734  truth -  0.08  and generated -  0   ipac life choices Active 85\n",
-      "Performance fee and cost  -  539261734  truth -  0.01  and generated -  0   ipac life choices Index 50\n",
-      "Performance fee and cost  -  539261734  truth -  0.09  and generated -  0   ipac life choices Active 50\n",
-      "Performance fee and cost  -  539261734  truth -  0.08  and generated -  0   ipac life choices Active 70\n",
-      "Performance fee and cost  -  506913190  generated is null and  truth is -  0.03 FC W Pen-CFS TTR Moderate\n",
-      "Performance fee and cost  -  506913190  generated is null and  truth is -  0.04 FC W Pen-CFS TTR Growth\n",
-      "Performance fee and cost  -  506913190  generated is null and  truth is -  0.47 \n",
-      "Performance fee and cost  -  553449663  truth -  0  and generated -  0.07   AMP Capital Specialist International Share (Hedged) Fund\n",
-      "Performance fee and cost  -  539266874  truth -  0.03  and generated -  0   SUMMIT Select - Active High Growth Units\n",
-      "Performance fee and cost  -  539266874  truth -  0.05  and generated -  0   SUMMIT Select - Active Moderately Defensive\n",
-      "Performance fee and cost  -  539266874  truth -  0.05  and generated -  0   SUMMIT Select - Active Growth Units\n",
-      "Performance fee and cost  -  539266874  truth -  0.05  and generated -  0   SUMMIT Select - Active Balanced\n",
-      "Performance fee and cost  -  539266874  truth -  0.06  and generated -  0   SUMMIT Select - Active Defensive Units\n",
-      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Active High Growth\n",
-      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Active Moderately Defensive\n",
-      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Active Growth\n",
-      "Performance fee and cost  -  539266880  truth -  0.01  and generated -  0   North Multi-manager Balanced\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Future Goals BTFM\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BTFM Asian Share\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT International Share BTFM\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Smaller Companies BTFM\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Investment Funds - BT TIME Fund\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT European Share Growth\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT American Share Growth\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 BT Imputation Share BTFM\n",
-      "Performance fee and cost  -  526200514  generated is null and  truth is -  0 \n",
-      "Performance fee and cost  -  521606755  truth is null and generated -  0 CFS Index Diversified\n",
-      "Performance fee and cost  -  557526129  truth is null and generated -  0 Fortlake Real-Income Fund\n",
-      "Performance fee and cost  -  540028470  truth is null and generated -  0 CFS Wholesale Index Australian Share\n",
-      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n",
-      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Australian Value Trust - Active ETF\n",
-      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Global Value Trust -Active ETF\n",
-      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Australia Core Equity Trust - Active ETF\n",
-      "Performance fee and cost  -  531373053  truth is null and generated -  0 Dimensional Global Small Company Trust\n",
-      "Performance fee and cost  -  557362553  truth is null and generated -  0 JPMorgan Global Select Equity Fund\n",
-      "Performance fee and cost  -  527969661  truth is null and generated -  0 JPMorgan Global Equity Premium Income (Hedged) Complex ETF\n",
-      "Performance fee and cost  -  384508026  generated is null and  truth is -  0 Mercer Multi-manager High Growth Fund\n",
-      "Performance fee and cost  -  384508026  generated is null and  truth is -  0 Mercer Multi-manager Growth Fund\n",
-      "Performance fee and cost  -  384508026  generated is null and  truth is -  0 \n",
-      "total -  452.72727272727275\n",
+      "Document List File -  None\n",
      "Metric                                            \tPrecision \tRecall    \tAccuracy  \tF1-Score  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
-      "Management Fee and Costs                          \t0.8790    \t0.9250    \t0.8213    \t0.9014    \t494       \t407       \t2         \t56        \t33        \n",
-      "Management Fee                                    \t0.8985    \t0.9265    \t0.8394    \t0.9123    \t494       \t416       \t2         \t47        \t33        \n",
-      "Performance fee and cost                          \t0.7871    \t0.8472    \t0.7791    \t0.8161    \t327       \t244       \t144       \t66        \t44        \n",
-      "Interposed vehicle Performance fee and Costs      \t0.5000    \t1.0000    \t0.9237    \t0.6667    \t39        \t38        \t422       \t38        \t0         \n",
-      "Administration Fee and costs                      \t0.9787    \t0.9388    \t0.9839    \t0.9583    \t98        \t92        \t398       \t2         \t6         \n",
-      "Total Annual Dollar Based Charges                 \t0.8165    \t1.0000    \t0.9598    \t0.8990    \t90        \t89        \t389       \t20        \t0         \n",
-      "Buy Spread                                        \t0.8957    \t0.8910    \t0.8394    \t0.8933    \t405       \t335       \t83        \t39        \t41        \n",
-      "Sell Spread                                       \t0.9064    \t0.8921    \t0.8474    \t0.8992    \t405       \t339       \t83        \t35        \t41        \n",
-      "Minimum Initial Investment                        \t0.8571    \t0.9671    \t0.8815    \t0.9088    \t310       \t294       \t145       \t49        \t10        \n",
-      "Benchmark                                         \t0.6402    \t0.8582    \t0.8233    \t0.7333    \t173       \t121       \t289       \t68        \t20        \n",
-      "TOTAL                                             \t0.8159    \t0.9246    \t0.8699    \t0.8588    \t2835      \t2375      \t1957      \t420       \t228       \n",
-      "Total Funds Matched - 498\n",
-      "Total Funds Not Matched - 28\n",
-      "Percentage of Funds Matched - 94.67680608365019\n"
+      "management_fee_and_costs                          \t0.8907    \t0.9513    \t0.8525    \t0.9200    \t457       \t391       \t2         \t48        \t20        \n",
+      "management_fee                                    \t0.9043    \t0.9520    \t0.8655    \t0.9276    \t457       \t397       \t2         \t42        \t20        \n",
+      "performance_fee_costs                             \t0.8408    \t0.8556    \t0.8113    \t0.8482    \t303       \t243       \t131       \t46        \t41        \n",
+      "interposed_vehicle_performance_fee_cost           \t0.6316    \t1.0000    \t0.9393    \t0.7742    \t49        \t48        \t385       \t28        \t0         \n",
+      "administration_fees                               \t0.9767    \t0.9655    \t0.9892    \t0.9711    \t87        \t84        \t372       \t2         \t3         \n",
+      "total_annual_dollar_based_charges                 \t0.8350    \t1.0000    \t0.9631    \t0.9101    \t87        \t86        \t358       \t17        \t0         \n",
+      "buy_spread                                        \t0.9059    \t0.9258    \t0.8655    \t0.9158    \t391       \t337       \t62        \t35        \t27        \n",
+      "sell_spread                                       \t0.9113    \t0.9262    \t0.8698    \t0.9187    \t391       \t339       \t62        \t33        \t27        \n",
+      "minimum_initial_investment                        \t0.9463    \t0.9814    \t0.9479    \t0.9635    \t329       \t317       \t120       \t18        \t6         \n",
+      "benchmark_name                                    \t0.7444    \t0.8701    \t0.8568    \t0.8024    \t172       \t134       \t261       \t46        \t20        \n",
+      "TOTAL                                             \t0.8587    \t0.9428    \t0.8961    \t0.8951    \t2723      \t2376      \t1755      \t315       \t164       \n",
+      "Total Funds Matched - 461\n",
+      "Total Funds Not Matched - 125\n",
+      "Percentage of Funds Matched - 78.66894197952219\n",
+      "All Providers Results: \n",
+      "Document List File -  ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
+      "Metric                                            \tPrecision \tRecall    \tAccuracy  \tF1-Score  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
+      "management_fee_and_costs                          \t0.8960    \t0.9451    \t0.8516    \t0.9199    \t180       \t155       \t0         \t18        \t9         \n",
+      "management_fee                                    \t0.9017    \t0.9455    \t0.8571    \t0.9231    \t180       \t156       \t0         \t17        \t9         \n",
+      "performance_fee_costs                             \t0.8000    \t0.8261    \t0.8077    \t0.8128    \t94        \t76        \t71        \t19        \t16        \n",
+      "interposed_vehicle_performance_fee_cost           \t0.5273    \t1.0000    \t0.8571    \t0.6905    \t30        \t29        \t127       \t26        \t0         \n",
+      "administration_fees                               \t1.0000    \t0.3333    \t0.9890    \t0.5000    \t3         \t1         \t179       \t0         \t2         \n",
+      "buy_spread                                        \t0.9643    \t0.9419    \t0.9121    \t0.9529    \t176       \t162       \t4         \t6         \t10        \n",
+      "sell_spread                                       \t0.9702    \t0.9422    \t0.9176    \t0.9560    \t176       \t163       \t4         \t5         \t10        \n",
+      "minimum_initial_investment                        \t0.9137    \t0.9549    \t0.9011    \t0.9338    \t139       \t127       \t37        \t12        \t6         \n",
+      "benchmark_name                                    \t0.7188    \t0.8734    \t0.7967    \t0.7886    \t91        \t69        \t76        \t27        \t10        \n",
+      "TOTAL                                             \t0.7692    \t0.7762    \t0.8885    \t0.7478    \t1069      \t938       \t679       \t131       \t236       \n",
+      "Total Funds Matched - 182\n",
+      "Total Funds Not Matched - 24\n",
+      "Percentage of Funds Matched - 88.3495145631068\n",
+      "All Providers Results: \n",
+      "Document List File -  ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
+      "Metric                                            \tPrecision \tRecall    \tAccuracy  \tF1-Score  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
+      "management_fee_and_costs                          \t0.8872    \t0.9555    \t0.8530    \t0.9201    \t277       \t236       \t2         \t30        \t11        \n",
+      "management_fee                                    \t0.9060    \t0.9563    \t0.8710    \t0.9305    \t277       \t241       \t2         \t25        \t11        \n",
+      "performance_fee_costs                             \t0.8608    \t0.8698    \t0.8136    \t0.8653    \t209       \t167       \t60        \t27        \t25        \n",
+      "interposed_vehicle_performance_fee_cost           \t0.9048    \t1.0000    \t0.9928    \t0.9500    \t19        \t19        \t258       \t2         \t0         \n",
+      "administration_fees                               \t0.9765    \t0.9881    \t0.9892    \t0.9822    \t84        \t83        \t193       \t2         \t1         \n",
+      "total_annual_dollar_based_charges                 \t0.8431    \t1.0000    \t0.9427    \t0.9149    \t87        \t86        \t177       \t16        \t0         \n",
+      "buy_spread                                        \t0.8578    \t0.9115    \t0.8351    \t0.8838    \t215       \t175       \t58        \t29        \t17        \n",
+      "sell_spread                                       \t0.8627    \t0.9119    \t0.8387    \t0.8866    \t215       \t176       \t58        \t28        \t17        \n",
+      "minimum_initial_investment                        \t0.9694    \t1.0000    \t0.9785    \t0.9845    \t190       \t190       \t83        \t6         \t0         \n",
+      "benchmark_name                                    \t0.7738    \t0.8667    \t0.8961    \t0.8176    \t81        \t65        \t185       \t19        \t10        \n",
+      "TOTAL                                             \t0.8842    \t0.9460    \t0.9011    \t0.9136    \t1654      \t1438      \t1076      \t184       \t328       \n",
+      "Total Funds Matched - 279\n",
+      "Total Funds Not Matched - 101\n",
+      "Percentage of Funds Matched - 73.42105263157895\n"
     ]
    }
   ],
@ -499,7 +420,9 @@
    "from collections import defaultdict\n",
    "import pandas as pd\n",
    "import statistics\n",
-    "\n",
+    "import os\n",
+    "import re\n",
+    "from utils.similarity import Similarity\n",
    "\n",
    "funds_matched = 0\n",
    "funds_not_matched = 0\n",
@ -519,7 +442,7 @@
    "    return headers, data\n",
    "\n",
    "def index_data_by_key(data, key_index, secondary_key_index, header):\n",
-    "    \"\"\"Index data by primary and secondary keys (doc_id and fund_name).\"\"\"\n",
+    "    \"\"\"Index data by primary and secondary keys (doc_id and sec_name).\"\"\"\n",
    "    indexed_data = defaultdict(dict)\n",
    "    \n",
    "    for row in data:\n",
@ -528,7 +451,8 @@
    "        for i in range(len(row)):\n",
    "            if header[i] == \"doc_id\":\n",
    "                primary_key = int(row[i])\n",
-    "            elif header[i] == \"fund_name\":\n",
+    "            elif header[i] == \"sec_name\":\n",
+    "                # The share class name (sec_name) is the level at which records are compared, so it serves as the secondary key\n",
    "                secondary_key = str(row[i])\n",
    "            else:\n",
    "                row_data[header[i]] = convert_if_number(row[i])\n",
@ -549,7 +473,7 @@
    "    value1 = convert_if_number(value1)\n",
    "    value2 = convert_if_number(value2)\n",
    "    return value1 == value2\n",
-    "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n",
+    "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):\n",
    "    \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n",
    "    results = {}\n",
    "    funds_matched, funds_not_matched = 0, 0\n",
@ -566,11 +490,15 @@
    "    # Iterate over the generated results instead of the ground truth\n",
    "    \n",
    "    total = 0\n",
-    "    for doc_id, funds in ground_truth.items():\n",
+    "    message_list = []\n",
+    "    # print(document_list)\n",
+    "    for doc_id, secs in ground_truth.items():\n",
+    "        if document_list is not None and str(doc_id) not in document_list:\n",
+    "            continue\n",
    "        if doc_id in generated_results:\n",
-    "            for fund_name, truth_values in funds.items():\n",
-    "                if fund_name in generated_results[doc_id]:\n",
-    "                    generated_values = generated_results[doc_id][fund_name]\n",
+    "            for sec_name, truth_values in secs.items():\n",
+    "                if sec_name in generated_results[doc_id]:\n",
+    "                    generated_values = generated_results[doc_id][sec_name]\n",
    "                    # Compare all other columns\n",
    "                    for i in intersection_list:\n",
    "                        for keys in imp_datapoints:\n",
@ -581,52 +509,56 @@
    "                                        results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
    "                                    else:\n",
    "                                        results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
-    "                                        if \"Performance fee and cost\" in keys:\n",
-    "                                            debug = 0\n",
-    "                                            print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], fund_name)                                        \n",
+    "                                        # if \"Performance fee and cost\" in keys:\n",
+    "                                        debug = 0\n",
+    "                                        # print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], sec_name)         \n",
+    "                                        message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is null and generated is not null\"}\n",
+    "                                        message_list.append(message)                               \n",
    "                                else:\n",
    "                                    if truth_values[i] == generated_values[i]:\n",
    "                                        results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
    "                                    elif generated_values[i] != \"\":\n",
-    "                                        results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
-    "                                        if \"Performance fee and cost\" in keys:\n",
+    "                                        if i == \"benchmark_name\" and compare_text(truth_values[i], generated_values[i]):\n",
+    "                                            results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
+    "                                        else:\n",
+    "                                            results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
+    "                                            # if \"Performance fee and cost\" in keys:\n",
    "                                            debug = 0\n",
-    "                                            print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", fund_name)\n",
+    "                                            # print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", sec_name)\n",
+    "                                            message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is not equal with generated\"}\n",
+    "                                            message_list.append(message)\n",
    "                                    else:\n",
    "                                        results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
-    "                                        if \"Performance fee and cost\" in keys:\n",
-    "                                            debug = 0\n",
-    "                                            print(keys, \" - \" , doc_id, \" generated is null and  truth is - \", truth_values[i], fund_name)\n",
+    "                                        # if \"Performance fee and cost\" in keys:\n",
+    "                                        debug = 0\n",
+    "                                        # print(keys, \" - \" , doc_id, \" generated is null and  truth is - \", truth_values[i], sec_name)\n",
+    "                                        message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Generated is null and truth is not null\"}\n",
+    "                                        message_list.append(message)\n",
    "                                    results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
-    "\n",
-    "\n",
-    "                                # if truth_values[i] == generated_values[i] and truth_values[i] == \"\":\n",
-    "                                #     results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
-    "                                # elif truth_values[i] == generated_values[i]:\n",
-    "                                #     results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
-    "                                # elif truth_values[i] != \"\" and generated_values[i] == \"\":\n",
-    "                                #     results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
-    "                                # elif truth_values[i] == \"\" and generated_values[i] != \"\":\n",
-    "                                #     results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
-    "                                # else:\n",
-    "                                #     results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
-    "                                # if truth_values[i] != \"\":\n",
-    "                                #     results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
    "                    funds_matched += 1\n",
    "                else:\n",
    "                    funds_not_matched += 1\n",
-    "                    # for keys in headers:\n",
-    "                    #     if keys != \"doc_id\":\n",
-    "                    #         results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
    "        else:\n",
    "            # If the entire document is not found, count all funds as not matched\n",
-    "            funds_not_matched += len(funds)\n",
-    "            # for fund_name in funds:\n",
-    "            #     for keys in headers:\n",
-    "            #         if keys != \"doc_id\":\n",
-    "            #             results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
-    "    return results, funds_matched, funds_not_matched\n",
+    "            funds_not_matched += len(secs)\n",
+    "    return results, message_list, funds_matched, funds_not_matched\n",
    "\n",
+    "def clean_text(text: str):\n",
+    "    if text is None or len(text) == 0:\n",
+    "        return text\n",
+    "    text = re.sub(r\"\\W\", \" \", text)\n",
+    "    text = re.sub(r\"\\s+\", \" \", text)\n",
+    "    return text\n",
+    "\n",
+    "def compare_text(source_text, target_text):\n",
+    "    source_text = clean_text(source_text)\n",
+    "    target_text = clean_text(target_text)\n",
+    "    if source_text == target_text or source_text in target_text or target_text in source_text:\n",
+    "        return True\n",
+    "    similarity = Similarity()\n",
+    "    jacard_score = similarity.jaccard_similarity(source_text.lower().split(), target_text.lower().split())\n",
+    "    if jacard_score > 0.8:\n",
+    "        return True\n",
    "\n",
    "# Load the files\n",
    "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
@ -664,12 +596,15 @@
    "    total_fp = []\n",
    "    #total_fn = []\n",
    "    # Calculate and print metrics for each item\n",
+    "    metrics_list = []\n",
    "    for keys in imp_datapoints:\n",
    "        try:\n",
    "            key = imp_datapoints_mapping[keys]\n",
    "            values = data[key]\n",
    "            tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
    "            precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
+    "            metrics = {\"Datapoint\": keys, \"F1-Score\": f1_score, \"Precision\": precision, \"Recall\": recall, \"Accuracy\": accuracy, \"SUPPORT\": values[\"SUPPORT\"], \"TP\": tp, \"TN\": tn, \"FP\": fp, \"FN\": fn}\n",
+    "            metrics_list.append(metrics)\n",
    "            total_precision.append(precision)\n",
    "            total_recall.append(recall)\n",
    "            total_accuracy.append(accuracy)\n",
@ -681,10 +616,22 @@
    "            total_fn.append(fn)\n",
    "\n",
    "            if values[\"SUPPORT\"] > 0 and key > \"\":\n",
-    "                print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(keys, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
+    "                print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(key, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
    "        except:\n",
    "            pass\n",
-    "    print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", statistics.mean(total_precision), statistics.mean(total_recall), statistics.mean(total_accuracy), statistics.mean(total_f1_score), sum(total_support), sum(total_tp), sum(total_tn), sum(total_fp), sum(total_fn)))\n",
+    "    total_mean_precision = statistics.mean(total_precision)\n",
+    "    total_mean_recall = statistics.mean(total_recall)\n",
+    "    total_mean_accuracy = statistics.mean(total_accuracy)\n",
+    "    total_mean_f1_score = statistics.mean(total_f1_score)\n",
+    "    total_sum_support = sum(total_support)\n",
+    "    total_sum_tp = sum(total_tp)\n",
+    "    total_sum_tn = sum(total_tn)\n",
+    "    total_sum_fp = sum(total_fp)\n",
+    "    total_sum_fn = sum(total_fn)\n",
+    "    total_metrics = {\"Datapoint\": \"TOTAL\", \"F1-Score\": total_mean_f1_score, \"Precision\": total_mean_precision, \"Recall\": total_mean_recall, \"Accuracy\": total_mean_accuracy, \"SUPPORT\": total_sum_support, \"TP\": total_sum_tp, \"TN\": total_sum_tn, \"FP\": total_sum_fp, \"FN\": total_sum_fn}\n",
+    "    metrics_list.append(total_metrics)\n",
+    "    print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", total_mean_precision, total_mean_recall, total_mean_accuracy, total_mean_f1_score, total_sum_support, total_sum_tp, total_sum_tn, total_sum_fp, total_sum_fn))\n",
+    "    return metrics_list\n",
    "    \n",
    "def create_metrics_df(data):\n",
    "    # Define a list to hold data for DataFrame\n",
@ -771,14 +718,45 @@
    "\n",
    "print(\"\\n\")\n",
    "print(\"\\n\")\n",
-    "print(\"All Providers Results: \")\n",
-    "comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
+    "document_list_file_list = [None, \n",
+    "                           \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n",
+    "                           \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n",
+    "for document_list_file in document_list_file_list:\n",
+    "    document_list = None\n",
+    "    if document_list_file is not None:\n",
+    "        with open(document_list_file, \"r\", encoding=\"utf-8\") as f:\n",
+    "            document_list = f.readlines()\n",
+    "            document_list = [doc_id.strip() for doc_id in document_list]\n",
+    "    \n",
+    "    print(\"All Providers Results: \")\n",
+    "    print(\"Document List File - \", document_list_file)\n",
+    "    comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, \n",
+    "                                                                                      generated_results_indexed, \n",
+    "                                                                                      headers_gt, doc_id_index, \n",
+    "                                                                                      fund_name_index, \n",
+    "                                                                                      intersection_list,\n",
+    "                                                                                      funds_matched, \n",
+    "                                                                                      funds_not_matched,\n",
+    "                                                                                      document_list)\n",
+    "    metrics_list = print_metrics_table(comparison_results)\n",
+    "    print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
+    "    print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
    "\n",
-    "print_metrics_table(comparison_results)\n",
-    "print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
-    "print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
+    "    metrics_df = pd.DataFrame(metrics_list)\n",
+    "    message_df = pd.DataFrame(message_list)\n",
    "\n",
-    "\n"
+    "    output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n",
+    "    if os.path.exists(output_metrics_folder):\n",
+    "        generated_file_base_name = os.path.basename(path_generated_results).replace(\".xlsx\", \"\")\n",
+    "        metrics_file_name = f\"metrics_{generated_file_base_name}\"\n",
+    "        if document_list_file is not None:\n",
+    "            metrics_file_name = f\"{metrics_file_name}_{len(document_list)}_documents.xlsx\"\n",
+    "        else:\n",
+    "            metrics_file_name = f\"{metrics_file_name}_all_documents.xlsx\"\n",
+    "        metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)\n",
+    "        with pd.ExcelWriter(metrics_file_path) as writer:\n",
+    "            metrics_df.to_excel(writer, sheet_name=\"metrics_data\", index=False)\n",
+    "            message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n"
   ]
  },
  {
@ -833,7 +811,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.6"
  },
  "orig_nbformat": 4
 },
--- a/utils/pdf_download.py
+++ b/utils/pdf_download.py
@ -41,11 +41,17 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
            ACCESS_KEY = os.getenv('ACCESS_KEY')
            SECRET_KEY = os.getenv('SECRET_KEY')
            AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
-            s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), 
+            if AWS_SESSION_TOKEN:
+                s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), 
                            aws_access_key_id=ACCESS_KEY,
                            aws_secret_access_key=SECRET_KEY,
                            aws_session_token=AWS_SESSION_TOKEN
                            )
+            else:
+                s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), 
+                                aws_access_key_id=ACCESS_KEY,
+                                aws_secret_access_key=SECRET_KEY
+                                )
        else:
            s3 = boto3.client('s3')