From c71936c5ff0d12217a1e7280fb835ba2c88c1972 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Tue, 18 Mar 2025 17:22:21 -0500
Subject: [PATCH] 1. Optimize benchmark_name instructions. 2. Handle documents
 that contain the same raw fund name multiple times: do not remove the matched
 db name from unmatched_db_list when a raw fund/share name is matched.
 Otherwise, some raw names can no longer be matched to a db name.

---
 calc_metrics.py                               | 32 ++++---
 core/auz_nz/hybrid_solution_script.py         | 68 ++++++++------
 core/data_extraction.py                       |  2 +-
 .../data_extraction_prompts_config.json       | 18 +++-
 main.py                                       |  2 +-
 performance.ipynb                             | 92 +++++++++----------
 6 files changed, 123 insertions(+), 91 deletions(-)

diff --git a/calc_metrics.py b/calc_metrics.py
index bb286fc..dbde368 100644
--- a/calc_metrics.py
+++ b/calc_metrics.py
@@ -1376,8 +1376,8 @@ def clean_text(text: str):
 
 
 def merge_inference_data():
-    file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi.xlsx"
-    file2 = r"/data/aus_prospectus/output/merged_data/docs/excel/merged_420339794.xlsx"
+    file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx"
+    file2 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_3_documents_by_text_20250318171348.xlsx"
     columns = [
         "doc_id",
         "effective_date",
@@ -1404,20 +1404,30 @@ def merge_inference_data():
         "change_recoverable_expenses"
     ]
     
-    file1_data_df = pd.read_excel(file1, sheet_name="total_mapping_data")
-    file1_data_df = file1_data_df[columns]
+    document_id_list = [384508026, 521606755, 544886057]
+    file1_total_mapping_data_df = pd.read_excel(file1, sheet_name="total_mapping_data")
+    file1_total_mapping_data_df = file1_total_mapping_data_df[columns]
     # remove the rows which doc_id is 420339794 from file1_data_df
-    file1_data_df = file1_data_df[file1_data_df["doc_id"] != 420339794]
+    file1_total_mapping_data_df = file1_total_mapping_data_df[~(file1_total_mapping_data_df["doc_id"].isin(document_id_list))]
+    
+    file1_extract_data_df = pd.read_excel(file1, sheet_name="extract_data")
+    file1_extract_data_df = file1_extract_data_df[~(file1_extract_data_df["doc_id"].isin(document_id_list))]
+    
+    file2_total_mapping_data_df = pd.read_excel(file2, sheet_name="total_mapping_data")
+    file2_total_mapping_data_df = file2_total_mapping_data_df[columns]
+    total_mapping_data_df = pd.concat([file1_total_mapping_data_df, file2_total_mapping_data_df])
+    total_mapping_data_df.reset_index(drop=True, inplace=True)
+    
+    file2_extract_data_df = pd.read_excel(file2, sheet_name="extract_data")
+    total_extract_data_df = pd.concat([file1_extract_data_df, file2_extract_data_df])
+    total_extract_data_df.reset_index(drop=True, inplace=True)
     
-    file2_data_df = pd.read_excel(file2, sheet_name="merged_data")
-    file2_data_df = file2_data_df[columns]
-    total_data_df = pd.concat([file1_data_df, file2_data_df])
-    total_data_df.reset_index(drop=True, inplace=True)
     
     output_folder = r"/data/aus_prospectus/output/mapping_data/total/"
-    output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250317_Ravi_modified.xlsx")
+    output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250318124530_new.xlsx")
     with pd.ExcelWriter(output_file) as f:
-        total_data_df.to_excel(f, index=False, sheet_name="total_mapping_data")
+        total_mapping_data_df.to_excel(f, index=False, sheet_name="total_mapping_data")
+        total_extract_data_df.to_excel(f, index=False, sheet_name="total_extract_data")
     
     
 
diff --git a/core/auz_nz/hybrid_solution_script.py b/core/auz_nz/hybrid_solution_script.py
index 9c2a603..64693ca 100644
--- a/core/auz_nz/hybrid_solution_script.py
+++ b/core/auz_nz/hybrid_solution_script.py
@@ -460,8 +460,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                                             step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2],
                                             step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], 
                                             llm_flag=False))
-                if db_list[matched_index] in unmatched_db_list:
-                    unmatched_db_list.remove(db_list[matched_index])
+                # if db_list[matched_index] in unmatched_db_list:
+                #     unmatched_db_list.remove(db_list[matched_index])
                 # unmatched_db_list.remove(db_list[matched_index])
                 if pred_list[index] in unmatched_pred_list:
                     unmatched_pred_list.remove(pred_list[index])
@@ -485,8 +485,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                                             step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1,
                                             step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2],
                                             step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False))
-                    if db_list[matched_index] in unmatched_db_list:
-                        unmatched_db_list.remove(db_list[matched_index])
+                    # if db_list[matched_index] in unmatched_db_list:
+                    #     unmatched_db_list.remove(db_list[matched_index])
                     # unmatched_db_list.remove(db_list[matched_index])
                     if pred_list[index] in unmatched_pred_list:
                         unmatched_pred_list.remove(pred_list[index])
@@ -513,8 +513,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                                             step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2,
                                             step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2],
                                             step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False))
-                        if db_list[matched_index] in unmatched_db_list:
-                            unmatched_db_list.remove(db_list[matched_index])
+                        # if db_list[matched_index] in unmatched_db_list:
+                        #     unmatched_db_list.remove(db_list[matched_index])
                         # unmatched_db_list.remove(db_list[matched_index])
                         if pred_list[index] in unmatched_pred_list:
                             unmatched_pred_list.remove(pred_list[index])
@@ -543,8 +543,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                                     step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3,
                                     step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2],
                                     step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False))
-                            if db_list[matched_index] in unmatched_db_list:
-                                unmatched_db_list.remove(db_list[matched_index])
+                            # if db_list[matched_index] in unmatched_db_list:
+                            #     unmatched_db_list.remove(db_list[matched_index])
                             # unmatched_db_list.remove(db_list[matched_index])
                             if pred_list[index] in unmatched_pred_list:
                                 unmatched_pred_list.remove(pred_list[index])
@@ -585,8 +585,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                                 # print("unmatched_pred_list: ",unmatched_pred_list)
                                 # print("db_list[matched_index]: ",db_list[matched_index])
                                 # print("pred_list[index]: ",pred_list[index])
-                                if db_list[matched_index] in unmatched_db_list:
-                                    unmatched_db_list.remove(db_list[matched_index])
+                                # if db_list[matched_index] in unmatched_db_list:
+                                #     unmatched_db_list.remove(db_list[matched_index])
                                 # unmatched_db_list.remove(db_list[matched_index])
                                 if pred_list[index] in unmatched_pred_list:
                                     unmatched_pred_list.remove(pred_list[index])
@@ -663,20 +663,27 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                 # print("k: ",k)
                 # print("v: ",v)
                 og_db_index=-1
-                og_pred_index = -1
+                # og_pred_index = -1
+                og_pred_index_list = []
                 if k in cleaned_unmatched_pred_list:
-                    og_pred_index = cleaned_unmatched_pred_list.index(k)
+                    for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
+                        if c_item==k:
+                            og_pred_index_list.append(c_idx)
+                    # og_pred_index = cleaned_unmatched_pred_list.index(k)
                 
-                if og_pred_index == -1:
+                if len(og_pred_index_list) == 0:
                     # sometimes, the raw name and db name reversed from the LLM response
                     if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list:
-                        og_pred_index = cleaned_unmatched_pred_list.index(v)
+                        for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
+                            if c_item==v:
+                                og_pred_index_list.append(c_idx)
+                        # og_pred_index = cleaned_unmatched_pred_list.index(v)
                         og_db_index = cleaned_unmatched_db_list.index(k)
                         # v and k are swapped
                         temp = v
                         v = k
                         k = temp
-                if og_pred_index==-1:
+                if len(og_pred_index_list)==0:
                     continue
                 # og_db_index = cleaned_unmatched_db_list.index(v)
                 if og_db_index == -1 and v in cleaned_unmatched_db_list:
@@ -685,21 +692,22 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                 # print("unmatched_db_list: ",unmatched_db_list)
                 
                 for i in df_data:
-                    if i['pred_fund']==unmatched_pred_list[og_pred_index]:
-                        if og_db_index!=-1:
-                            i['db_fund']=unmatched_db_list[og_db_index]
-                            i['cleaned_db_fund_name'] = v
-                            final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
-                        else:
-                            i['db_fund'] = ''
-                            i['cleaned_db_fund_name'] = ''
-                            final_result.update({unmatched_pred_list[og_pred_index]:""})
-                        i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
-                        i['llm_clean_db_list'] = cleaned_unmatched_db_list,
-                        i['llm_pred_fund'] = k
-                        i['llm_matched_db_name'] = v
-                        i['llm_result'] = llm_result
-                        break
+                    for og_pred_index in og_pred_index_list:
+                        if i['pred_fund']==unmatched_pred_list[og_pred_index]:
+                            if og_db_index!=-1:
+                                i['db_fund']=unmatched_db_list[og_db_index]
+                                i['cleaned_db_fund_name'] = v
+                                final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
+                            else:
+                                i['db_fund'] = ''
+                                i['cleaned_db_fund_name'] = ''
+                                final_result.update({unmatched_pred_list[og_pred_index]:""})
+                            i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
+                            i['llm_clean_db_list'] = cleaned_unmatched_db_list,
+                            i['llm_pred_fund'] = k
+                            i['llm_matched_db_name'] = v
+                            i['llm_result'] = llm_result
+                            break
                         
                 
         # break
diff --git a/core/data_extraction.py b/core/data_extraction.py
index b470792..b3c83c8 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -793,7 +793,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num not in [4, 5]:
+            # if page_num not in [14]:
             #     continue
             if page_num in handled_page_num_list:
                 continue
diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json
index e1db089..38c5bac 100644
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@@ -555,15 +555,23 @@
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
 				"---Example 3 Start---",
-				"\n\nFund name \nComposite benchmark \nCFS Select High \nGrowth \n1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% \nMSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% \nMSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small \nOrdinaries Index. ",
+				"Composite benchmarks \n\nThe objective for some funds includes a reference to a composite benchmark. They may be subject to change \nat any time within the allocation ranges. \n\nFund name \nComposite benchmark \nCFS Defensive \nBuilder \n4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core \nWholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index \n(Unfrozen) – Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate \nIndex (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg \nAusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index. \nCFS Growth Builder \n29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, \n5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD \nhedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE \nDeveloped Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure \nIndex (Unfrozen) – Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate \nCorporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index. \n",
 				"---Example 3 End---",
+				"For this example, there are multiple fund names with multiple benchmark names with weightings, please extract them all.",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"CFS Select High Growth\", \"benchmark_name\": \"1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% MSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small Ordinaries Index\"}]}",
+				"{\"data\": [{\"fund name\": \"CFS Defensive Builder\", \"benchmark_name\": \"4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) – Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg AusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index\"}, {\"fund name\": \"CFS Growth Builder\", \"benchmark_name\": \"29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD hedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE Developed Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) – Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index\"}]}",
 				"---Example 4 Start---",
 				"\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nAusbil Australian \nEmerging Leaders \n15.375% \n(maximum 15% ex-GST) \nComposite benchmark (70% S&P/ASX Midcap 50 \nAccumulation Index and 30% S&P/ASX Small Ordinaries \nAccumulation Index) plus 0.85% pa \nMonthly \n",
 				"---Example 4 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
+				"---Example 5 Start---",
+				"Fixed interest / Income \nCash \nUBS Diversified Fixed Income BT Cash Management Trust \nBenchmark \n50% Bloomberg Barclays Global \nAggregate Index (A$ hedged), 50% \nBloomberg AusBond Composite \n0+ Yr Index* \nBloomberg AusBond Bank Bill Index* \n",
+				"---Example 5 End---",
+				"For this example, please read carefully for fund names in same line: \"UBS Diversified Fixed Income BT Cash Management Trust\", there are 2 fund names: \"UBS Diversified Fixed Income\" and \"BT Cash Management Trust\".",
+				"There are 2 benchmark names: \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\" and \"Bloomberg AusBond Bank Bill Index\".",
+				"The output should be:",
+				"{\"data\": [{\"fund name\": \"UBS Diversified Fixed Income\", \"benchmark_name\": \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\"}, {\"fund name\": \"BT Cash Management Trust\", \"benchmark_name\": \"Bloomberg AusBond Bank Bill Index\"}]}",
 				"\n",
 				"C. Don't extract benchmark name from context when fit below cases.",
 				"1. Exclude benchmark name when its reported name is \"Return target\".",
@@ -586,6 +594,12 @@
 				"Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ", 
 				"They indicate the desired outcome rather than serving as a comparative measure against market performance.",
 				"The output should be:",
+				"{\"data\": []}",
+				"D. If extracted multiple benchmark names, but without weightings, e.g. 50% or 30%, please ignore and output empty.",
+				"---Example Start---",
+				"This is calculated by using the weighted average of the \nasset allocation neutral position and the index returns for each asset class. \n\nBT Multi-manager Growth Fund, BT Multi-manager Balanced Fund, BT Multi-manager Conservative Fund \nand BT Multi-manager High Growth Fund \n\nAsset class Indices \nAustralian shares S&P/ASX 300 Accumulation Index \nInternational shares MSCI World ex Australia $A (Net Dividends Reinvested) \nMSCI World ex Australia Hedged $A (Net Dividends Reinvested) \nMSCI Emerging Market (Net Dividends Reinvested) in AUD \nAustralian property S&P/ASX 300 A-REIT Accumulation Index \nInternational property FTSE EPRA/NAREIT Developed Hedged in AUD Net TRI \n",
+				"---Example End---",
+				"The output should be:",
 				"{\"data\": []}"
 			]
 		},
diff --git a/main.py b/main.py
index d7b0046..b5197f0 100644
--- a/main.py
+++ b/main.py
@@ -1538,7 +1538,7 @@ if __name__ == "__main__":
         with open(document_sample_file, "r", encoding="utf-8") as f:
             special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
         document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
-        # special_doc_id_list = ["420339794"]
+        # special_doc_id_list = ["521606755", "384508026", "544886057"]
         pdf_folder: str = r"/data/aus_prospectus/pdf/"
         output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
         output_extract_data_child_folder: str = (
diff --git a/performance.ipynb b/performance.ipynb
index 7a1f3d2..a4ab839 100644
--- a/performance.ipynb
+++ b/performance.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -36,14 +36,14 @@
     "\n",
     "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
     "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n",
-    "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx\"\n",
+    "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530_new.xlsx\"\n",
     "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
     "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -316,7 +316,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -330,53 +330,53 @@
       "All Providers Results: \n",
       "Document List File -  None\n",
       "Metric                                            \tF1-Score  \tPrecision \tRecall    \tAccuracy  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
-      "management_fee_and_costs                          \t0.9211    \t0.8861    \t0.9589    \t0.8544    \t409       \t350       \t2         \t45        \t15        \n",
-      "management_fee                                    \t0.9419    \t0.9241    \t0.9605    \t0.8908    \t409       \t365       \t2         \t30        \t15        \n",
-      "performance_fee_costs                             \t0.8987    \t0.9325    \t0.8672    \t0.8714    \t284       \t235       \t124       \t17        \t36        \n",
-      "interposed_vehicle_performance_fee_cost           \t0.9600    \t0.9231    \t1.0000    \t0.9854    \t73        \t72        \t334       \t6         \t0         \n",
-      "administration_fees                               \t0.9853    \t0.9710    \t1.0000    \t0.9951    \t67        \t67        \t343       \t2         \t0         \n",
-      "total_annual_dollar_based_charges                 \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t69        \t69        \t343       \t0         \t0         \n",
-      "buy_spread                                        \t0.9365    \t0.9242    \t0.9491    \t0.8956    \t352       \t317       \t52        \t26        \t17        \n",
-      "sell_spread                                       \t0.9412    \t0.9329    \t0.9496    \t0.9029    \t352       \t320       \t52        \t23        \t17        \n",
-      "minimum_initial_investment                        \t0.9737    \t0.9642    \t0.9834    \t0.9612    \t301       \t296       \t100       \t11        \t5         \n",
-      "benchmark_name                                    \t0.8137    \t0.8295    \t0.7985    \t0.8811    \t144       \t107       \t256       \t22        \t27        \n",
-      "TOTAL                                             \t0.9372    \t0.9288    \t0.9467    \t0.9238    \t2460      \t2198      \t1608      \t182       \t132       \n",
-      "Total Funds Matched - 412\n",
-      "Total Funds Not Matched - 153\n",
-      "Percentage of Funds Matched - 72.9203539823009\n",
+      "management_fee_and_costs                          \t0.9321    \t0.8838    \t0.9859    \t0.8734    \t400       \t350       \t2         \t46        \t5         \n",
+      "management_fee                                    \t0.9516    \t0.9192    \t0.9864    \t0.9082    \t400       \t364       \t2         \t32        \t5         \n",
+      "performance_fee_costs                             \t0.8992    \t0.8821    \t0.9170    \t0.8710    \t275       \t232       \t119       \t31        \t21        \n",
+      "interposed_vehicle_performance_fee_cost           \t0.9600    \t0.9231    \t1.0000    \t0.9851    \t73        \t72        \t325       \t6         \t0         \n",
+      "administration_fees                               \t0.9920    \t0.9841    \t1.0000    \t0.9975    \t62        \t62        \t340       \t1         \t0         \n",
+      "total_annual_dollar_based_charges                 \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t63        \t63        \t340       \t0         \t0         \n",
+      "buy_spread                                        \t0.9339    \t0.9129    \t0.9560    \t0.8933    \t340       \t304       \t56        \t29        \t14        \n",
+      "sell_spread                                       \t0.9388    \t0.9219    \t0.9564    \t0.9007    \t340       \t307       \t56        \t26        \t14        \n",
+      "minimum_initial_investment                        \t0.9694    \t0.9628    \t0.9760    \t0.9553    \t292       \t285       \t100       \t11        \t7         \n",
+      "benchmark_name                                    \t0.9023    \t0.8759    \t0.9302    \t0.9355    \t140       \t120       \t257       \t17        \t9         \n",
+      "TOTAL                                             \t0.9479    \t0.9266    \t0.9708    \t0.9320    \t2385      \t2159      \t1597      \t199       \t75        \n",
+      "Total Funds Matched - 403\n",
+      "Total Funds Not Matched - 162\n",
+      "Percentage of Funds Matched - 71.32743362831859\n",
       "All Providers Results: \n",
       "Document List File -  ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
       "Metric                                            \tF1-Score  \tPrecision \tRecall    \tAccuracy  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
-      "management_fee_and_costs                          \t0.9457    \t0.8970    \t1.0000    \t0.8970    \t164       \t148       \t0         \t17        \t0         \n",
-      "management_fee                                    \t0.9783    \t0.9576    \t1.0000    \t0.9576    \t164       \t158       \t0         \t7         \t0         \n",
-      "performance_fee_costs                             \t0.8263    \t0.8846    \t0.7753    \t0.8242    \t95        \t69        \t67        \t9         \t20        \n",
-      "interposed_vehicle_performance_fee_cost           \t0.9455    \t0.8966    \t1.0000    \t0.9636    \t53        \t52        \t107       \t6         \t0         \n",
-      "administration_fees                               \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t1         \t1         \t164       \t0         \t0         \n",
-      "buy_spread                                        \t0.9812    \t0.9752    \t0.9874    \t0.9636    \t162       \t157       \t2         \t4         \t2         \n",
-      "sell_spread                                       \t0.9876    \t0.9876    \t0.9876    \t0.9758    \t162       \t159       \t2         \t2         \t2         \n",
-      "minimum_initial_investment                        \t0.9569    \t0.9531    \t0.9606    \t0.9333    \t127       \t122       \t32        \t6         \t5         \n",
-      "benchmark_name                                    \t0.7733    \t0.7945    \t0.7532    \t0.7939    \t85        \t58        \t73        \t15        \t19        \n",
-      "TOTAL                                             \t0.9328    \t0.9273    \t0.9405    \t0.9232    \t1013      \t924       \t447       \t66        \t180       \n",
-      "Total Funds Matched - 165\n",
-      "Total Funds Not Matched - 31\n",
-      "Percentage of Funds Matched - 84.18367346938776\n",
+      "management_fee_and_costs                          \t0.9494    \t0.9036    \t1.0000    \t0.9036    \t165       \t150       \t0         \t16        \t0         \n",
+      "management_fee                                    \t0.9753    \t0.9518    \t1.0000    \t0.9518    \t165       \t158       \t0         \t8         \t0         \n",
+      "performance_fee_costs                             \t0.8427    \t0.7979    \t0.8929    \t0.8313    \t96        \t75        \t63        \t19        \t9         \n",
+      "interposed_vehicle_performance_fee_cost           \t0.9455    \t0.8966    \t1.0000    \t0.9639    \t53        \t52        \t108       \t6         \t0         \n",
+      "administration_fees                               \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t2         \t2         \t164       \t0         \t0         \n",
+      "buy_spread                                        \t0.9718    \t0.9568    \t0.9873    \t0.9458    \t163       \t155       \t2         \t7         \t2         \n",
+      "sell_spread                                       \t0.9782    \t0.9691    \t0.9874    \t0.9578    \t163       \t157       \t2         \t5         \t2         \n",
+      "minimum_initial_investment                        \t0.9490    \t0.9528    \t0.9453    \t0.9217    \t128       \t121       \t32        \t6         \t7         \n",
+      "benchmark_name                                    \t0.8944    \t0.8571    \t0.9351    \t0.8976    \t85        \t72        \t77        \t12        \t5         \n",
+      "TOTAL                                             \t0.9451    \t0.9206    \t0.9720    \t0.9304    \t1020      \t942       \t448       \t79        \t100       \n",
+      "Total Funds Matched - 166\n",
+      "Total Funds Not Matched - 30\n",
+      "Percentage of Funds Matched - 84.6938775510204\n",
       "All Providers Results: \n",
       "Document List File -  ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
       "Metric                                            \tF1-Score  \tPrecision \tRecall    \tAccuracy  \tSUPPORT   \tTP        \tTN        \tFP        \tFN        \n",
-      "management_fee_and_costs                          \t0.9038    \t0.8783    \t0.9309    \t0.8259    \t245       \t202       \t2         \t28        \t15        \n",
-      "management_fee                                    \t0.9159    \t0.9000    \t0.9324    \t0.8462    \t245       \t207       \t2         \t23        \t15        \n",
-      "performance_fee_costs                             \t0.9326    \t0.9540    \t0.9121    \t0.9028    \t189       \t166       \t57        \t8         \t16        \n",
-      "interposed_vehicle_performance_fee_cost           \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t20        \t20        \t227       \t0         \t0         \n",
-      "administration_fees                               \t0.9851    \t0.9706    \t1.0000    \t0.9919    \t66        \t66        \t179       \t2         \t0         \n",
-      "total_annual_dollar_based_charges                 \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t69        \t69        \t178       \t0         \t0         \n",
-      "buy_spread                                        \t0.8964    \t0.8791    \t0.9143    \t0.8502    \t190       \t160       \t50        \t22        \t15        \n",
-      "sell_spread                                       \t0.8994    \t0.8846    \t0.9148    \t0.8543    \t190       \t161       \t50        \t21        \t15        \n",
-      "minimum_initial_investment                        \t0.9858    \t0.9721    \t1.0000    \t0.9798    \t174       \t174       \t68        \t5         \t0         \n",
-      "benchmark_name                                    \t0.8673    \t0.8750    \t0.8596    \t0.9393    \t59        \t49        \t183       \t7         \t8         \n",
-      "TOTAL                                             \t0.9386    \t0.9314    \t0.9464    \t0.9190    \t1447      \t1274      \t996       \t116       \t264       \n",
-      "Total Funds Matched - 247\n",
-      "Total Funds Not Matched - 122\n",
-      "Percentage of Funds Matched - 66.93766937669376\n"
+      "management_fee_and_costs                          \t0.9195    \t0.8696    \t0.9756    \t0.8523    \t235       \t200       \t2         \t30        \t5         \n",
+      "management_fee                                    \t0.9342    \t0.8957    \t0.9763    \t0.8776    \t235       \t206       \t2         \t24        \t5         \n",
+      "performance_fee_costs                             \t0.9290    \t0.9290    \t0.9290    \t0.8987    \t179       \t157       \t56        \t12        \t12        \n",
+      "interposed_vehicle_performance_fee_cost           \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t20        \t20        \t217       \t0         \t0         \n",
+      "administration_fees                               \t0.9917    \t0.9836    \t1.0000    \t0.9958    \t60        \t60        \t176       \t1         \t0         \n",
+      "total_annual_dollar_based_charges                 \t1.0000    \t1.0000    \t1.0000    \t1.0000    \t63        \t63        \t174       \t0         \t0         \n",
+      "buy_spread                                        \t0.8976    \t0.8713    \t0.9255    \t0.8565    \t177       \t149       \t54        \t22        \t12        \n",
+      "sell_spread                                       \t0.9009    \t0.8772    \t0.9259    \t0.8608    \t177       \t150       \t54        \t21        \t12        \n",
+      "minimum_initial_investment                        \t0.9850    \t0.9704    \t1.0000    \t0.9789    \t164       \t164       \t68        \t5         \t0         \n",
+      "benchmark_name                                    \t0.9143    \t0.9057    \t0.9231    \t0.9620    \t55        \t48        \t180       \t5         \t4         \n",
+      "TOTAL                                             \t0.9472    \t0.9302    \t0.9655    \t0.9283    \t1365      \t1217      \t983       \t120       \t150       \n",
+      "Total Funds Matched - 237\n",
+      "Total Funds Not Matched - 132\n",
+      "Percentage of Funds Matched - 64.22764227642277\n"
      ]
     }
    ],
@@ -478,7 +478,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {