From c71936c5ff0d12217a1e7280fb835ba2c88c1972 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 18 Mar 2025 17:22:21 -0500 Subject: [PATCH] 1. Optimize the benchmark_name instructions. 2. Handle documents that contain the same raw fund name multiple times: do not remove the matched DB name from unmatched_db_list when a raw fund/share name matches it. Otherwise, some raw names can no longer be matched to a DB name. --- calc_metrics.py | 32 ++++--- core/auz_nz/hybrid_solution_script.py | 68 ++++++++------ core/data_extraction.py | 2 +- .../data_extraction_prompts_config.json | 18 +++- main.py | 2 +- performance.ipynb | 92 +++++++++---------- 6 files changed, 123 insertions(+), 91 deletions(-) diff --git a/calc_metrics.py b/calc_metrics.py index bb286fc..dbde368 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1376,8 +1376,8 @@ def clean_text(text: str): def merge_inference_data(): - file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi.xlsx" - file2 = r"/data/aus_prospectus/output/merged_data/docs/excel/merged_420339794.xlsx" + file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx" + file2 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_3_documents_by_text_20250318171348.xlsx" columns = [ "doc_id", "effective_date", @@ -1404,20 +1404,30 @@ def merge_inference_data(): "change_recoverable_expenses" ] - file1_data_df = pd.read_excel(file1, sheet_name="total_mapping_data") - file1_data_df = file1_data_df[columns] + document_id_list = [384508026, 521606755, 544886057] + file1_total_mapping_data_df = pd.read_excel(file1, sheet_name="total_mapping_data") + file1_total_mapping_data_df = file1_total_mapping_data_df[columns] # remove the rows which doc_id is 420339794 from file1_data_df - file1_data_df = file1_data_df[file1_data_df["doc_id"] != 420339794] + file1_total_mapping_data_df = 
file1_total_mapping_data_df[~(file1_total_mapping_data_df["doc_id"].isin(document_id_list))] + + file1_extract_data_df = pd.read_excel(file1, sheet_name="extract_data") + file1_extract_data_df = file1_extract_data_df[~(file1_extract_data_df["doc_id"].isin(document_id_list))] + + file2_total_mapping_data_df = pd.read_excel(file2, sheet_name="total_mapping_data") + file2_total_mapping_data_df = file2_total_mapping_data_df[columns] + total_mapping_data_df = pd.concat([file1_total_mapping_data_df, file2_total_mapping_data_df]) + total_mapping_data_df.reset_index(drop=True, inplace=True) + + file2_extract_data_df = pd.read_excel(file2, sheet_name="extract_data") + total_extract_data_df = pd.concat([file1_extract_data_df, file2_extract_data_df]) + total_extract_data_df.reset_index(drop=True, inplace=True) - file2_data_df = pd.read_excel(file2, sheet_name="merged_data") - file2_data_df = file2_data_df[columns] - total_data_df = pd.concat([file1_data_df, file2_data_df]) - total_data_df.reset_index(drop=True, inplace=True) output_folder = r"/data/aus_prospectus/output/mapping_data/total/" - output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250317_Ravi_modified.xlsx") + output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250318124530_new.xlsx") with pd.ExcelWriter(output_file) as f: - total_data_df.to_excel(f, index=False, sheet_name="total_mapping_data") + total_mapping_data_df.to_excel(f, index=False, sheet_name="total_mapping_data") + total_extract_data_df.to_excel(f, index=False, sheet_name="total_extract_data") diff --git a/core/auz_nz/hybrid_solution_script.py b/core/auz_nz/hybrid_solution_script.py index 9c2a603..64693ca 100644 --- a/core/auz_nz/hybrid_solution_script.py +++ b/core/auz_nz/hybrid_solution_script.py @@ -460,8 +460,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= 
all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], llm_flag=False)) - if db_list[matched_index] in unmatched_db_list: - unmatched_db_list.remove(db_list[matched_index]) + # if db_list[matched_index] in unmatched_db_list: + # unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index]) if pred_list[index] in unmatched_pred_list: unmatched_pred_list.remove(pred_list[index]) @@ -485,8 +485,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1, step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2], step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False)) - if db_list[matched_index] in unmatched_db_list: - unmatched_db_list.remove(db_list[matched_index]) + # if db_list[matched_index] in unmatched_db_list: + # unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index]) if pred_list[index] in unmatched_pred_list: unmatched_pred_list.remove(pred_list[index]) @@ -513,8 +513,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2, step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2], step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False)) - if db_list[matched_index] in unmatched_db_list: - unmatched_db_list.remove(db_list[matched_index]) + # if db_list[matched_index] in unmatched_db_list: + # 
unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index]) if pred_list[index] in unmatched_pred_list: unmatched_pred_list.remove(pred_list[index]) @@ -543,8 +543,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3, step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2], step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False)) - if db_list[matched_index] in unmatched_db_list: - unmatched_db_list.remove(db_list[matched_index]) + # if db_list[matched_index] in unmatched_db_list: + # unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index]) if pred_list[index] in unmatched_pred_list: unmatched_pred_list.remove(pred_list[index]) @@ -585,8 +585,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc # print("unmatched_pred_list: ",unmatched_pred_list) # print("db_list[matched_index]: ",db_list[matched_index]) # print("pred_list[index]: ",pred_list[index]) - if db_list[matched_index] in unmatched_db_list: - unmatched_db_list.remove(db_list[matched_index]) + # if db_list[matched_index] in unmatched_db_list: + # unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index]) if pred_list[index] in unmatched_pred_list: unmatched_pred_list.remove(pred_list[index]) @@ -663,20 +663,27 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc # print("k: ",k) # print("v: ",v) og_db_index=-1 - og_pred_index = -1 + # og_pred_index = -1 + og_pred_index_list = [] if k in cleaned_unmatched_pred_list: - og_pred_index = cleaned_unmatched_pred_list.index(k) + for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): + if c_item==k: + 
og_pred_index_list.append(c_idx) + # og_pred_index = cleaned_unmatched_pred_list.index(k) - if og_pred_index == -1: + if len(og_pred_index_list) == 0: # sometimes, the raw name and db name reversed from the LLM response if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list: - og_pred_index = cleaned_unmatched_pred_list.index(v) + for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): + if c_item==v: + og_pred_index_list.append(c_idx) + # og_pred_index = cleaned_unmatched_pred_list.index(v) og_db_index = cleaned_unmatched_db_list.index(k) # v and k are swapped temp = v v = k k = temp - if og_pred_index==-1: + if len(og_pred_index_list)==0: continue # og_db_index = cleaned_unmatched_db_list.index(v) if og_db_index == -1 and v in cleaned_unmatched_db_list: @@ -685,21 +692,22 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc # print("unmatched_db_list: ",unmatched_db_list) for i in df_data: - if i['pred_fund']==unmatched_pred_list[og_pred_index]: - if og_db_index!=-1: - i['db_fund']=unmatched_db_list[og_db_index] - i['cleaned_db_fund_name'] = v - final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]}) - else: - i['db_fund'] = '' - i['cleaned_db_fund_name'] = '' - final_result.update({unmatched_pred_list[og_pred_index]:""}) - i['llm_clean_pred_list'] = cleaned_unmatched_pred_list - i['llm_clean_db_list'] = cleaned_unmatched_db_list, - i['llm_pred_fund'] = k - i['llm_matched_db_name'] = v - i['llm_result'] = llm_result - break + for og_pred_index in og_pred_index_list: + if i['pred_fund']==unmatched_pred_list[og_pred_index]: + if og_db_index!=-1: + i['db_fund']=unmatched_db_list[og_db_index] + i['cleaned_db_fund_name'] = v + final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]}) + else: + i['db_fund'] = '' + i['cleaned_db_fund_name'] = '' + final_result.update({unmatched_pred_list[og_pred_index]:""}) + i['llm_clean_pred_list'] = 
cleaned_unmatched_pred_list + i['llm_clean_db_list'] = cleaned_unmatched_db_list, + i['llm_pred_fund'] = k + i['llm_matched_db_name'] = v + i['llm_result'] = llm_result + break # break diff --git a/core/data_extraction.py b/core/data_extraction.py index b470792..b3c83c8 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -793,7 +793,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [4, 5]: + # if page_num not in [14]: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index e1db089..38c5bac 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -555,15 +555,23 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}", "---Example 3 Start---", - "\n\nFund name \nComposite benchmark \nCFS Select High \nGrowth \n1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% \nMSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% \nMSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small \nOrdinaries Index. ", + "Composite benchmarks \n\nThe objective for some funds includes a reference to a composite benchmark. They may be subject to change \nat any time within the allocation ranges. 
\n\nFund name \nComposite benchmark \nCFS Defensive \nBuilder \n4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core \nWholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index \n(Unfrozen) – Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate \nIndex (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg \nAusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index. \nCFS Growth Builder \n29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, \n5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD \nhedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE \nDeveloped Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure \nIndex (Unfrozen) – Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate \nCorporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index. 
\n", "---Example 3 End---", + "For this example, there are multiple fund names with multiple benchmark names with weightings, please extract them all.", "The output should be:", - "{\"data\": [{\"fund name\": \"CFS Select High Growth\", \"benchmark_name\": \"1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% MSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small Ordinaries Index\"}]}", + "{\"data\": [{\"fund name\": \"CFS Defensive Builder\", \"benchmark_name\": \"4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) – Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg AusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index\"}, {\"fund name\": \"CFS Growth Builder\", \"benchmark_name\": \"29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD hedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE Developed Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) – Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index\"}]}", "---Example 4 Start---", "\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 
\n(maximum fee \n3 \n) \nPayment \nfrequency \nAusbil Australian \nEmerging Leaders \n15.375% \n(maximum 15% ex-GST) \nComposite benchmark (70% S&P/ASX Midcap 50 \nAccumulation Index and 30% S&P/ASX Small Ordinaries \nAccumulation Index) plus 0.85% pa \nMonthly \n", "---Example 4 End---", "The output should be:", "{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}", + "---Example 5 Start---", + "Fixed interest / Income \nCash \nUBS Diversified Fixed Income BT Cash Management Trust \nBenchmark \n50% Bloomberg Barclays Global \nAggregate Index (A$ hedged), 50% \nBloomberg AusBond Composite \n0+ Yr Index* \nBloomberg AusBond Bank Bill Index* \n", + "---Example 5 End---", + "For this example, please read carefully for fund names in same line: \"UBS Diversified Fixed Income BT Cash Management Trust\", there are 2 fund names: \"UBS Diversified Fixed Income\" and \"BT Cash Management Trust\".", + "There are 2 benchmark names: \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\" and \"Bloomberg AusBond Bank Bill Index\".", + "The output should be:", + "{\"data\": [{\"fund name\": \"UBS Diversified Fixed Income\", \"benchmark_name\": \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\"}, {\"fund name\": \"BT Cash Management Trust\", \"benchmark_name\": \"Bloomberg AusBond Bank Bill Index\"}]}", "\n", "C. Don't extract benchmark name from context when fit below cases.", "1. Exclude benchmark name when its reported name is \"Return target\".", @@ -586,6 +594,12 @@ "Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. 
", "They indicate the desired outcome rather than serving as a comparative measure against market performance.", "The output should be:", + "{\"data\": []}", + "D. If extracted multiple benchmark names, but without weightings, e.g. 50% or 30%, please ignore and output empty.", + "---Example Start---", + "This is calculated by using the weighted average of the \nasset allocation neutral position and the index returns for each asset class. \n\nBT Multi-manager Growth Fund, BT Multi-manager Balanced Fund, BT Multi-manager Conservative Fund \nand BT Multi-manager High Growth Fund \n\nAsset class Indices \nAustralian shares S&P/ASX 300 Accumulation Index \nInternational shares MSCI World ex Australia $A (Net Dividends Reinvested) \nMSCI World ex Australia Hedged $A (Net Dividends Reinvested) \nMSCI Emerging Market (Net Dividends Reinvested) in AUD \nAustralian property S&P/ASX 300 A-REIT Accumulation Index \nInternational property FTSE EPRA/NAREIT Developed Hedged in AUD Net TRI \n", + "---Example End---", + "The output should be:", "{\"data\": []}" ] }, diff --git a/main.py b/main.py index d7b0046..b5197f0 100644 --- a/main.py +++ b/main.py @@ -1538,7 +1538,7 @@ if __name__ == "__main__": with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list = ["420339794"] + # special_doc_id_list = ["521606755", "384508026", "544886057"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/performance.ipynb b/performance.ipynb index 7a1f3d2..a4ab839 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -36,14 
+36,14 @@ "\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530_new.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -316,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -330,53 +330,53 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9211 \t0.8861 \t0.9589 \t0.8544 \t409 \t350 \t2 \t45 \t15 \n", - "management_fee \t0.9419 \t0.9241 \t0.9605 \t0.8908 \t409 \t365 \t2 \t30 \t15 \n", - "performance_fee_costs \t0.8987 \t0.9325 \t0.8672 \t0.8714 \t284 \t235 \t124 \t17 \t36 \n", - "interposed_vehicle_performance_fee_cost \t0.9600 \t0.9231 \t1.0000 \t0.9854 \t73 \t72 \t334 \t6 \t0 \n", - "administration_fees \t0.9853 \t0.9710 \t1.0000 \t0.9951 \t67 \t67 \t343 \t2 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t343 \t0 \t0 \n", - "buy_spread \t0.9365 \t0.9242 \t0.9491 \t0.8956 \t352 \t317 \t52 \t26 \t17 \n", - "sell_spread \t0.9412 \t0.9329 \t0.9496 \t0.9029 \t352 \t320 \t52 \t23 \t17 \n", - "minimum_initial_investment \t0.9737 \t0.9642 \t0.9834 \t0.9612 \t301 \t296 \t100 \t11 \t5 \n", - "benchmark_name \t0.8137 \t0.8295 \t0.7985 
\t0.8811 \t144 \t107 \t256 \t22 \t27 \n", - "TOTAL \t0.9372 \t0.9288 \t0.9467 \t0.9238 \t2460 \t2198 \t1608 \t182 \t132 \n", - "Total Funds Matched - 412\n", - "Total Funds Not Matched - 153\n", - "Percentage of Funds Matched - 72.9203539823009\n", + "management_fee_and_costs \t0.9321 \t0.8838 \t0.9859 \t0.8734 \t400 \t350 \t2 \t46 \t5 \n", + "management_fee \t0.9516 \t0.9192 \t0.9864 \t0.9082 \t400 \t364 \t2 \t32 \t5 \n", + "performance_fee_costs \t0.8992 \t0.8821 \t0.9170 \t0.8710 \t275 \t232 \t119 \t31 \t21 \n", + "interposed_vehicle_performance_fee_cost \t0.9600 \t0.9231 \t1.0000 \t0.9851 \t73 \t72 \t325 \t6 \t0 \n", + "administration_fees \t0.9920 \t0.9841 \t1.0000 \t0.9975 \t62 \t62 \t340 \t1 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t63 \t63 \t340 \t0 \t0 \n", + "buy_spread \t0.9339 \t0.9129 \t0.9560 \t0.8933 \t340 \t304 \t56 \t29 \t14 \n", + "sell_spread \t0.9388 \t0.9219 \t0.9564 \t0.9007 \t340 \t307 \t56 \t26 \t14 \n", + "minimum_initial_investment \t0.9694 \t0.9628 \t0.9760 \t0.9553 \t292 \t285 \t100 \t11 \t7 \n", + "benchmark_name \t0.9023 \t0.8759 \t0.9302 \t0.9355 \t140 \t120 \t257 \t17 \t9 \n", + "TOTAL \t0.9479 \t0.9266 \t0.9708 \t0.9320 \t2385 \t2159 \t1597 \t199 \t75 \n", + "Total Funds Matched - 403\n", + "Total Funds Not Matched - 162\n", + "Percentage of Funds Matched - 71.32743362831859\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9457 \t0.8970 \t1.0000 \t0.8970 \t164 \t148 \t0 \t17 \t0 \n", - "management_fee \t0.9783 \t0.9576 \t1.0000 \t0.9576 \t164 \t158 \t0 \t7 \t0 \n", - "performance_fee_costs \t0.8263 \t0.8846 \t0.7753 \t0.8242 \t95 \t69 \t67 \t9 \t20 \n", - "interposed_vehicle_performance_fee_cost \t0.9455 \t0.8966 \t1.0000 \t0.9636 \t53 \t52 \t107 \t6 \t0 \n", - "administration_fees \t1.0000 \t1.0000 \t1.0000 
\t1.0000 \t1 \t1 \t164 \t0 \t0 \n", - "buy_spread \t0.9812 \t0.9752 \t0.9874 \t0.9636 \t162 \t157 \t2 \t4 \t2 \n", - "sell_spread \t0.9876 \t0.9876 \t0.9876 \t0.9758 \t162 \t159 \t2 \t2 \t2 \n", - "minimum_initial_investment \t0.9569 \t0.9531 \t0.9606 \t0.9333 \t127 \t122 \t32 \t6 \t5 \n", - "benchmark_name \t0.7733 \t0.7945 \t0.7532 \t0.7939 \t85 \t58 \t73 \t15 \t19 \n", - "TOTAL \t0.9328 \t0.9273 \t0.9405 \t0.9232 \t1013 \t924 \t447 \t66 \t180 \n", - "Total Funds Matched - 165\n", - "Total Funds Not Matched - 31\n", - "Percentage of Funds Matched - 84.18367346938776\n", + "management_fee_and_costs \t0.9494 \t0.9036 \t1.0000 \t0.9036 \t165 \t150 \t0 \t16 \t0 \n", + "management_fee \t0.9753 \t0.9518 \t1.0000 \t0.9518 \t165 \t158 \t0 \t8 \t0 \n", + "performance_fee_costs \t0.8427 \t0.7979 \t0.8929 \t0.8313 \t96 \t75 \t63 \t19 \t9 \n", + "interposed_vehicle_performance_fee_cost \t0.9455 \t0.8966 \t1.0000 \t0.9639 \t53 \t52 \t108 \t6 \t0 \n", + "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t2 \t2 \t164 \t0 \t0 \n", + "buy_spread \t0.9718 \t0.9568 \t0.9873 \t0.9458 \t163 \t155 \t2 \t7 \t2 \n", + "sell_spread \t0.9782 \t0.9691 \t0.9874 \t0.9578 \t163 \t157 \t2 \t5 \t2 \n", + "minimum_initial_investment \t0.9490 \t0.9528 \t0.9453 \t0.9217 \t128 \t121 \t32 \t6 \t7 \n", + "benchmark_name \t0.8944 \t0.8571 \t0.9351 \t0.8976 \t85 \t72 \t77 \t12 \t5 \n", + "TOTAL \t0.9451 \t0.9206 \t0.9720 \t0.9304 \t1020 \t942 \t448 \t79 \t100 \n", + "Total Funds Matched - 166\n", + "Total Funds Not Matched - 30\n", + "Percentage of Funds Matched - 84.6938775510204\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9038 \t0.8783 \t0.9309 \t0.8259 \t245 \t202 \t2 \t28 \t15 \n", - "management_fee \t0.9159 \t0.9000 \t0.9324 \t0.8462 \t245 \t207 \t2 \t23 \t15 \n", - "performance_fee_costs \t0.9326 
\t0.9540 \t0.9121 \t0.9028 \t189 \t166 \t57 \t8 \t16 \n", - "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t227 \t0 \t0 \n", - "administration_fees \t0.9851 \t0.9706 \t1.0000 \t0.9919 \t66 \t66 \t179 \t2 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t178 \t0 \t0 \n", - "buy_spread \t0.8964 \t0.8791 \t0.9143 \t0.8502 \t190 \t160 \t50 \t22 \t15 \n", - "sell_spread \t0.8994 \t0.8846 \t0.9148 \t0.8543 \t190 \t161 \t50 \t21 \t15 \n", - "minimum_initial_investment \t0.9858 \t0.9721 \t1.0000 \t0.9798 \t174 \t174 \t68 \t5 \t0 \n", - "benchmark_name \t0.8673 \t0.8750 \t0.8596 \t0.9393 \t59 \t49 \t183 \t7 \t8 \n", - "TOTAL \t0.9386 \t0.9314 \t0.9464 \t0.9190 \t1447 \t1274 \t996 \t116 \t264 \n", - "Total Funds Matched - 247\n", - "Total Funds Not Matched - 122\n", - "Percentage of Funds Matched - 66.93766937669376\n" + "management_fee_and_costs \t0.9195 \t0.8696 \t0.9756 \t0.8523 \t235 \t200 \t2 \t30 \t5 \n", + "management_fee \t0.9342 \t0.8957 \t0.9763 \t0.8776 \t235 \t206 \t2 \t24 \t5 \n", + "performance_fee_costs \t0.9290 \t0.9290 \t0.9290 \t0.8987 \t179 \t157 \t56 \t12 \t12 \n", + "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t217 \t0 \t0 \n", + "administration_fees \t0.9917 \t0.9836 \t1.0000 \t0.9958 \t60 \t60 \t176 \t1 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t63 \t63 \t174 \t0 \t0 \n", + "buy_spread \t0.8976 \t0.8713 \t0.9255 \t0.8565 \t177 \t149 \t54 \t22 \t12 \n", + "sell_spread \t0.9009 \t0.8772 \t0.9259 \t0.8608 \t177 \t150 \t54 \t21 \t12 \n", + "minimum_initial_investment \t0.9850 \t0.9704 \t1.0000 \t0.9789 \t164 \t164 \t68 \t5 \t0 \n", + "benchmark_name \t0.9143 \t0.9057 \t0.9231 \t0.9620 \t55 \t48 \t180 \t5 \t4 \n", + "TOTAL \t0.9472 \t0.9302 \t0.9655 \t0.9283 \t1365 \t1217 \t983 \t120 \t150 \n", + "Total Funds Matched - 237\n", + "Total Funds Not Matched - 132\n", + "Percentage of Funds 
Matched - 64.22764227642277\n" ] } ], @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ {