1. Optimize the benchmark_name instructions.

2. Handle documents in which the same raw fund name may appear multiple times: do not remove the matched db name from unmatched_db_list when a relevant raw fund/share name is matched.
Otherwise, some raw names may end up unable to match a db name.
This commit is contained in:
Blade He 2025-03-18 17:22:21 -05:00
parent 0cea2e501b
commit c71936c5ff
6 changed files with 123 additions and 91 deletions

View File

@ -1376,8 +1376,8 @@ def clean_text(text: str):
def merge_inference_data():
file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi.xlsx"
file2 = r"/data/aus_prospectus/output/merged_data/docs/excel/merged_420339794.xlsx"
file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx"
file2 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_3_documents_by_text_20250318171348.xlsx"
columns = [
"doc_id",
"effective_date",
@ -1404,20 +1404,30 @@ def merge_inference_data():
"change_recoverable_expenses"
]
file1_data_df = pd.read_excel(file1, sheet_name="total_mapping_data")
file1_data_df = file1_data_df[columns]
document_id_list = [384508026, 521606755, 544886057]
file1_total_mapping_data_df = pd.read_excel(file1, sheet_name="total_mapping_data")
file1_total_mapping_data_df = file1_total_mapping_data_df[columns]
# remove the rows which doc_id is 420339794 from file1_data_df
file1_data_df = file1_data_df[file1_data_df["doc_id"] != 420339794]
file1_total_mapping_data_df = file1_total_mapping_data_df[~(file1_total_mapping_data_df["doc_id"].isin(document_id_list))]
file1_extract_data_df = pd.read_excel(file1, sheet_name="extract_data")
file1_extract_data_df = file1_extract_data_df[~(file1_extract_data_df["doc_id"].isin(document_id_list))]
file2_total_mapping_data_df = pd.read_excel(file2, sheet_name="total_mapping_data")
file2_total_mapping_data_df = file2_total_mapping_data_df[columns]
total_mapping_data_df = pd.concat([file1_total_mapping_data_df, file2_total_mapping_data_df])
total_mapping_data_df.reset_index(drop=True, inplace=True)
file2_extract_data_df = pd.read_excel(file2, sheet_name="extract_data")
total_extract_data_df = pd.concat([file1_extract_data_df, file2_extract_data_df])
total_extract_data_df.reset_index(drop=True, inplace=True)
file2_data_df = pd.read_excel(file2, sheet_name="merged_data")
file2_data_df = file2_data_df[columns]
total_data_df = pd.concat([file1_data_df, file2_data_df])
total_data_df.reset_index(drop=True, inplace=True)
output_folder = r"/data/aus_prospectus/output/mapping_data/total/"
output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250317_Ravi_modified.xlsx")
output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250318124530_new.xlsx")
with pd.ExcelWriter(output_file) as f:
total_data_df.to_excel(f, index=False, sheet_name="total_mapping_data")
total_mapping_data_df.to_excel(f, index=False, sheet_name="total_mapping_data")
total_extract_data_df.to_excel(f, index=False, sheet_name="total_extract_data")

View File

@ -460,8 +460,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2],
step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2],
llm_flag=False))
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# if db_list[matched_index] in unmatched_db_list:
# unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
@ -485,8 +485,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1,
step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2],
step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False))
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# if db_list[matched_index] in unmatched_db_list:
# unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
@ -513,8 +513,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2,
step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2],
step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False))
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# if db_list[matched_index] in unmatched_db_list:
# unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
@ -543,8 +543,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3,
step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2],
step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False))
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# if db_list[matched_index] in unmatched_db_list:
# unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
@ -585,8 +585,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
# print("unmatched_pred_list: ",unmatched_pred_list)
# print("db_list[matched_index]: ",db_list[matched_index])
# print("pred_list[index]: ",pred_list[index])
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# if db_list[matched_index] in unmatched_db_list:
# unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
@ -663,20 +663,27 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
# print("k: ",k)
# print("v: ",v)
og_db_index=-1
og_pred_index = -1
# og_pred_index = -1
og_pred_index_list = []
if k in cleaned_unmatched_pred_list:
og_pred_index = cleaned_unmatched_pred_list.index(k)
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
if c_item==k:
og_pred_index_list.append(c_idx)
# og_pred_index = cleaned_unmatched_pred_list.index(k)
if og_pred_index == -1:
if len(og_pred_index_list) == 0:
# sometimes, the raw name and db name reversed from the LLM response
if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list:
og_pred_index = cleaned_unmatched_pred_list.index(v)
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
if c_item==v:
og_pred_index_list.append(c_idx)
# og_pred_index = cleaned_unmatched_pred_list.index(v)
og_db_index = cleaned_unmatched_db_list.index(k)
# v and k are swapped
temp = v
v = k
k = temp
if og_pred_index==-1:
if len(og_pred_index_list)==0:
continue
# og_db_index = cleaned_unmatched_db_list.index(v)
if og_db_index == -1 and v in cleaned_unmatched_db_list:
@ -685,21 +692,22 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
# print("unmatched_db_list: ",unmatched_db_list)
for i in df_data:
if i['pred_fund']==unmatched_pred_list[og_pred_index]:
if og_db_index!=-1:
i['db_fund']=unmatched_db_list[og_db_index]
i['cleaned_db_fund_name'] = v
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
else:
i['db_fund'] = ''
i['cleaned_db_fund_name'] = ''
final_result.update({unmatched_pred_list[og_pred_index]:""})
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
i['llm_clean_db_list'] = cleaned_unmatched_db_list,
i['llm_pred_fund'] = k
i['llm_matched_db_name'] = v
i['llm_result'] = llm_result
break
for og_pred_index in og_pred_index_list:
if i['pred_fund']==unmatched_pred_list[og_pred_index]:
if og_db_index!=-1:
i['db_fund']=unmatched_db_list[og_db_index]
i['cleaned_db_fund_name'] = v
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
else:
i['db_fund'] = ''
i['cleaned_db_fund_name'] = ''
final_result.update({unmatched_pred_list[og_pred_index]:""})
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
i['llm_clean_db_list'] = cleaned_unmatched_db_list,
i['llm_pred_fund'] = k
i['llm_matched_db_name'] = v
i['llm_result'] = llm_result
break
# break

View File

@ -793,7 +793,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num not in [4, 5]:
# if page_num not in [14]:
# continue
if page_num in handled_page_num_list:
continue

View File

@ -555,15 +555,23 @@
"The output should be:",
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
"---Example 3 Start---",
"\n\nFund name \nComposite benchmark \nCFS Select High \nGrowth \n1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% \nMSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% \nMSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small \nOrdinaries Index. ",
"Composite benchmarks \n\nThe objective for some funds includes a reference to a composite benchmark. They may be subject to change \nat any time within the allocation ranges. \n\nFund name \nComposite benchmark \nCFS Defensive \nBuilder \n4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core \nWholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index \n(Unfrozen) Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate \nIndex (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg \nAusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index. \nCFS Growth Builder \n29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, \n5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD \nhedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE \nDeveloped Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure \nIndex (Unfrozen) Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate \nCorporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index. \n",
"---Example 3 End---",
"For this example, there are multiple fund names with multiple benchmark names with weightings, please extract them all.",
"The output should be:",
"{\"data\": [{\"fund name\": \"CFS Select High Growth\", \"benchmark_name\": \"1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% MSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small Ordinaries Index\"}]}",
"{\"data\": [{\"fund name\": \"CFS Defensive Builder\", \"benchmark_name\": \"4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg AusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index\"}, {\"fund name\": \"CFS Growth Builder\", \"benchmark_name\": \"29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD hedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE Developed Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index\"}]}",
"---Example 4 Start---",
"\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nAusbil Australian \nEmerging Leaders \n15.375% \n(maximum 15% ex-GST) \nComposite benchmark (70% S&P/ASX Midcap 50 \nAccumulation Index and 30% S&P/ASX Small Ordinaries \nAccumulation Index) plus 0.85% pa \nMonthly \n",
"---Example 4 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
"---Example 5 Start---",
"Fixed interest / Income \nCash \nUBS Diversified Fixed Income BT Cash Management Trust \nBenchmark \n50% Bloomberg Barclays Global \nAggregate Index (A$ hedged), 50% \nBloomberg AusBond Composite \n0+ Yr Index* \nBloomberg AusBond Bank Bill Index* \n",
"---Example 5 End---",
"For this example, please read carefully for fund names in same line: \"UBS Diversified Fixed Income BT Cash Management Trust\", there are 2 fund names: \"UBS Diversified Fixed Income\" and \"BT Cash Management Trust\".",
"There are 2 benchmark names: \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\" and \"Bloomberg AusBond Bank Bill Index\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"UBS Diversified Fixed Income\", \"benchmark_name\": \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\"}, {\"fund name\": \"BT Cash Management Trust\", \"benchmark_name\": \"Bloomberg AusBond Bank Bill Index\"}]}",
"\n",
"C. Don't extract benchmark name from context when fit below cases.",
"1. Exclude benchmark name when its reported name is \"Return target\".",
@ -586,6 +594,12 @@
"Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ",
"They indicate the desired outcome rather than serving as a comparative measure against market performance.",
"The output should be:",
"{\"data\": []}",
"D. If extracted multiple benchmark names, but without weightings, e.g. 50% or 30%, please ignore and output empty.",
"---Example Start---",
"This is calculated by using the weighted average of the \nasset allocation neutral position and the index returns for each asset class. \n\nBT Multi-manager Growth Fund, BT Multi-manager Balanced Fund, BT Multi-manager Conservative Fund \nand BT Multi-manager High Growth Fund \n\nAsset class Indices \nAustralian shares S&P/ASX 300 Accumulation Index \nInternational shares MSCI World ex Australia $A (Net Dividends Reinvested) \nMSCI World ex Australia Hedged $A (Net Dividends Reinvested) \nMSCI Emerging Market (Net Dividends Reinvested) in AUD \nAustralian property S&P/ASX 300 A-REIT Accumulation Index \nInternational property FTSE EPRA/NAREIT Developed Hedged in AUD Net TRI \n",
"---Example End---",
"The output should be:",
"{\"data\": []}"
]
},

View File

@ -1538,7 +1538,7 @@ if __name__ == "__main__":
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# special_doc_id_list = ["420339794"]
# special_doc_id_list = ["521606755", "384508026", "544886057"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@ -36,14 +36,14 @@
"\n",
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
"# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n",
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx\"\n",
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530_new.xlsx\"\n",
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@ -316,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -330,53 +330,53 @@
"All Providers Results: \n",
"Document List File - None\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9211 \t0.8861 \t0.9589 \t0.8544 \t409 \t350 \t2 \t45 \t15 \n",
"management_fee \t0.9419 \t0.9241 \t0.9605 \t0.8908 \t409 \t365 \t2 \t30 \t15 \n",
"performance_fee_costs \t0.8987 \t0.9325 \t0.8672 \t0.8714 \t284 \t235 \t124 \t17 \t36 \n",
"interposed_vehicle_performance_fee_cost \t0.9600 \t0.9231 \t1.0000 \t0.9854 \t73 \t72 \t334 \t6 \t0 \n",
"administration_fees \t0.9853 \t0.9710 \t1.0000 \t0.9951 \t67 \t67 \t343 \t2 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t343 \t0 \t0 \n",
"buy_spread \t0.9365 \t0.9242 \t0.9491 \t0.8956 \t352 \t317 \t52 \t26 \t17 \n",
"sell_spread \t0.9412 \t0.9329 \t0.9496 \t0.9029 \t352 \t320 \t52 \t23 \t17 \n",
"minimum_initial_investment \t0.9737 \t0.9642 \t0.9834 \t0.9612 \t301 \t296 \t100 \t11 \t5 \n",
"benchmark_name \t0.8137 \t0.8295 \t0.7985 \t0.8811 \t144 \t107 \t256 \t22 \t27 \n",
"TOTAL \t0.9372 \t0.9288 \t0.9467 \t0.9238 \t2460 \t2198 \t1608 \t182 \t132 \n",
"Total Funds Matched - 412\n",
"Total Funds Not Matched - 153\n",
"Percentage of Funds Matched - 72.9203539823009\n",
"management_fee_and_costs \t0.9321 \t0.8838 \t0.9859 \t0.8734 \t400 \t350 \t2 \t46 \t5 \n",
"management_fee \t0.9516 \t0.9192 \t0.9864 \t0.9082 \t400 \t364 \t2 \t32 \t5 \n",
"performance_fee_costs \t0.8992 \t0.8821 \t0.9170 \t0.8710 \t275 \t232 \t119 \t31 \t21 \n",
"interposed_vehicle_performance_fee_cost \t0.9600 \t0.9231 \t1.0000 \t0.9851 \t73 \t72 \t325 \t6 \t0 \n",
"administration_fees \t0.9920 \t0.9841 \t1.0000 \t0.9975 \t62 \t62 \t340 \t1 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t63 \t63 \t340 \t0 \t0 \n",
"buy_spread \t0.9339 \t0.9129 \t0.9560 \t0.8933 \t340 \t304 \t56 \t29 \t14 \n",
"sell_spread \t0.9388 \t0.9219 \t0.9564 \t0.9007 \t340 \t307 \t56 \t26 \t14 \n",
"minimum_initial_investment \t0.9694 \t0.9628 \t0.9760 \t0.9553 \t292 \t285 \t100 \t11 \t7 \n",
"benchmark_name \t0.9023 \t0.8759 \t0.9302 \t0.9355 \t140 \t120 \t257 \t17 \t9 \n",
"TOTAL \t0.9479 \t0.9266 \t0.9708 \t0.9320 \t2385 \t2159 \t1597 \t199 \t75 \n",
"Total Funds Matched - 403\n",
"Total Funds Not Matched - 162\n",
"Percentage of Funds Matched - 71.32743362831859\n",
"All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9457 \t0.8970 \t1.0000 \t0.8970 \t164 \t148 \t0 \t17 \t0 \n",
"management_fee \t0.9783 \t0.9576 \t1.0000 \t0.9576 \t164 \t158 \t0 \t7 \t0 \n",
"performance_fee_costs \t0.8263 \t0.8846 \t0.7753 \t0.8242 \t95 \t69 \t67 \t9 \t20 \n",
"interposed_vehicle_performance_fee_cost \t0.9455 \t0.8966 \t1.0000 \t0.9636 \t53 \t52 \t107 \t6 \t0 \n",
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t1 \t1 \t164 \t0 \t0 \n",
"buy_spread \t0.9812 \t0.9752 \t0.9874 \t0.9636 \t162 \t157 \t2 \t4 \t2 \n",
"sell_spread \t0.9876 \t0.9876 \t0.9876 \t0.9758 \t162 \t159 \t2 \t2 \t2 \n",
"minimum_initial_investment \t0.9569 \t0.9531 \t0.9606 \t0.9333 \t127 \t122 \t32 \t6 \t5 \n",
"benchmark_name \t0.7733 \t0.7945 \t0.7532 \t0.7939 \t85 \t58 \t73 \t15 \t19 \n",
"TOTAL \t0.9328 \t0.9273 \t0.9405 \t0.9232 \t1013 \t924 \t447 \t66 \t180 \n",
"Total Funds Matched - 165\n",
"Total Funds Not Matched - 31\n",
"Percentage of Funds Matched - 84.18367346938776\n",
"management_fee_and_costs \t0.9494 \t0.9036 \t1.0000 \t0.9036 \t165 \t150 \t0 \t16 \t0 \n",
"management_fee \t0.9753 \t0.9518 \t1.0000 \t0.9518 \t165 \t158 \t0 \t8 \t0 \n",
"performance_fee_costs \t0.8427 \t0.7979 \t0.8929 \t0.8313 \t96 \t75 \t63 \t19 \t9 \n",
"interposed_vehicle_performance_fee_cost \t0.9455 \t0.8966 \t1.0000 \t0.9639 \t53 \t52 \t108 \t6 \t0 \n",
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t2 \t2 \t164 \t0 \t0 \n",
"buy_spread \t0.9718 \t0.9568 \t0.9873 \t0.9458 \t163 \t155 \t2 \t7 \t2 \n",
"sell_spread \t0.9782 \t0.9691 \t0.9874 \t0.9578 \t163 \t157 \t2 \t5 \t2 \n",
"minimum_initial_investment \t0.9490 \t0.9528 \t0.9453 \t0.9217 \t128 \t121 \t32 \t6 \t7 \n",
"benchmark_name \t0.8944 \t0.8571 \t0.9351 \t0.8976 \t85 \t72 \t77 \t12 \t5 \n",
"TOTAL \t0.9451 \t0.9206 \t0.9720 \t0.9304 \t1020 \t942 \t448 \t79 \t100 \n",
"Total Funds Matched - 166\n",
"Total Funds Not Matched - 30\n",
"Percentage of Funds Matched - 84.6938775510204\n",
"All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9038 \t0.8783 \t0.9309 \t0.8259 \t245 \t202 \t2 \t28 \t15 \n",
"management_fee \t0.9159 \t0.9000 \t0.9324 \t0.8462 \t245 \t207 \t2 \t23 \t15 \n",
"performance_fee_costs \t0.9326 \t0.9540 \t0.9121 \t0.9028 \t189 \t166 \t57 \t8 \t16 \n",
"interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t227 \t0 \t0 \n",
"administration_fees \t0.9851 \t0.9706 \t1.0000 \t0.9919 \t66 \t66 \t179 \t2 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t178 \t0 \t0 \n",
"buy_spread \t0.8964 \t0.8791 \t0.9143 \t0.8502 \t190 \t160 \t50 \t22 \t15 \n",
"sell_spread \t0.8994 \t0.8846 \t0.9148 \t0.8543 \t190 \t161 \t50 \t21 \t15 \n",
"minimum_initial_investment \t0.9858 \t0.9721 \t1.0000 \t0.9798 \t174 \t174 \t68 \t5 \t0 \n",
"benchmark_name \t0.8673 \t0.8750 \t0.8596 \t0.9393 \t59 \t49 \t183 \t7 \t8 \n",
"TOTAL \t0.9386 \t0.9314 \t0.9464 \t0.9190 \t1447 \t1274 \t996 \t116 \t264 \n",
"Total Funds Matched - 247\n",
"Total Funds Not Matched - 122\n",
"Percentage of Funds Matched - 66.93766937669376\n"
"management_fee_and_costs \t0.9195 \t0.8696 \t0.9756 \t0.8523 \t235 \t200 \t2 \t30 \t5 \n",
"management_fee \t0.9342 \t0.8957 \t0.9763 \t0.8776 \t235 \t206 \t2 \t24 \t5 \n",
"performance_fee_costs \t0.9290 \t0.9290 \t0.9290 \t0.8987 \t179 \t157 \t56 \t12 \t12 \n",
"interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t217 \t0 \t0 \n",
"administration_fees \t0.9917 \t0.9836 \t1.0000 \t0.9958 \t60 \t60 \t176 \t1 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t63 \t63 \t174 \t0 \t0 \n",
"buy_spread \t0.8976 \t0.8713 \t0.9255 \t0.8565 \t177 \t149 \t54 \t22 \t12 \n",
"sell_spread \t0.9009 \t0.8772 \t0.9259 \t0.8608 \t177 \t150 \t54 \t21 \t12 \n",
"minimum_initial_investment \t0.9850 \t0.9704 \t1.0000 \t0.9789 \t164 \t164 \t68 \t5 \t0 \n",
"benchmark_name \t0.9143 \t0.9057 \t0.9231 \t0.9620 \t55 \t48 \t180 \t5 \t4 \n",
"TOTAL \t0.9472 \t0.9302 \t0.9655 \t0.9283 \t1365 \t1217 \t983 \t120 \t150 \n",
"Total Funds Matched - 237\n",
"Total Funds Not Matched - 132\n",
"Percentage of Funds Matched - 64.22764227642277\n"
]
}
],
@ -478,7 +478,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{