1. optimize benchmark_name instructions

2. A document may contain the same raw fund name multiple times, so do not remove the matched db name from unmatched_db_list when the relevant raw fund/share name is matched.
Otherwise, some raw names could no longer be matched to a db name.
This commit is contained in:
Blade He 2025-03-18 17:22:21 -05:00
parent 0cea2e501b
commit c71936c5ff
6 changed files with 123 additions and 91 deletions

View File

@ -1376,8 +1376,8 @@ def clean_text(text: str):
def merge_inference_data(): def merge_inference_data():
file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi.xlsx" file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx"
file2 = r"/data/aus_prospectus/output/merged_data/docs/excel/merged_420339794.xlsx" file2 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_3_documents_by_text_20250318171348.xlsx"
columns = [ columns = [
"doc_id", "doc_id",
"effective_date", "effective_date",
@ -1404,20 +1404,30 @@ def merge_inference_data():
"change_recoverable_expenses" "change_recoverable_expenses"
] ]
file1_data_df = pd.read_excel(file1, sheet_name="total_mapping_data") document_id_list = [384508026, 521606755, 544886057]
file1_data_df = file1_data_df[columns] file1_total_mapping_data_df = pd.read_excel(file1, sheet_name="total_mapping_data")
file1_total_mapping_data_df = file1_total_mapping_data_df[columns]
# remove the rows which doc_id is 420339794 from file1_data_df # remove the rows which doc_id is 420339794 from file1_data_df
file1_data_df = file1_data_df[file1_data_df["doc_id"] != 420339794] file1_total_mapping_data_df = file1_total_mapping_data_df[~(file1_total_mapping_data_df["doc_id"].isin(document_id_list))]
file1_extract_data_df = pd.read_excel(file1, sheet_name="extract_data")
file1_extract_data_df = file1_extract_data_df[~(file1_extract_data_df["doc_id"].isin(document_id_list))]
file2_total_mapping_data_df = pd.read_excel(file2, sheet_name="total_mapping_data")
file2_total_mapping_data_df = file2_total_mapping_data_df[columns]
total_mapping_data_df = pd.concat([file1_total_mapping_data_df, file2_total_mapping_data_df])
total_mapping_data_df.reset_index(drop=True, inplace=True)
file2_extract_data_df = pd.read_excel(file2, sheet_name="extract_data")
total_extract_data_df = pd.concat([file1_extract_data_df, file2_extract_data_df])
total_extract_data_df.reset_index(drop=True, inplace=True)
file2_data_df = pd.read_excel(file2, sheet_name="merged_data")
file2_data_df = file2_data_df[columns]
total_data_df = pd.concat([file1_data_df, file2_data_df])
total_data_df.reset_index(drop=True, inplace=True)
output_folder = r"/data/aus_prospectus/output/mapping_data/total/" output_folder = r"/data/aus_prospectus/output/mapping_data/total/"
output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250317_Ravi_modified.xlsx") output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250318124530_new.xlsx")
with pd.ExcelWriter(output_file) as f: with pd.ExcelWriter(output_file) as f:
total_data_df.to_excel(f, index=False, sheet_name="total_mapping_data") total_mapping_data_df.to_excel(f, index=False, sheet_name="total_mapping_data")
total_extract_data_df.to_excel(f, index=False, sheet_name="total_extract_data")

View File

@ -460,8 +460,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2],
step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2],
llm_flag=False)) llm_flag=False))
if db_list[matched_index] in unmatched_db_list: # if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list: if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index]) unmatched_pred_list.remove(pred_list[index])
@ -485,8 +485,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1, step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1,
step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2], step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2],
step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False)) step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False))
if db_list[matched_index] in unmatched_db_list: # if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list: if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index]) unmatched_pred_list.remove(pred_list[index])
@ -513,8 +513,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2, step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2,
step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2], step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2],
step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False)) step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False))
if db_list[matched_index] in unmatched_db_list: # if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list: if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index]) unmatched_pred_list.remove(pred_list[index])
@ -543,8 +543,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3, step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3,
step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2], step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2],
step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False)) step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False))
if db_list[matched_index] in unmatched_db_list: # if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list: if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index]) unmatched_pred_list.remove(pred_list[index])
@ -585,8 +585,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
# print("unmatched_pred_list: ",unmatched_pred_list) # print("unmatched_pred_list: ",unmatched_pred_list)
# print("db_list[matched_index]: ",db_list[matched_index]) # print("db_list[matched_index]: ",db_list[matched_index])
# print("pred_list[index]: ",pred_list[index]) # print("pred_list[index]: ",pred_list[index])
if db_list[matched_index] in unmatched_db_list: # if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index]) # unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list: if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index]) unmatched_pred_list.remove(pred_list[index])
@ -663,20 +663,27 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
# print("k: ",k) # print("k: ",k)
# print("v: ",v) # print("v: ",v)
og_db_index=-1 og_db_index=-1
og_pred_index = -1 # og_pred_index = -1
og_pred_index_list = []
if k in cleaned_unmatched_pred_list: if k in cleaned_unmatched_pred_list:
og_pred_index = cleaned_unmatched_pred_list.index(k) for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
if c_item==k:
og_pred_index_list.append(c_idx)
# og_pred_index = cleaned_unmatched_pred_list.index(k)
if og_pred_index == -1: if len(og_pred_index_list) == 0:
# sometimes, the raw name and db name reversed from the LLM response # sometimes, the raw name and db name reversed from the LLM response
if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list: if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list:
og_pred_index = cleaned_unmatched_pred_list.index(v) for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
if c_item==v:
og_pred_index_list.append(c_idx)
# og_pred_index = cleaned_unmatched_pred_list.index(v)
og_db_index = cleaned_unmatched_db_list.index(k) og_db_index = cleaned_unmatched_db_list.index(k)
# v and k are swapped # v and k are swapped
temp = v temp = v
v = k v = k
k = temp k = temp
if og_pred_index==-1: if len(og_pred_index_list)==0:
continue continue
# og_db_index = cleaned_unmatched_db_list.index(v) # og_db_index = cleaned_unmatched_db_list.index(v)
if og_db_index == -1 and v in cleaned_unmatched_db_list: if og_db_index == -1 and v in cleaned_unmatched_db_list:
@ -685,21 +692,22 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
# print("unmatched_db_list: ",unmatched_db_list) # print("unmatched_db_list: ",unmatched_db_list)
for i in df_data: for i in df_data:
if i['pred_fund']==unmatched_pred_list[og_pred_index]: for og_pred_index in og_pred_index_list:
if og_db_index!=-1: if i['pred_fund']==unmatched_pred_list[og_pred_index]:
i['db_fund']=unmatched_db_list[og_db_index] if og_db_index!=-1:
i['cleaned_db_fund_name'] = v i['db_fund']=unmatched_db_list[og_db_index]
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]}) i['cleaned_db_fund_name'] = v
else: final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
i['db_fund'] = '' else:
i['cleaned_db_fund_name'] = '' i['db_fund'] = ''
final_result.update({unmatched_pred_list[og_pred_index]:""}) i['cleaned_db_fund_name'] = ''
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list final_result.update({unmatched_pred_list[og_pred_index]:""})
i['llm_clean_db_list'] = cleaned_unmatched_db_list, i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
i['llm_pred_fund'] = k i['llm_clean_db_list'] = cleaned_unmatched_db_list,
i['llm_matched_db_name'] = v i['llm_pred_fund'] = k
i['llm_result'] = llm_result i['llm_matched_db_name'] = v
break i['llm_result'] = llm_result
break
# break # break

View File

@ -793,7 +793,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num not in [4, 5]: # if page_num not in [14]:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue

View File

@ -555,15 +555,23 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}", "{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
"---Example 3 Start---", "---Example 3 Start---",
"\n\nFund name \nComposite benchmark \nCFS Select High \nGrowth \n1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% \nMSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% \nMSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small \nOrdinaries Index. ", "Composite benchmarks \n\nThe objective for some funds includes a reference to a composite benchmark. They may be subject to change \nat any time within the allocation ranges. \n\nFund name \nComposite benchmark \nCFS Defensive \nBuilder \n4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core \nWholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index \n(Unfrozen) Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate \nIndex (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg \nAusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index. \nCFS Growth Builder \n29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, \n5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD \nhedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE \nDeveloped Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure \nIndex (Unfrozen) Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate \nCorporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index. \n",
"---Example 3 End---", "---Example 3 End---",
"For this example, there are multiple fund names with multiple benchmark names with weightings, please extract them all.",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"CFS Select High Growth\", \"benchmark_name\": \"1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% MSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small Ordinaries Index\"}]}", "{\"data\": [{\"fund name\": \"CFS Defensive Builder\", \"benchmark_name\": \"4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg AusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index\"}, {\"fund name\": \"CFS Growth Builder\", \"benchmark_name\": \"29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD hedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE Developed Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index\"}]}",
"---Example 4 Start---", "---Example 4 Start---",
"\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nAusbil Australian \nEmerging Leaders \n15.375% \n(maximum 15% ex-GST) \nComposite benchmark (70% S&P/ASX Midcap 50 \nAccumulation Index and 30% S&P/ASX Small Ordinaries \nAccumulation Index) plus 0.85% pa \nMonthly \n", "\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nAusbil Australian \nEmerging Leaders \n15.375% \n(maximum 15% ex-GST) \nComposite benchmark (70% S&P/ASX Midcap 50 \nAccumulation Index and 30% S&P/ASX Small Ordinaries \nAccumulation Index) plus 0.85% pa \nMonthly \n",
"---Example 4 End---", "---Example 4 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}", "{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
"---Example 5 Start---",
"Fixed interest / Income \nCash \nUBS Diversified Fixed Income BT Cash Management Trust \nBenchmark \n50% Bloomberg Barclays Global \nAggregate Index (A$ hedged), 50% \nBloomberg AusBond Composite \n0+ Yr Index* \nBloomberg AusBond Bank Bill Index* \n",
"---Example 5 End---",
"For this example, please read carefully for fund names in same line: \"UBS Diversified Fixed Income BT Cash Management Trust\", there are 2 fund names: \"UBS Diversified Fixed Income\" and \"BT Cash Management Trust\".",
"There are 2 benchmark names: \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\" and \"Bloomberg AusBond Bank Bill Index\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"UBS Diversified Fixed Income\", \"benchmark_name\": \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\"}, {\"fund name\": \"BT Cash Management Trust\", \"benchmark_name\": \"Bloomberg AusBond Bank Bill Index\"}]}",
"\n", "\n",
"C. Don't extract benchmark name from context when fit below cases.", "C. Don't extract benchmark name from context when fit below cases.",
"1. Exclude benchmark name when its reported name is \"Return target\".", "1. Exclude benchmark name when its reported name is \"Return target\".",
@ -586,6 +594,12 @@
"Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ", "Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ",
"They indicate the desired outcome rather than serving as a comparative measure against market performance.", "They indicate the desired outcome rather than serving as a comparative measure against market performance.",
"The output should be:", "The output should be:",
"{\"data\": []}",
"D. If extracted multiple benchmark names, but without weightings, e.g. 50% or 30%, please ignore and output empty.",
"---Example Start---",
"This is calculated by using the weighted average of the \nasset allocation neutral position and the index returns for each asset class. \n\nBT Multi-manager Growth Fund, BT Multi-manager Balanced Fund, BT Multi-manager Conservative Fund \nand BT Multi-manager High Growth Fund \n\nAsset class Indices \nAustralian shares S&P/ASX 300 Accumulation Index \nInternational shares MSCI World ex Australia $A (Net Dividends Reinvested) \nMSCI World ex Australia Hedged $A (Net Dividends Reinvested) \nMSCI Emerging Market (Net Dividends Reinvested) in AUD \nAustralian property S&P/ASX 300 A-REIT Accumulation Index \nInternational property FTSE EPRA/NAREIT Developed Hedged in AUD Net TRI \n",
"---Example End---",
"The output should be:",
"{\"data\": []}" "{\"data\": []}"
] ]
}, },

View File

@ -1538,7 +1538,7 @@ if __name__ == "__main__":
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# special_doc_id_list = ["420339794"] # special_doc_id_list = ["521606755", "384508026", "544886057"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -36,14 +36,14 @@
"\n", "\n",
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
"# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n",
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530.xlsx\"\n", "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250318124530_new.xlsx\"\n",
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
"\n" "\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 21,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -316,7 +316,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 22,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -330,53 +330,53 @@
"All Providers Results: \n", "All Providers Results: \n",
"Document List File - None\n", "Document List File - None\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9211 \t0.8861 \t0.9589 \t0.8544 \t409 \t350 \t2 \t45 \t15 \n", "management_fee_and_costs \t0.9321 \t0.8838 \t0.9859 \t0.8734 \t400 \t350 \t2 \t46 \t5 \n",
"management_fee \t0.9419 \t0.9241 \t0.9605 \t0.8908 \t409 \t365 \t2 \t30 \t15 \n", "management_fee \t0.9516 \t0.9192 \t0.9864 \t0.9082 \t400 \t364 \t2 \t32 \t5 \n",
"performance_fee_costs \t0.8987 \t0.9325 \t0.8672 \t0.8714 \t284 \t235 \t124 \t17 \t36 \n", "performance_fee_costs \t0.8992 \t0.8821 \t0.9170 \t0.8710 \t275 \t232 \t119 \t31 \t21 \n",
"interposed_vehicle_performance_fee_cost \t0.9600 \t0.9231 \t1.0000 \t0.9854 \t73 \t72 \t334 \t6 \t0 \n", "interposed_vehicle_performance_fee_cost \t0.9600 \t0.9231 \t1.0000 \t0.9851 \t73 \t72 \t325 \t6 \t0 \n",
"administration_fees \t0.9853 \t0.9710 \t1.0000 \t0.9951 \t67 \t67 \t343 \t2 \t0 \n", "administration_fees \t0.9920 \t0.9841 \t1.0000 \t0.9975 \t62 \t62 \t340 \t1 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t343 \t0 \t0 \n", "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t63 \t63 \t340 \t0 \t0 \n",
"buy_spread \t0.9365 \t0.9242 \t0.9491 \t0.8956 \t352 \t317 \t52 \t26 \t17 \n", "buy_spread \t0.9339 \t0.9129 \t0.9560 \t0.8933 \t340 \t304 \t56 \t29 \t14 \n",
"sell_spread \t0.9412 \t0.9329 \t0.9496 \t0.9029 \t352 \t320 \t52 \t23 \t17 \n", "sell_spread \t0.9388 \t0.9219 \t0.9564 \t0.9007 \t340 \t307 \t56 \t26 \t14 \n",
"minimum_initial_investment \t0.9737 \t0.9642 \t0.9834 \t0.9612 \t301 \t296 \t100 \t11 \t5 \n", "minimum_initial_investment \t0.9694 \t0.9628 \t0.9760 \t0.9553 \t292 \t285 \t100 \t11 \t7 \n",
"benchmark_name \t0.8137 \t0.8295 \t0.7985 \t0.8811 \t144 \t107 \t256 \t22 \t27 \n", "benchmark_name \t0.9023 \t0.8759 \t0.9302 \t0.9355 \t140 \t120 \t257 \t17 \t9 \n",
"TOTAL \t0.9372 \t0.9288 \t0.9467 \t0.9238 \t2460 \t2198 \t1608 \t182 \t132 \n", "TOTAL \t0.9479 \t0.9266 \t0.9708 \t0.9320 \t2385 \t2159 \t1597 \t199 \t75 \n",
"Total Funds Matched - 412\n", "Total Funds Matched - 403\n",
"Total Funds Not Matched - 153\n", "Total Funds Not Matched - 162\n",
"Percentage of Funds Matched - 72.9203539823009\n", "Percentage of Funds Matched - 71.32743362831859\n",
"All Providers Results: \n", "All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9457 \t0.8970 \t1.0000 \t0.8970 \t164 \t148 \t0 \t17 \t0 \n", "management_fee_and_costs \t0.9494 \t0.9036 \t1.0000 \t0.9036 \t165 \t150 \t0 \t16 \t0 \n",
"management_fee \t0.9783 \t0.9576 \t1.0000 \t0.9576 \t164 \t158 \t0 \t7 \t0 \n", "management_fee \t0.9753 \t0.9518 \t1.0000 \t0.9518 \t165 \t158 \t0 \t8 \t0 \n",
"performance_fee_costs \t0.8263 \t0.8846 \t0.7753 \t0.8242 \t95 \t69 \t67 \t9 \t20 \n", "performance_fee_costs \t0.8427 \t0.7979 \t0.8929 \t0.8313 \t96 \t75 \t63 \t19 \t9 \n",
"interposed_vehicle_performance_fee_cost \t0.9455 \t0.8966 \t1.0000 \t0.9636 \t53 \t52 \t107 \t6 \t0 \n", "interposed_vehicle_performance_fee_cost \t0.9455 \t0.8966 \t1.0000 \t0.9639 \t53 \t52 \t108 \t6 \t0 \n",
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t1 \t1 \t164 \t0 \t0 \n", "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t2 \t2 \t164 \t0 \t0 \n",
"buy_spread \t0.9812 \t0.9752 \t0.9874 \t0.9636 \t162 \t157 \t2 \t4 \t2 \n", "buy_spread \t0.9718 \t0.9568 \t0.9873 \t0.9458 \t163 \t155 \t2 \t7 \t2 \n",
"sell_spread \t0.9876 \t0.9876 \t0.9876 \t0.9758 \t162 \t159 \t2 \t2 \t2 \n", "sell_spread \t0.9782 \t0.9691 \t0.9874 \t0.9578 \t163 \t157 \t2 \t5 \t2 \n",
"minimum_initial_investment \t0.9569 \t0.9531 \t0.9606 \t0.9333 \t127 \t122 \t32 \t6 \t5 \n", "minimum_initial_investment \t0.9490 \t0.9528 \t0.9453 \t0.9217 \t128 \t121 \t32 \t6 \t7 \n",
"benchmark_name \t0.7733 \t0.7945 \t0.7532 \t0.7939 \t85 \t58 \t73 \t15 \t19 \n", "benchmark_name \t0.8944 \t0.8571 \t0.9351 \t0.8976 \t85 \t72 \t77 \t12 \t5 \n",
"TOTAL \t0.9328 \t0.9273 \t0.9405 \t0.9232 \t1013 \t924 \t447 \t66 \t180 \n", "TOTAL \t0.9451 \t0.9206 \t0.9720 \t0.9304 \t1020 \t942 \t448 \t79 \t100 \n",
"Total Funds Matched - 165\n", "Total Funds Matched - 166\n",
"Total Funds Not Matched - 31\n", "Total Funds Not Matched - 30\n",
"Percentage of Funds Matched - 84.18367346938776\n", "Percentage of Funds Matched - 84.6938775510204\n",
"All Providers Results: \n", "All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9038 \t0.8783 \t0.9309 \t0.8259 \t245 \t202 \t2 \t28 \t15 \n", "management_fee_and_costs \t0.9195 \t0.8696 \t0.9756 \t0.8523 \t235 \t200 \t2 \t30 \t5 \n",
"management_fee \t0.9159 \t0.9000 \t0.9324 \t0.8462 \t245 \t207 \t2 \t23 \t15 \n", "management_fee \t0.9342 \t0.8957 \t0.9763 \t0.8776 \t235 \t206 \t2 \t24 \t5 \n",
"performance_fee_costs \t0.9326 \t0.9540 \t0.9121 \t0.9028 \t189 \t166 \t57 \t8 \t16 \n", "performance_fee_costs \t0.9290 \t0.9290 \t0.9290 \t0.8987 \t179 \t157 \t56 \t12 \t12 \n",
"interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t227 \t0 \t0 \n", "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t217 \t0 \t0 \n",
"administration_fees \t0.9851 \t0.9706 \t1.0000 \t0.9919 \t66 \t66 \t179 \t2 \t0 \n", "administration_fees \t0.9917 \t0.9836 \t1.0000 \t0.9958 \t60 \t60 \t176 \t1 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t178 \t0 \t0 \n", "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t63 \t63 \t174 \t0 \t0 \n",
"buy_spread \t0.8964 \t0.8791 \t0.9143 \t0.8502 \t190 \t160 \t50 \t22 \t15 \n", "buy_spread \t0.8976 \t0.8713 \t0.9255 \t0.8565 \t177 \t149 \t54 \t22 \t12 \n",
"sell_spread \t0.8994 \t0.8846 \t0.9148 \t0.8543 \t190 \t161 \t50 \t21 \t15 \n", "sell_spread \t0.9009 \t0.8772 \t0.9259 \t0.8608 \t177 \t150 \t54 \t21 \t12 \n",
"minimum_initial_investment \t0.9858 \t0.9721 \t1.0000 \t0.9798 \t174 \t174 \t68 \t5 \t0 \n", "minimum_initial_investment \t0.9850 \t0.9704 \t1.0000 \t0.9789 \t164 \t164 \t68 \t5 \t0 \n",
"benchmark_name \t0.8673 \t0.8750 \t0.8596 \t0.9393 \t59 \t49 \t183 \t7 \t8 \n", "benchmark_name \t0.9143 \t0.9057 \t0.9231 \t0.9620 \t55 \t48 \t180 \t5 \t4 \n",
"TOTAL \t0.9386 \t0.9314 \t0.9464 \t0.9190 \t1447 \t1274 \t996 \t116 \t264 \n", "TOTAL \t0.9472 \t0.9302 \t0.9655 \t0.9283 \t1365 \t1217 \t983 \t120 \t150 \n",
"Total Funds Matched - 247\n", "Total Funds Matched - 237\n",
"Total Funds Not Matched - 122\n", "Total Funds Not Matched - 132\n",
"Percentage of Funds Matched - 66.93766937669376\n" "Percentage of Funds Matched - 64.22764227642277\n"
] ]
} }
], ],
@ -478,7 +478,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {