optimize for more cases

This commit is contained in:
Blade He 2024-10-08 17:16:01 -05:00
parent 8bd6008425
commit aa2c2332ae
6 changed files with 217 additions and 128 deletions

View File

@ -206,7 +206,8 @@
"Portfolio turnover ratio", "Portfolio turnover ratio",
"Portfolio turnover rate", "Portfolio turnover rate",
"PTR", "PTR",
"Annual Portfolio Turnover Ratio" "Annual Portfolio Turnover Ratio",
"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid"
], ],
"india": [ "india": [
"Aggregate Value of Purchase and Sale", "Aggregate Value of Purchase and Sale",

View File

@ -361,9 +361,16 @@ class DataMapping:
data_info["legal_name"] = max_similarity_name data_info["legal_name"] = max_similarity_name
data_info["similarity"] = max_similarity data_info["similarity"] = max_similarity
else: else:
data_info["id"] = "" if len(doc_compare_name_list) == 1:
data_info["legal_name"] = "" data_info["id"] = doc_compare_mapping[
data_info["similarity"] = 0 doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0]
][compare_id_dp].values[0]
data_info["legal_name"] = doc_compare_name_list[0]
data_info["similarity"] = 1
else:
data_info["id"] = ""
data_info["legal_name"] = ""
data_info["similarity"] = 0
data_info["investment_type"] = investment_type data_info["investment_type"] = investment_type
else: else:
data_info["id"] = "" data_info["id"] = ""

View File

@ -14,7 +14,7 @@
"3.2 Please extract data from the context." "3.2 Please extract data from the context."
], ],
"reported_name": { "reported_name": {
"tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc.", "tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, Taux de rotation corrigé - Gecorrigeerde omloopsnelheid, etc.",
"ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.", "ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.",
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), etc.", "ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), etc.",
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc." "performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."

295
main.py
View File

@ -258,6 +258,7 @@ def batch_start_job(
re_run_extract_data: bool = False, re_run_extract_data: bool = False,
re_run_mapping_data: bool = False, re_run_mapping_data: bool = False,
force_save_total_data: bool = False, force_save_total_data: bool = False,
calculate_metrics: bool = False,
): ):
pdf_files = glob(pdf_folder + "*.pdf") pdf_files = glob(pdf_folder + "*.pdf")
doc_list = [] doc_list = []
@ -322,48 +323,73 @@ def batch_start_job(
output_mapping_total_folder, output_mapping_total_folder,
f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx", f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx",
) )
doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df)
with pd.ExcelWriter(output_file) as writer: with pd.ExcelWriter(output_file) as writer:
doc_mapping_data_in_db.to_excel(
writer, index=False, sheet_name="data_in_doc_mapping"
)
result_mappingdata_df.to_excel( result_mappingdata_df.to_excel(
writer, index=False, sheet_name="mapping_data" writer, index=False, sheet_name="total_mapping_data"
) )
result_extract_data_df.to_excel( result_extract_data_df.to_excel(
writer, index=False, sheet_name="extract_data" writer, index=False, sheet_name="extract_data"
) )
prediction_sheet_name = "mapping_data"
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
ground_truth_sheet_name = "mapping_data"
metrics_output_folder = r"/data/emea_ar/output/metrics/"
# logger.info(f"Calculating metrics for data extraction")
# missing_error_list, metrics_list, metrics_file = get_metrics(
# "data_extraction",
# output_file,
# prediction_sheet_name,
# ground_truth_file,
# ground_truth_sheet_name,
# metrics_output_folder,
# )
# logger.info(f"Calculating metrics for investment mapping by actual document mapping") if calculate_metrics:
# missing_error_list, metrics_list, metrics_file = get_metrics( prediction_sheet_name = "mapping_data"
# "investment_mapping", ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
# output_file, ground_truth_sheet_name = "mapping_data"
# prediction_sheet_name, metrics_output_folder = r"/data/emea_ar/output/metrics/"
# ground_truth_file,
# ground_truth_sheet_name, # logger.info(f"Calculating metrics for data extraction")
# metrics_output_folder, # missing_error_list, metrics_list, metrics_file = get_metrics(
# ) # "data_extraction",
# output_file,
# prediction_sheet_name,
# ground_truth_file,
# ground_truth_sheet_name,
# metrics_output_folder,
# )
# logger.info(f"Calculating metrics for investment mapping by actual document mapping")
# missing_error_list, metrics_list, metrics_file = get_metrics(
# "investment_mapping",
# output_file,
# prediction_sheet_name,
# ground_truth_file,
# ground_truth_sheet_name,
# metrics_output_folder,
# )
logger.info(f"Calculating metrics for investment mapping by database document mapping")
missing_error_list, metrics_list, metrics_file = get_metrics(
"document_mapping_in_db",
output_file,
prediction_sheet_name,
ground_truth_file,
ground_truth_sheet_name,
metrics_output_folder,
)
def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
doc_id_list = mapping_data["doc_id"].unique().tolist()
data_in_mapping_df_list = []
for doc_id in doc_id_list:
doc_mapping_data = mapping_data[mapping_data["doc_id"] == doc_id]
logger.info(f"Calculating metrics for investment mapping by database document mapping") document_mapping = query_document_fund_mapping(doc_id, rerun=False)
missing_error_list, metrics_list, metrics_file = get_metrics( fund_id_list = document_mapping["FundId"].unique().tolist()
"document_mapping_in_db", sec_id_list = document_mapping["SecId"].unique().tolist()
output_file, id_list = fund_id_list + sec_id_list
prediction_sheet_name, # filter doc_mapping_data by id_list
ground_truth_file, filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)]
ground_truth_sheet_name, data_in_mapping_df_list.append(filter_doc_mapping_data)
metrics_output_folder, result_mapping_data_df = pd.concat(data_in_mapping_df_list)
) result_mapping_data_df.reset_index(drop=True, inplace=True)
return result_mapping_data_df
def batch_filter_pdf_files( def batch_filter_pdf_files(
@ -597,7 +623,7 @@ def test_mapping_raw_name():
if __name__ == "__main__": if __name__ == "__main__":
pdf_folder = r"/data/emea_ar/small_pdf/" pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
) )
@ -637,96 +663,132 @@ if __name__ == "__main__":
# re_run_extract_data) # re_run_extract_data)
# special_doc_id_list = ["505174428", "510326848", "349679479"] # special_doc_id_list = ["505174428", "510326848", "349679479"]
check_mapping_doc_id_list = [ # check_mapping_doc_id_list = [
"327956364", # "327956364",
"391456740", # "391456740",
"391736837", # "391736837",
"458359181", # "458359181",
"486383912", # "486383912",
"497497599", # "497497599",
"529925114", # "529925114",
"321733631", # "321733631",
"334718372", # "334718372",
"344636875", # "344636875",
"362246081", # "362246081",
"445256897", # "445256897",
"449623976", # "449623976",
"458291624", # "458291624",
"478585901", # "478585901",
"492121213", # "492121213",
"502821436", # "502821436",
"507967525", # "507967525",
"481475385", # "481475385",
"508854243", # "508854243",
"520879048", # "520879048",
"402181770", # "402181770",
"463081566", # "463081566",
"502693599", # "502693599",
"509845549", # "509845549",
"389171486", # "389171486",
"323390570", # "323390570",
"366179419", # "366179419",
"486378555", # "486378555",
"506559375", # "506559375",
"479793787", # "479793787",
"471641628", # "471641628",
] # ]
# check_db_mapping_doc_id_list = [
# "292989214",
# "316237292",
# "321733631",
# "323390570",
# "327956364",
# "332223498",
# "333207452",
# "334718372",
# "344636875",
# "362246081",
# "366179419",
# "380945052",
# "382366116",
# "387202452",
# "389171486",
# "391456740",
# "391736837",
# "394778487",
# "401684600",
# "402113224",
# "402181770",
# "402397014",
# "405803396",
# "445102363",
# "445256897",
# "448265376",
# "449555622",
# "449623976",
# "458291624",
# "458359181",
# "463081566",
# "469138353",
# "471641628",
# "476492237",
# "478585901",
# "478586066",
# "479042264",
# "479042269",
# "479793787",
# "481475385",
# "483617247",
# "486378555",
# "486383912",
# "492121213",
# "497497599",
# "502693599"
# ]
check_db_mapping_doc_id_list = [ check_db_mapping_doc_id_list = [
"292989214", "334584772",
"316237292", "406913630",
"321733631", "407275419",
"323390570", "337937633",
"327956364", "337293427",
"332223498", "334584772",
"333207452", "404712928",
"334718372", "451063582",
"344636875", "451878128",
"362246081", "425595958",
"366179419", "536344026",
"380945052", "532422548",
"382366116", "423418540",
"387202452", "423418395",
"389171486", "532998065",
"391456740", "540307575",
"391736837", "423395975",
"394778487", "508704368",
"401684600", "481482392",
"402113224", "466580448",
"402181770", "423365707",
"402397014", "423364758",
"405803396", "422761666",
"445102363", "422760156",
"445256897", "422760148",
"448265376", "422686965",
"449555622", "492029971",
"449623976", "510300817",
"458291624", "512745032",
"458359181", "514213638",
"463081566", "527525440",
"469138353", "534535767"
"471641628",
"476492237",
"478585901",
"478586066",
"479042264",
"479042269",
"479793787",
"481475385",
"483617247",
"486378555",
"486383912",
"492121213",
"497497599",
"502693599"
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["503194284"] # special_doc_id_list = ["337937633"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = False
force_save_total_data = True force_save_total_data = True
calculate_metrics = False
extract_ways = ["text"] extract_ways = ["text"]
for extract_way in extract_ways: for extract_way in extract_ways:
@ -742,6 +804,7 @@ if __name__ == "__main__":
re_run_extract_data, re_run_extract_data,
re_run_mapping_data, re_run_mapping_data,
force_save_total_data=force_save_total_data, force_save_total_data=force_save_total_data,
calculate_metrics=calculate_metrics,
) )
# test_data_extraction_metrics() # test_data_extraction_metrics()

View File

@ -39,11 +39,14 @@ def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
) )
def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str): def download_pdf(doc_provider_file_path: str,
sheet_name: str,
pdf_path: str,
doc_id_column: str = "DocumentId"):
document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name) document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name)
# get all unique docids as list # get all unique docids as list
doc_id_list = [ doc_id_list = [
str(doc_id) for doc_id in document_data["DocumentId"].unique().tolist() str(doc_id) for doc_id in document_data[doc_id_column].unique().tolist()
] ]
# download pdfs # download pdfs
logger.info(f"Start downloading {len(doc_id_list)} pdfs") logger.info(f"Start downloading {len(doc_id_list)} pdfs")
@ -994,8 +997,13 @@ if __name__ == "__main__":
r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx" r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
) )
doc_provider_file_path = r"/data/emea_ar/basic_information/English/emea_sample_doc_from_Kshitij.xlsx"
# download_pdf(random_small_document_data_file, 'random_small_document', pdf_folder) pdf_folder = r"/data/emea_ar/pdf/"
download_pdf(
doc_provider_file_path=doc_provider_file_path,
sheet_name="Sample EMEA Docs",
doc_id_column="Document ID",
pdf_path=pdf_folder)
# output_pdf_page_text(pdf_folder, output_folder) # output_pdf_page_text(pdf_folder, output_folder)
# extract_pdf_table(pdf_folder, output_folder) # extract_pdf_table(pdf_folder, output_folder)
@ -1012,7 +1020,7 @@ if __name__ == "__main__":
# sheet_name="latest_doc_ar_data", # sheet_name="latest_doc_ar_data",
# output_folder=output_data_folder, # output_folder=output_data_folder,
# output_file="latest_doc_ar_mapping_statistics.xlsx") # output_file="latest_doc_ar_mapping_statistics.xlsx")
get_document_extracted_share_diff_by_db() # get_document_extracted_share_diff_by_db()
# statistics_provider_mapping( # statistics_provider_mapping(
# provider_mapping_data_file=provider_mapping_data_file, # provider_mapping_data_file=provider_mapping_data_file,
# output_folder=basic_info_folder, # output_folder=basic_info_folder,

View File

@ -599,8 +599,18 @@ def get_jacard_similarity(text_left,
text_right = text_right.split() text_right = text_right.split()
intersection = set(text_left).intersection(set(text_right)) intersection = set(text_left).intersection(set(text_right))
union = set(text_left).union(set(text_right)) union = set(text_left).union(set(text_right))
if len(union) > 0:
return round(len(intersection) / len(union), 3) intersection_count = len(intersection)
union_count = len(union)
differ_a = list(set(text_left).difference(set(text_right)))
differ_a.sort()
differ_b = list(set(text_right).difference(set(text_left)))
differ_b.sort()
if ''.join(differ_a) == ''.join(differ_b):
intersection_count += len(differ_a) + len(differ_b)
if union_count > 0:
return round(intersection_count / union_count, 3)
else: else:
return 0 return 0