From aa2c2332ae289c3518fb86067d28211b1ae88c89 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 8 Oct 2024 17:16:01 -0500 Subject: [PATCH] optimize for more cases --- configuration/datapoint_keyword.json | 3 +- core/data_mapping.py | 13 +- .../data_extraction_prompts_config.json | 2 +- main.py | 295 +++++++++++------- prepare_data.py | 18 +- utils/biz_utils.py | 14 +- 6 files changed, 217 insertions(+), 128 deletions(-) diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json index c54d312..c132cf0 100644 --- a/configuration/datapoint_keyword.json +++ b/configuration/datapoint_keyword.json @@ -206,7 +206,8 @@ "Portfolio turnover ratio", "Portfolio turnover rate", "PTR", - "Annual Portfolio Turnover Ratio" + "Annual Portfolio Turnover Ratio", + "Taux de rotation corrigé - Gecorrigeerde omloopsnelheid" ], "india": [ "Aggregate Value of Purchase and Sale", diff --git a/core/data_mapping.py b/core/data_mapping.py index 7a3015b..a3dfc40 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -361,9 +361,16 @@ class DataMapping: data_info["legal_name"] = max_similarity_name data_info["similarity"] = max_similarity else: - data_info["id"] = "" - data_info["legal_name"] = "" - data_info["similarity"] = 0 + if len(doc_compare_name_list) == 1: + data_info["id"] = doc_compare_mapping[ + doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0] + ][compare_id_dp].values[0] + data_info["legal_name"] = doc_compare_name_list[0] + data_info["similarity"] = 1 + else: + data_info["id"] = "" + data_info["legal_name"] = "" + data_info["similarity"] = 0 data_info["investment_type"] = investment_type else: data_info["id"] = "" diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 3e24a5d..700c310 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -14,7 +14,7 @@ "3.2 Please extract data from the context." 
], "reported_name": { - "tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc.", + "tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, Taux de rotation corrigé - Gecorrigeerde omloopsnelheid, etc.", "ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.", "ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), etc.", "performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc." 
diff --git a/main.py b/main.py index 889973a..9dd739b 100644 --- a/main.py +++ b/main.py @@ -258,6 +258,7 @@ def batch_start_job( re_run_extract_data: bool = False, re_run_mapping_data: bool = False, force_save_total_data: bool = False, + calculate_metrics: bool = False, ): pdf_files = glob(pdf_folder + "*.pdf") doc_list = [] @@ -322,48 +323,73 @@ output_mapping_total_folder, f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx", ) + + doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df) with pd.ExcelWriter(output_file) as writer: + doc_mapping_data_in_db.to_excel( + writer, index=False, sheet_name="data_in_doc_mapping" + ) result_mappingdata_df.to_excel( - writer, index=False, sheet_name="mapping_data" + writer, index=False, sheet_name="total_mapping_data" ) result_extract_data_df.to_excel( writer, index=False, sheet_name="extract_data" ) - prediction_sheet_name = "mapping_data" - ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" - ground_truth_sheet_name = "mapping_data" - metrics_output_folder = r"/data/emea_ar/output/metrics/" - - # logger.info(f"Calculating metrics for data extraction") - # missing_error_list, metrics_list, metrics_file = get_metrics( - # "data_extraction", - # output_file, - # prediction_sheet_name, - # ground_truth_file, - # ground_truth_sheet_name, - # metrics_output_folder, - # ) - # logger.info(f"Calculating metrics for investment mapping by actual document mapping") - # missing_error_list, metrics_list, metrics_file = get_metrics( - # "investment_mapping", - # output_file, - # prediction_sheet_name, - # ground_truth_file, - # ground_truth_sheet_name, - # metrics_output_folder, - # ) + if calculate_metrics: + prediction_sheet_name = "total_mapping_data" + ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" + ground_truth_sheet_name = "mapping_data" + 
metrics_output_folder = r"/data/emea_ar/output/metrics/" + + # logger.info(f"Calculating metrics for data extraction") + # missing_error_list, metrics_list, metrics_file = get_metrics( + # "data_extraction", + # output_file, + # prediction_sheet_name, + # ground_truth_file, + # ground_truth_sheet_name, + # metrics_output_folder, + # ) + + # logger.info(f"Calculating metrics for investment mapping by actual document mapping") + # missing_error_list, metrics_list, metrics_file = get_metrics( + # "investment_mapping", + # output_file, + # prediction_sheet_name, + # ground_truth_file, + # ground_truth_sheet_name, + # metrics_output_folder, + # ) + + logger.info(f"Calculating metrics for investment mapping by database document mapping") + missing_error_list, metrics_list, metrics_file = get_metrics( + "document_mapping_in_db", + output_file, + prediction_sheet_name, + ground_truth_file, + ground_truth_sheet_name, + metrics_output_folder, + ) + + +def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> pd.DataFrame: + doc_id_list = mapping_data["doc_id"].unique().tolist() + data_in_mapping_df_list = [] + for doc_id in doc_id_list: + doc_mapping_data = mapping_data[mapping_data["doc_id"] == doc_id] - logger.info(f"Calculating metrics for investment mapping by database document mapping") - missing_error_list, metrics_list, metrics_file = get_metrics( - "document_mapping_in_db", - output_file, - prediction_sheet_name, - ground_truth_file, - ground_truth_sheet_name, - metrics_output_folder, - ) + document_mapping = query_document_fund_mapping(doc_id, rerun=False) + fund_id_list = document_mapping["FundId"].unique().tolist() + sec_id_list = document_mapping["SecId"].unique().tolist() + id_list = fund_id_list + sec_id_list + # filter doc_mapping_data by id_list + filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)] + data_in_mapping_df_list.append(filter_doc_mapping_data) + result_mapping_data_df = pd.concat(data_in_mapping_df_list) + 
result_mapping_data_df.reset_index(drop=True, inplace=True) + return result_mapping_data_df def batch_filter_pdf_files( @@ -597,7 +623,7 @@ def test_mapping_raw_name(): if __name__ == "__main__": - pdf_folder = r"/data/emea_ar/small_pdf/" + pdf_folder = r"/data/emea_ar/pdf/" page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) @@ -637,96 +663,132 @@ if __name__ == "__main__": # re_run_extract_data) # special_doc_id_list = ["505174428", "510326848", "349679479"] - check_mapping_doc_id_list = [ - "327956364", - "391456740", - "391736837", - "458359181", - "486383912", - "497497599", - "529925114", - "321733631", - "334718372", - "344636875", - "362246081", - "445256897", - "449623976", - "458291624", - "478585901", - "492121213", - "502821436", - "507967525", - "481475385", - "508854243", - "520879048", - "402181770", - "463081566", - "502693599", - "509845549", - "389171486", - "323390570", - "366179419", - "486378555", - "506559375", - "479793787", - "471641628", - ] + # check_mapping_doc_id_list = [ + # "327956364", + # "391456740", + # "391736837", + # "458359181", + # "486383912", + # "497497599", + # "529925114", + # "321733631", + # "334718372", + # "344636875", + # "362246081", + # "445256897", + # "449623976", + # "458291624", + # "478585901", + # "492121213", + # "502821436", + # "507967525", + # "481475385", + # "508854243", + # "520879048", + # "402181770", + # "463081566", + # "502693599", + # "509845549", + # "389171486", + # "323390570", + # "366179419", + # "486378555", + # "506559375", + # "479793787", + # "471641628", + # ] + # check_db_mapping_doc_id_list = [ + # "292989214", + # "316237292", + # "321733631", + # "323390570", + # "327956364", + # "332223498", + # "333207452", + # "334718372", + # "344636875", + # "362246081", + # "366179419", + # "380945052", + # "382366116", + # "387202452", + # "389171486", + # "391456740", + # "391736837", + # "394778487", + # "401684600", + # 
"402113224", + # "402181770", + # "402397014", + # "405803396", + # "445102363", + # "445256897", + # "448265376", + # "449555622", + # "449623976", + # "458291624", + # "458359181", + # "463081566", + # "469138353", + # "471641628", + # "476492237", + # "478585901", + # "478586066", + # "479042264", + # "479042269", + # "479793787", + # "481475385", + # "483617247", + # "486378555", + # "486383912", + # "492121213", + # "497497599", + # "502693599" + # ] + check_db_mapping_doc_id_list = [ - "292989214", - "316237292", - "321733631", - "323390570", - "327956364", - "332223498", - "333207452", - "334718372", - "344636875", - "362246081", - "366179419", - "380945052", - "382366116", - "387202452", - "389171486", - "391456740", - "391736837", - "394778487", - "401684600", - "402113224", - "402181770", - "402397014", - "405803396", - "445102363", - "445256897", - "448265376", - "449555622", - "449623976", - "458291624", - "458359181", - "463081566", - "469138353", - "471641628", - "476492237", - "478585901", - "478586066", - "479042264", - "479042269", - "479793787", - "481475385", - "483617247", - "486378555", - "486383912", - "492121213", - "497497599", - "502693599" + "334584772", + "406913630", + "407275419", + "337937633", + "337293427", + "334584772", + "404712928", + "451063582", + "451878128", + "425595958", + "536344026", + "532422548", + "423418540", + "423418395", + "532998065", + "540307575", + "423395975", + "508704368", + "481482392", + "466580448", + "423365707", + "423364758", + "422761666", + "422760156", + "422760148", + "422686965", + "492029971", + "510300817", + "512745032", + "514213638", + "527525440", + "534535767" ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["503194284"] + # special_doc_id_list = ["337937633"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" 
re_run_extract_data = False - re_run_mapping_data = True + re_run_mapping_data = False force_save_total_data = True + calculate_metrics = False extract_ways = ["text"] for extract_way in extract_ways: @@ -742,6 +804,7 @@ if __name__ == "__main__": re_run_extract_data, re_run_mapping_data, force_save_total_data=force_save_total_data, + calculate_metrics=calculate_metrics, ) # test_data_extraction_metrics() diff --git a/prepare_data.py b/prepare_data.py index 9453f2e..cfed2ce 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -39,11 +39,14 @@ def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str): ) -def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str): +def download_pdf(doc_provider_file_path: str, + sheet_name: str, + pdf_path: str, + doc_id_column: str = "DocumentId"): document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name) # get all unique docids as list doc_id_list = [ - str(doc_id) for doc_id in document_data["DocumentId"].unique().tolist() + str(doc_id) for doc_id in document_data[doc_id_column].unique().tolist() ] # download pdfs logger.info(f"Start downloading {len(doc_id_list)} pdfs") @@ -994,8 +997,13 @@ if __name__ == "__main__": r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx" ) - - # download_pdf(random_small_document_data_file, 'random_small_document', pdf_folder) + doc_provider_file_path = r"/data/emea_ar/basic_information/English/emea_sample_doc_from_Kshitij.xlsx" + pdf_folder = r"/data/emea_ar/pdf/" + download_pdf( + doc_provider_file_path=doc_provider_file_path, + sheet_name="Sample EMEA Docs", + doc_id_column="Document ID", + pdf_path=pdf_folder) # output_pdf_page_text(pdf_folder, output_folder) # extract_pdf_table(pdf_folder, output_folder) @@ -1012,7 +1020,7 @@ if __name__ == "__main__": # sheet_name="latest_doc_ar_data", # output_folder=output_data_folder, # output_file="latest_doc_ar_mapping_statistics.xlsx") - 
get_document_extracted_share_diff_by_db() + # get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file, # output_folder=basic_info_folder, diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 5760268..4fa8539 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -599,8 +599,18 @@ def get_jacard_similarity(text_left, text_right = text_right.split() intersection = set(text_left).intersection(set(text_right)) union = set(text_left).union(set(text_right)) - if len(union) > 0: - return round(len(intersection) / len(union), 3) + + intersection_count = len(intersection) + union_count = len(union) + + differ_a = list(set(text_left).difference(set(text_right))) + differ_a.sort() + differ_b = list(set(text_right).difference(set(text_left))) + differ_b.sort() + if ''.join(differ_a) == ''.join(differ_b): + intersection_count += len(differ_a) + len(differ_b) + if union_count > 0: + return round(intersection_count / union_count, 3) else: return 0