From fc800935573b0879b768b9103ff54460680f91ef Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 22 Nov 2024 14:54:52 -0600 Subject: [PATCH] optimize investment mapping --- main.py | 44 +++++++++++++++++++++++++------------------- utils/biz_utils.py | 2 +- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/main.py b/main.py index 8d0a21c..a1cfd3d 100644 --- a/main.py +++ b/main.py @@ -830,7 +830,7 @@ if __name__ == "__main__": # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx" # replace_rerun_data(new_data_file, original_data_file) - test_calculate_metrics() + # test_calculate_metrics() # test_replace_abbrevation() # test_translate_pdf() pdf_folder = r"/data/emea_ar/pdf/" @@ -1242,32 +1242,38 @@ if __name__ == "__main__": "501380497", "514636959", "508981020"] - # special_doc_id_list = ["514636952"] + special_doc_id_list = ["501380801", + "501600541", + "507720522", + "509133771", + "514636951", + "514636955", + "514636993"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = True calculate_metrics = False extract_ways = ["text"] # pdf_folder = r"/data/emea_ar/small_pdf/" pdf_folder = r"/data/emea_ar/pdf/" - # for extract_way in extract_ways: - # batch_start_job( - # pdf_folder, - # page_filter_ground_truth_file, - # output_extract_data_child_folder, - # output_mapping_child_folder, - # output_extract_data_total_folder, - # output_mapping_total_folder, - # extract_way, - # special_doc_id_list, - # re_run_extract_data, - # re_run_mapping_data, - # force_save_total_data=force_save_total_data, - # calculate_metrics=calculate_metrics, - # ) + for extract_way in extract_ways: + batch_start_job( + pdf_folder, + page_filter_ground_truth_file, + output_extract_data_child_folder, + output_mapping_child_folder, + output_extract_data_total_folder, + output_mapping_total_folder, + extract_way, + special_doc_id_list, + re_run_extract_data, + re_run_mapping_data, + force_save_total_data=force_save_total_data, + calculate_metrics=calculate_metrics, + ) # test_data_extraction_metrics() # test_mapping_raw_name() diff --git a/utils/biz_utils.py b/utils/biz_utils.py index c56a668..9e75d6d 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -870,7 +870,7 @@ def replace_abbrevation(text: str): new_text_splits.append('no trail') elif split.lower() in ['non']: new_text_splits.append('Not') - elif split.lower() in ['net', 'unhgd']: + elif split.lower() in ['net', 'unhgd'] or split == "fl": new_text_splits.append('') else: split = split_short_name_with_share_features(split)