optimize investment mapping

Blade He 2024-11-22 14:54:52 -06:00
parent f1c0290588
commit fc80093557
2 changed files with 26 additions and 20 deletions

main.py (44 changed lines)

@@ -830,7 +830,7 @@ if __name__ == "__main__":
     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
     # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
     # replace_rerun_data(new_data_file, original_data_file)
-    test_calculate_metrics()
+    # test_calculate_metrics()
     # test_replace_abbrevation()
     # test_translate_pdf()
     pdf_folder = r"/data/emea_ar/pdf/"
@@ -1242,32 +1242,38 @@ if __name__ == "__main__":
         "501380497",
         "514636959",
         "508981020"]
-    # special_doc_id_list = ["514636952"]
+    special_doc_id_list = ["501380801",
+                           "501600541",
+                           "507720522",
+                           "509133771",
+                           "514636951",
+                           "514636955",
+                           "514636993"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
-    re_run_mapping_data = False
+    re_run_extract_data = True
+    re_run_mapping_data = True
     force_save_total_data = True
     calculate_metrics = False
     extract_ways = ["text"]
     # pdf_folder = r"/data/emea_ar/small_pdf/"
     pdf_folder = r"/data/emea_ar/pdf/"
-    # for extract_way in extract_ways:
-    #     batch_start_job(
-    #         pdf_folder,
-    #         page_filter_ground_truth_file,
-    #         output_extract_data_child_folder,
-    #         output_mapping_child_folder,
-    #         output_extract_data_total_folder,
-    #         output_mapping_total_folder,
-    #         extract_way,
-    #         special_doc_id_list,
-    #         re_run_extract_data,
-    #         re_run_mapping_data,
-    #         force_save_total_data=force_save_total_data,
-    #         calculate_metrics=calculate_metrics,
-    #     )
+    for extract_way in extract_ways:
+        batch_start_job(
+            pdf_folder,
+            page_filter_ground_truth_file,
+            output_extract_data_child_folder,
+            output_mapping_child_folder,
+            output_extract_data_total_folder,
+            output_mapping_total_folder,
+            extract_way,
+            special_doc_id_list,
+            re_run_extract_data,
+            re_run_mapping_data,
+            force_save_total_data=force_save_total_data,
+            calculate_metrics=calculate_metrics,
+        )
    # test_data_extraction_metrics()
    # test_mapping_raw_name()
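
The re-enabled loop re-processes only the documents listed in special_doc_id_list, with both re-run flags switched on. As a rough illustration of that behaviour, the sketch below filters a PDF folder down to those document ids before a batch run; select_special_pdfs is a hypothetical helper added here for illustration only and is not part of this commit, and batch_start_job may resolve the ids differently.

import os

def select_special_pdfs(pdf_folder, special_doc_id_list):
    # Hypothetical helper: keep only PDFs whose base file name matches one of the listed doc ids.
    selected = []
    for file_name in os.listdir(pdf_folder):
        if not file_name.lower().endswith(".pdf"):
            continue
        doc_id = os.path.splitext(file_name)[0]
        if doc_id in special_doc_id_list:
            selected.append(os.path.join(pdf_folder, file_name))
    return selected

# Example usage (paths assumed): select_special_pdfs("/data/emea_ar/pdf/", ["501380801", "501600541"])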


@@ -870,7 +870,7 @@ def replace_abbrevation(text: str):
             new_text_splits.append('no trail')
         elif split.lower() in ['non']:
             new_text_splits.append('Not')
-        elif split.lower() in ['net', 'unhgd']:
+        elif split.lower() in ['net', 'unhgd'] or split == "fl":
             new_text_splits.append('')
         else:
             split = split_short_name_with_share_features(split)
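
For reference, the branch changed above maps 'net' and 'unhgd' (case-insensitive) and now also the exact, lower-case token "fl" to an empty string. Below is a minimal, self-contained sketch of just that branch so the effect can be checked in isolation; it omits the other branches of replace_abbrevation and is not the full function.

def drop_ignored_tokens(splits):
    # Sketch of the modified branch only: 'net'/'unhgd' in any case, and the exact token "fl", become ''.
    new_text_splits = []
    for split in splits:
        if split.lower() in ['net', 'unhgd'] or split == "fl":
            new_text_splits.append('')
        else:
            new_text_splits.append(split)
    return new_text_splits

# Example: drop_ignored_tokens(["Net", "fl", "FL", "Acc"]) returns ['', '', 'FL', 'Acc']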