optimize investment mapping
This commit is contained in:
parent
f1c0290588
commit
fc80093557
44
main.py
44
main.py
|
|
@ -830,7 +830,7 @@ if __name__ == "__main__":
|
|||
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
||||
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
||||
# replace_rerun_data(new_data_file, original_data_file)
|
||||
test_calculate_metrics()
|
||||
# test_calculate_metrics()
|
||||
# test_replace_abbrevation()
|
||||
# test_translate_pdf()
|
||||
pdf_folder = r"/data/emea_ar/pdf/"
|
||||
|
|
@ -1242,32 +1242,38 @@ if __name__ == "__main__":
|
|||
"501380497",
|
||||
"514636959",
|
||||
"508981020"]
|
||||
# special_doc_id_list = ["514636952"]
|
||||
special_doc_id_list = ["501380801",
|
||||
"501600541",
|
||||
"507720522",
|
||||
"509133771",
|
||||
"514636951",
|
||||
"514636955",
|
||||
"514636993"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
extract_ways = ["text"]
|
||||
# pdf_folder = r"/data/emea_ar/small_pdf/"
|
||||
pdf_folder = r"/data/emea_ar/pdf/"
|
||||
# for extract_way in extract_ways:
|
||||
# batch_start_job(
|
||||
# pdf_folder,
|
||||
# page_filter_ground_truth_file,
|
||||
# output_extract_data_child_folder,
|
||||
# output_mapping_child_folder,
|
||||
# output_extract_data_total_folder,
|
||||
# output_mapping_total_folder,
|
||||
# extract_way,
|
||||
# special_doc_id_list,
|
||||
# re_run_extract_data,
|
||||
# re_run_mapping_data,
|
||||
# force_save_total_data=force_save_total_data,
|
||||
# calculate_metrics=calculate_metrics,
|
||||
# )
|
||||
for extract_way in extract_ways:
|
||||
batch_start_job(
|
||||
pdf_folder,
|
||||
page_filter_ground_truth_file,
|
||||
output_extract_data_child_folder,
|
||||
output_mapping_child_folder,
|
||||
output_extract_data_total_folder,
|
||||
output_mapping_total_folder,
|
||||
extract_way,
|
||||
special_doc_id_list,
|
||||
re_run_extract_data,
|
||||
re_run_mapping_data,
|
||||
force_save_total_data=force_save_total_data,
|
||||
calculate_metrics=calculate_metrics,
|
||||
)
|
||||
|
||||
# test_data_extraction_metrics()
|
||||
# test_mapping_raw_name()
|
||||
|
|
|
|||
|
|
@ -870,7 +870,7 @@ def replace_abbrevation(text: str):
|
|||
new_text_splits.append('no trail')
|
||||
elif split.lower() in ['non']:
|
||||
new_text_splits.append('Not')
|
||||
elif split.lower() in ['net', 'unhgd']:
|
||||
elif split.lower() in ['net', 'unhgd'] or split == "fl":
|
||||
new_text_splits.append('')
|
||||
else:
|
||||
split = split_short_name_with_share_features(split)
|
||||
|
|
|
|||
Loading…
Reference in New Issue