optimize investment mapping
This commit is contained in:
parent
f1c0290588
commit
fc80093557
44
main.py
44
main.py
|
|
@ -830,7 +830,7 @@ if __name__ == "__main__":
|
||||||
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
||||||
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
||||||
# replace_rerun_data(new_data_file, original_data_file)
|
# replace_rerun_data(new_data_file, original_data_file)
|
||||||
test_calculate_metrics()
|
# test_calculate_metrics()
|
||||||
# test_replace_abbrevation()
|
# test_replace_abbrevation()
|
||||||
# test_translate_pdf()
|
# test_translate_pdf()
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
|
|
@ -1242,32 +1242,38 @@ if __name__ == "__main__":
|
||||||
"501380497",
|
"501380497",
|
||||||
"514636959",
|
"514636959",
|
||||||
"508981020"]
|
"508981020"]
|
||||||
# special_doc_id_list = ["514636952"]
|
special_doc_id_list = ["501380801",
|
||||||
|
"501600541",
|
||||||
|
"507720522",
|
||||||
|
"509133771",
|
||||||
|
"514636951",
|
||||||
|
"514636955",
|
||||||
|
"514636993"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
# pdf_folder = r"/data/emea_ar/small_pdf/"
|
# pdf_folder = r"/data/emea_ar/small_pdf/"
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
# for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
# batch_start_job(
|
batch_start_job(
|
||||||
# pdf_folder,
|
pdf_folder,
|
||||||
# page_filter_ground_truth_file,
|
page_filter_ground_truth_file,
|
||||||
# output_extract_data_child_folder,
|
output_extract_data_child_folder,
|
||||||
# output_mapping_child_folder,
|
output_mapping_child_folder,
|
||||||
# output_extract_data_total_folder,
|
output_extract_data_total_folder,
|
||||||
# output_mapping_total_folder,
|
output_mapping_total_folder,
|
||||||
# extract_way,
|
extract_way,
|
||||||
# special_doc_id_list,
|
special_doc_id_list,
|
||||||
# re_run_extract_data,
|
re_run_extract_data,
|
||||||
# re_run_mapping_data,
|
re_run_mapping_data,
|
||||||
# force_save_total_data=force_save_total_data,
|
force_save_total_data=force_save_total_data,
|
||||||
# calculate_metrics=calculate_metrics,
|
calculate_metrics=calculate_metrics,
|
||||||
# )
|
)
|
||||||
|
|
||||||
# test_data_extraction_metrics()
|
# test_data_extraction_metrics()
|
||||||
# test_mapping_raw_name()
|
# test_mapping_raw_name()
|
||||||
|
|
|
||||||
|
|
@ -870,7 +870,7 @@ def replace_abbrevation(text: str):
|
||||||
new_text_splits.append('no trail')
|
new_text_splits.append('no trail')
|
||||||
elif split.lower() in ['non']:
|
elif split.lower() in ['non']:
|
||||||
new_text_splits.append('Not')
|
new_text_splits.append('Not')
|
||||||
elif split.lower() in ['net', 'unhgd']:
|
elif split.lower() in ['net', 'unhgd'] or split == "fl":
|
||||||
new_text_splits.append('')
|
new_text_splits.append('')
|
||||||
else:
|
else:
|
||||||
split = split_short_name_with_share_features(split)
|
split = split_short_name_with_share_features(split)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue