"total match" logic for single word value, need consider the "\n" char scenario

This commit is contained in:
Blade He 2024-11-12 11:40:19 -06:00
parent 5b67bd332b
commit c2d2e54670
2 changed files with 9 additions and 9 deletions

View File

@ -1110,7 +1110,7 @@ if __name__ == "__main__":
"546046730", "546046730",
"546919329" "546919329"
] ]
# special_doc_id_list = ["501380775"] special_doc_id_list = ["479742284"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False

View File

@ -519,13 +519,13 @@ class PDFUtil:
if start_index == -1: if start_index == -1:
continue continue
if start_index > 0: if start_index > 0:
previous_char = text[start_index - 1] previous_char = text[start_index - 1].strip()
if previous_char not in [" ", "("]: if previous_char not in ["", " ", "("]:
continue continue
end_index = start_index + len(pure_text_block) end_index = start_index + len(pure_text_block)
if end_index < len(text): if end_index < len(text):
next_char = text[end_index] next_char = text[end_index].strip()
if next_char not in [" ", "%", ")"]: if next_char not in ["", " ", "%", ")"]:
continue continue
new_matching_val_area.append(area) new_matching_val_area.append(area)
matching_val_area = new_matching_val_area matching_val_area = new_matching_val_area
@ -561,13 +561,13 @@ class PDFUtil:
# get start and end index of the highlight_text_inside_block in text # get start and end index of the highlight_text_inside_block in text
start_index = text.find(highlight_text_inside_block) start_index = text.find(highlight_text_inside_block)
if start_index > 0: if start_index > 0:
previous_char = text[start_index - 1] previous_char = text[start_index - 1].strip()
if previous_char not in [" ", "("]: if previous_char not in ["", " ", "("]:
continue continue
end_index = start_index + len(highlight_text_inside_block) end_index = start_index + len(highlight_text_inside_block)
if end_index < len(text): if end_index < len(text):
next_char = text[end_index] next_char = text[end_index].strip()
if next_char not in [" ", "%", ")"]: if next_char not in ["", " ", "%", ")"]:
continue continue
highlight_bbox_list.append(area) highlight_bbox_list.append(area)
else: else: