"total match" logic for single word value, need consider the "\n" char scenario
This commit is contained in:
parent
5b67bd332b
commit
c2d2e54670
2
main.py
2
main.py
|
|
@@ -1110,7 +1110,7 @@ if __name__ == "__main__":
|
|||
"546046730",
|
||||
"546919329"
|
||||
]
|
||||
# special_doc_id_list = ["501380775"]
|
||||
special_doc_id_list = ["479742284"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
|
|
|
|||
|
|
@@ -519,13 +519,13 @@ class PDFUtil:
|
|||
if start_index == -1:
|
||||
continue
|
||||
if start_index > 0:
|
||||
previous_char = text[start_index - 1]
|
||||
if previous_char not in [" ", "("]:
|
||||
previous_char = text[start_index - 1].strip()
|
||||
if previous_char not in ["", " ", "("]:
|
||||
continue
|
||||
end_index = start_index + len(pure_text_block)
|
||||
if end_index < len(text):
|
||||
next_char = text[end_index]
|
||||
if next_char not in [" ", "%", ")"]:
|
||||
next_char = text[end_index].strip()
|
||||
if next_char not in ["", " ", "%", ")"]:
|
||||
continue
|
||||
new_matching_val_area.append(area)
|
||||
matching_val_area = new_matching_val_area
|
||||
|
|
@@ -561,13 +561,13 @@ class PDFUtil:
|
|||
# get start and end index of the highlight_text_inside_block in text
|
||||
start_index = text.find(highlight_text_inside_block)
|
||||
if start_index > 0:
|
||||
previous_char = text[start_index - 1]
|
||||
if previous_char not in [" ", "("]:
|
||||
previous_char = text[start_index - 1].strip()
|
||||
if previous_char not in ["", " ", "("]:
|
||||
continue
|
||||
end_index = start_index + len(highlight_text_inside_block)
|
||||
if end_index < len(text):
|
||||
next_char = text[end_index]
|
||||
if next_char not in [" ", "%", ")"]:
|
||||
next_char = text[end_index].strip()
|
||||
if next_char not in ["", " ", "%", ")"]:
|
||||
continue
|
||||
highlight_bbox_list.append(area)
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue