"total match" logic for single-word values needs to consider the "\n" char scenario
This commit is contained in:
parent
5b67bd332b
commit
c2d2e54670
2
main.py
2
main.py
|
|
@ -1110,7 +1110,7 @@ if __name__ == "__main__":
|
||||||
"546046730",
|
"546046730",
|
||||||
"546919329"
|
"546919329"
|
||||||
]
|
]
|
||||||
# special_doc_id_list = ["501380775"]
|
special_doc_id_list = ["479742284"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
|
|
|
||||||
|
|
@ -519,13 +519,13 @@ class PDFUtil:
|
||||||
if start_index == -1:
|
if start_index == -1:
|
||||||
continue
|
continue
|
||||||
if start_index > 0:
|
if start_index > 0:
|
||||||
previous_char = text[start_index - 1]
|
previous_char = text[start_index - 1].strip()
|
||||||
if previous_char not in [" ", "("]:
|
if previous_char not in ["", " ", "("]:
|
||||||
continue
|
continue
|
||||||
end_index = start_index + len(pure_text_block)
|
end_index = start_index + len(pure_text_block)
|
||||||
if end_index < len(text):
|
if end_index < len(text):
|
||||||
next_char = text[end_index]
|
next_char = text[end_index].strip()
|
||||||
if next_char not in [" ", "%", ")"]:
|
if next_char not in ["", " ", "%", ")"]:
|
||||||
continue
|
continue
|
||||||
new_matching_val_area.append(area)
|
new_matching_val_area.append(area)
|
||||||
matching_val_area = new_matching_val_area
|
matching_val_area = new_matching_val_area
|
||||||
|
|
@ -561,13 +561,13 @@ class PDFUtil:
|
||||||
# get start and end index of the highlight_text_inside_block in text
|
# get start and end index of the highlight_text_inside_block in text
|
||||||
start_index = text.find(highlight_text_inside_block)
|
start_index = text.find(highlight_text_inside_block)
|
||||||
if start_index > 0:
|
if start_index > 0:
|
||||||
previous_char = text[start_index - 1]
|
previous_char = text[start_index - 1].strip()
|
||||||
if previous_char not in [" ", "("]:
|
if previous_char not in ["", " ", "("]:
|
||||||
continue
|
continue
|
||||||
end_index = start_index + len(highlight_text_inside_block)
|
end_index = start_index + len(highlight_text_inside_block)
|
||||||
if end_index < len(text):
|
if end_index < len(text):
|
||||||
next_char = text[end_index]
|
next_char = text[end_index].strip()
|
||||||
if next_char not in [" ", "%", ")"]:
|
if next_char not in ["", " ", "%", ")"]:
|
||||||
continue
|
continue
|
||||||
highlight_bbox_list.append(area)
|
highlight_bbox_list.append(area)
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue