From c2d2e54670aecb748cc145c0141aea9f5d8d38c9 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 12 Nov 2024 11:40:19 -0600 Subject: [PATCH] "total match" logic for single word value, need consider the "\n" char scenario --- main.py | 2 +- utils/pdf_util.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 33d7acd..55ee1fd 100644 --- a/main.py +++ b/main.py @@ -1110,7 +1110,7 @@ if __name__ == "__main__": "546046730", "546919329" ] - # special_doc_id_list = ["501380775"] + special_doc_id_list = ["479742284"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 4ddea7d..3040a87 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -519,13 +519,13 @@ class PDFUtil: if start_index == -1: continue if start_index > 0: - previous_char = text[start_index - 1] - if previous_char not in [" ", "("]: + previous_char = text[start_index - 1].strip() + if previous_char not in ["", " ", "("]: continue end_index = start_index + len(pure_text_block) if end_index < len(text): - next_char = text[end_index] - if next_char not in [" ", "%", ")"]: + next_char = text[end_index].strip() + if next_char not in ["", " ", "%", ")"]: continue new_matching_val_area.append(area) matching_val_area = new_matching_val_area @@ -561,13 +561,13 @@ class PDFUtil: # get start and end index of the highlight_text_inside_block in text start_index = text.find(highlight_text_inside_block) if start_index > 0: - previous_char = text[start_index - 1] - if previous_char not in [" ", "("]: + previous_char = text[start_index - 1].strip() + if previous_char not in ["", " ", "("]: continue end_index = start_index + len(highlight_text_inside_block) if end_index < len(text): - next_char = text[end_index] - if next_char not in [" ", "%", ")"]: + next_char = text[end_index].strip() + if next_char not in ["", " ", "%", ")"]: continue highlight_bbox_list.append(area) else: