diff --git a/main.py b/main.py index 33d7acd..55ee1fd 100644 --- a/main.py +++ b/main.py @@ -1110,7 +1110,7 @@ if __name__ == "__main__": "546046730", "546919329" ] - # special_doc_id_list = ["501380775"] + special_doc_id_list = ["479742284"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 4ddea7d..3040a87 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -519,13 +519,13 @@ class PDFUtil: if start_index == -1: continue if start_index > 0: - previous_char = text[start_index - 1] - if previous_char not in [" ", "("]: + previous_char = text[start_index - 1].strip() + if previous_char not in ["", " ", "("]: continue end_index = start_index + len(pure_text_block) if end_index < len(text): - next_char = text[end_index] - if next_char not in [" ", "%", ")"]: + next_char = text[end_index].strip() + if next_char not in ["", " ", "%", ")"]: continue new_matching_val_area.append(area) matching_val_area = new_matching_val_area @@ -561,13 +561,13 @@ class PDFUtil: # get start and end index of the highlight_text_inside_block in text start_index = text.find(highlight_text_inside_block) if start_index > 0: - previous_char = text[start_index - 1] - if previous_char not in [" ", "("]: + previous_char = text[start_index - 1].strip() + if previous_char not in ["", " ", "("]: continue end_index = start_index + len(highlight_text_inside_block) if end_index < len(text): - next_char = text[end_index] - if next_char not in [" ", "%", ")"]: + next_char = text[end_index].strip() + if next_char not in ["", " ", "%", ")"]: continue highlight_bbox_list.append(area) else: