"total match" logic for single-word values needs to consider the "\n" character scenario

2024-11-12 11:40:19 -06:00 · 2024-11-12 11:40:19 -06:00 · c2d2e54670
parent 5b67bd332b
commit c2d2e54670
2 changed files with 9 additions and 9 deletions
--- a/main.py
+++ b/main.py
@ -1110,7 +1110,7 @@ if __name__ == "__main__":
                            "546046730",
                            "546919329"
                            ]
-    # special_doc_id_list = ["501380775"]
+    special_doc_id_list = ["479742284"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/pdf_util.py
+++ b/utils/pdf_util.py
@ -519,13 +519,13 @@ class PDFUtil:
                    if start_index == -1:
                        continue
                    if start_index > 0:
-                        previous_char = text[start_index - 1]
+                        previous_char = text[start_index - 1].strip()
-                        if previous_char not in [" ", "("]:
+                        if previous_char not in ["", " ", "("]:
                            continue
                    end_index = start_index + len(pure_text_block)
                    if end_index < len(text):
-                        next_char = text[end_index]
+                        next_char = text[end_index].strip()
-                        if next_char not in [" ", "%", ")"]:
+                        if next_char not in ["", " ", "%", ")"]:
                            continue
                    new_matching_val_area.append(area)
            matching_val_area = new_matching_val_area
@ -561,13 +561,13 @@ class PDFUtil:
                                    # get start and end index of the highlight_text_inside_block in text
                                    start_index = text.find(highlight_text_inside_block)
                                    if start_index > 0:
-                                        previous_char = text[start_index - 1]
+                                        previous_char = text[start_index - 1].strip()
-                                        if previous_char not in [" ", "("]:
+                                        if previous_char not in ["", " ", "("]:
                                            continue
                                    end_index = start_index + len(highlight_text_inside_block)
                                    if end_index < len(text):
-                                        next_char = text[end_index]
+                                        next_char = text[end_index].strip()
-                                        if next_char not in [" ", "%", ")"]:
+                                        if next_char not in ["", " ", "%", ")"]:
                                            continue
                                    highlight_bbox_list.append(area)
                        else: