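Optimize drilldown algorithm: annotate values in the previously skipped else branch, and try whitespace-only regex matching before falling back to special-character matching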

2024-11-12 11:20:38 -06:00 · 2024-11-12 11:20:38 -06:00 · 5b67bd332b
parent c6c3e99d3e
commit 5b67bd332b
2 changed files with 26 additions and 7 deletions
--- a/main.py
+++ b/main.py
@ -1110,7 +1110,7 @@ if __name__ == "__main__":
                            "546046730",
                            "546919329"
                            ]
-    special_doc_id_list = ["501380497"]
+    # special_doc_id_list = ["501380775"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/pdf_util.py
+++ b/utils/pdf_util.py
@ -310,7 +310,15 @@ class PDFUtil:
                        annotation_attribute=annotation_attribute
                    ))
            else:
-                pass
+                highlight_value = str(highlight_value)
+                annotation_list.append(self.highlight_pdf_doc(
+                    pdf_doc=pdf_doc,
+                    page_index=page_index,
+                    highlight_value=highlight_value,
+                    parent_text_block=parent_text_block,
+                    data_point=data_point,
+                    annotation_attribute=annotation_attribute
+                ))
        if output_pdf_folder is not None and len(output_pdf_folder) > 0:
            os.makedirs(output_pdf_folder, exist_ok=True)
            pdf_file_path = self.save_annotated_pdf(pdf_doc=pdf_doc,
@ -362,12 +370,20 @@ class PDFUtil:
                parent_text_block_search_text = parent_text_block_search.group()
                
        highlight_value_search_text = ""
-        if highlight_value is not None:
-            highlight_value_regex = self.add_slash_to_text_as_regex(highlight_value)
-            if len(highlight_value.strip().split()) == 1 and len(highlight_value.strip()) < 3:
+        
+        if highlight_value is not None and len(highlight_value.strip()) > 0:
+            pure_highlight_value = highlight_value.strip()
+            
+            if len(pure_highlight_value.split()) == 1 and \
+                (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
+                highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
                highlight_value_search = re.search(highlight_value_regex, page_text)
            else:
+                highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
                highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
+                if highlight_value_search is None:
+                    highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
+                    highlight_value_search = re.search(highlight_value_regex, page_text)
            if highlight_value_search is not None:
                highlight_value_search_text = highlight_value_search.group()
                
@ -417,7 +433,7 @@ class PDFUtil:
            annotation_data["matching_val_area"] = bbox_list
        return annotation_data
    
-    def add_slash_to_text_as_regex(self, text: str):
+    def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
        if text is None or len(text) == 0:
            return text
        special_char_iter = re.finditer("\W", text)
@ -428,7 +444,10 @@ class PDFUtil:
            if replace not in text:
                text = re.sub(replace, r"\\W", text)
        text = re.sub(r"( ){2,}", " ", text)
-        text = text.replace(" ", r"\s*\W*")
+        if match_special_char_after_space:
+            text = text.replace(" ", r"\s*\W*")
+        else:
+            text = text.replace(" ", r"\s*")
        return text

    def highlight_matching_data(