From 5b67bd332b8abbbe3afa0853458927b26546cac1 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 12 Nov 2024 11:20:38 -0600 Subject: [PATCH] optimize drilldown algorithm --- main.py | 2 +- utils/pdf_util.py | 31 +++++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 84a4b65..33d7acd 100644 --- a/main.py +++ b/main.py @@ -1110,7 +1110,7 @@ if __name__ == "__main__": "546046730", "546919329" ] - special_doc_id_list = ["501380497"] + # special_doc_id_list = ["501380775"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 05780c8..4ddea7d 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -310,7 +310,15 @@ class PDFUtil: annotation_attribute=annotation_attribute )) else: - pass + highlight_value = str(highlight_value) + annotation_list.append(self.highlight_pdf_doc( + pdf_doc=pdf_doc, + page_index=page_index, + highlight_value=highlight_value, + parent_text_block=parent_text_block, + data_point=data_point, + annotation_attribute=annotation_attribute + )) if output_pdf_folder is not None and len(output_pdf_folder) > 0: os.makedirs(output_pdf_folder, exist_ok=True) pdf_file_path = self.save_annotated_pdf(pdf_doc=pdf_doc, @@ -362,12 +370,20 @@ class PDFUtil: parent_text_block_search_text = parent_text_block_search.group() highlight_value_search_text = "" - if highlight_value is not None: - highlight_value_regex = self.add_slash_to_text_as_regex(highlight_value) - if len(highlight_value.strip().split()) == 1 and len(highlight_value.strip()) < 3: + + if highlight_value is not None and len(highlight_value.strip()) > 0: + pure_highlight_value = highlight_value.strip() + + if len(pure_highlight_value.split()) == 1 and \ + (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]): + highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value) highlight_value_search = re.search(highlight_value_regex, page_text) else: + highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False) highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE) + if highlight_value_search is None: + highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True) + highlight_value_search = re.search(highlight_value_regex, page_text) if highlight_value_search is not None: highlight_value_search_text = highlight_value_search.group() @@ -417,7 +433,7 @@ class PDFUtil: annotation_data["matching_val_area"] = bbox_list return annotation_data - def add_slash_to_text_as_regex(self, text: str): + def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True): if text is None or len(text) == 0: return text special_char_iter = re.finditer("\W", text) @@ -428,7 +444,10 @@ class PDFUtil: if replace not in text: text = re.sub(replace, r"\\W", text) text = re.sub(r"( ){2,}", " ", text) - text = text.replace(" ", r"\s*\W*") + if match_special_char_after_space: + text = text.replace(" ", r"\s*\W*") + else: + text = text.replace(" ", r"\s*") return text def highlight_matching_data(