Optimize drilldown algorithm (PDF highlight value matching)

This commit is contained in:
Blade He 2024-11-12 11:20:38 -06:00
parent c6c3e99d3e
commit 5b67bd332b
2 changed files with 26 additions and 7 deletions

View File

@ -1110,7 +1110,7 @@ if __name__ == "__main__":
"546046730",
"546919329"
]
special_doc_id_list = ["501380497"]
# special_doc_id_list = ["501380775"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False

View File

@ -310,7 +310,15 @@ class PDFUtil:
annotation_attribute=annotation_attribute
))
else:
pass
highlight_value = str(highlight_value)
annotation_list.append(self.highlight_pdf_doc(
pdf_doc=pdf_doc,
page_index=page_index,
highlight_value=highlight_value,
parent_text_block=parent_text_block,
data_point=data_point,
annotation_attribute=annotation_attribute
))
if output_pdf_folder is not None and len(output_pdf_folder) > 0:
os.makedirs(output_pdf_folder, exist_ok=True)
pdf_file_path = self.save_annotated_pdf(pdf_doc=pdf_doc,
@ -362,12 +370,20 @@ class PDFUtil:
parent_text_block_search_text = parent_text_block_search.group()
highlight_value_search_text = ""
if highlight_value is not None:
highlight_value_regex = self.add_slash_to_text_as_regex(highlight_value)
if len(highlight_value.strip().split()) == 1 and len(highlight_value.strip()) < 3:
if highlight_value is not None and len(highlight_value.strip()) > 0:
pure_highlight_value = highlight_value.strip()
if len(pure_highlight_value.split()) == 1 and \
(len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
highlight_value_search = re.search(highlight_value_regex, page_text)
else:
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
if highlight_value_search is None:
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
highlight_value_search = re.search(highlight_value_regex, page_text)
if highlight_value_search is not None:
highlight_value_search_text = highlight_value_search.group()
@ -417,7 +433,7 @@ class PDFUtil:
annotation_data["matching_val_area"] = bbox_list
return annotation_data
def add_slash_to_text_as_regex(self, text: str):
def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
if text is None or len(text) == 0:
return text
special_char_iter = re.finditer("\W", text)
@ -428,7 +444,10 @@ class PDFUtil:
if replace not in text:
text = re.sub(replace, r"\\W", text)
text = re.sub(r"( ){2,}", " ", text)
text = text.replace(" ", r"\s*\W*")
if match_special_char_after_space:
text = text.replace(" ", r"\s*\W*")
else:
text = text.replace(" ", r"\s*")
return text
def highlight_matching_data(