optimize drilldown algorithm
This commit is contained in:
parent
c6c3e99d3e
commit
5b67bd332b
2
main.py
2
main.py
|
|
@@ -1110,7 +1110,7 @@ if __name__ == "__main__":
|
|||
"546046730",
|
||||
"546919329"
|
||||
]
|
||||
special_doc_id_list = ["501380497"]
|
||||
# special_doc_id_list = ["501380775"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
|
|
|
|||
|
|
@@ -310,7 +310,15 @@ class PDFUtil:
|
|||
annotation_attribute=annotation_attribute
|
||||
))
|
||||
else:
|
||||
pass
|
||||
highlight_value = str(highlight_value)
|
||||
annotation_list.append(self.highlight_pdf_doc(
|
||||
pdf_doc=pdf_doc,
|
||||
page_index=page_index,
|
||||
highlight_value=highlight_value,
|
||||
parent_text_block=parent_text_block,
|
||||
data_point=data_point,
|
||||
annotation_attribute=annotation_attribute
|
||||
))
|
||||
if output_pdf_folder is not None and len(output_pdf_folder) > 0:
|
||||
os.makedirs(output_pdf_folder, exist_ok=True)
|
||||
pdf_file_path = self.save_annotated_pdf(pdf_doc=pdf_doc,
|
||||
|
|
@@ -362,12 +370,20 @@ class PDFUtil:
|
|||
parent_text_block_search_text = parent_text_block_search.group()
|
||||
|
||||
highlight_value_search_text = ""
|
||||
if highlight_value is not None:
|
||||
highlight_value_regex = self.add_slash_to_text_as_regex(highlight_value)
|
||||
if len(highlight_value.strip().split()) == 1 and len(highlight_value.strip()) < 3:
|
||||
|
||||
if highlight_value is not None and len(highlight_value.strip()) > 0:
|
||||
pure_highlight_value = highlight_value.strip()
|
||||
|
||||
if len(pure_highlight_value.split()) == 1 and \
|
||||
(len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
|
||||
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
|
||||
highlight_value_search = re.search(highlight_value_regex, page_text)
|
||||
else:
|
||||
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
|
||||
highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
|
||||
if highlight_value_search is None:
|
||||
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
|
||||
highlight_value_search = re.search(highlight_value_regex, page_text)
|
||||
if highlight_value_search is not None:
|
||||
highlight_value_search_text = highlight_value_search.group()
|
||||
|
||||
|
|
@@ -417,7 +433,7 @@ class PDFUtil:
|
|||
annotation_data["matching_val_area"] = bbox_list
|
||||
return annotation_data
|
||||
|
||||
def add_slash_to_text_as_regex(self, text: str):
|
||||
def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
|
||||
if text is None or len(text) == 0:
|
||||
return text
|
||||
special_char_iter = re.finditer("\W", text)
|
||||
|
|
@@ -428,7 +444,10 @@ class PDFUtil:
|
|||
if replace not in text:
|
||||
text = re.sub(replace, r"\\W", text)
|
||||
text = re.sub(r"( ){2,}", " ", text)
|
||||
text = text.replace(" ", r"\s*\W*")
|
||||
if match_special_char_after_space:
|
||||
text = text.replace(" ", r"\s*\W*")
|
||||
else:
|
||||
text = text.replace(" ", r"\s*")
|
||||
return text
|
||||
|
||||
def highlight_matching_data(
|
||||
|
|
|
|||
Loading…
Reference in New Issue