optimize drilldown algorithm
This commit is contained in:
parent
c6c3e99d3e
commit
5b67bd332b
2
main.py
2
main.py
|
|
@ -1110,7 +1110,7 @@ if __name__ == "__main__":
|
||||||
"546046730",
|
"546046730",
|
||||||
"546919329"
|
"546919329"
|
||||||
]
|
]
|
||||||
special_doc_id_list = ["501380497"]
|
# special_doc_id_list = ["501380775"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
|
|
|
||||||
|
|
@ -310,7 +310,15 @@ class PDFUtil:
|
||||||
annotation_attribute=annotation_attribute
|
annotation_attribute=annotation_attribute
|
||||||
))
|
))
|
||||||
else:
|
else:
|
||||||
pass
|
highlight_value = str(highlight_value)
|
||||||
|
annotation_list.append(self.highlight_pdf_doc(
|
||||||
|
pdf_doc=pdf_doc,
|
||||||
|
page_index=page_index,
|
||||||
|
highlight_value=highlight_value,
|
||||||
|
parent_text_block=parent_text_block,
|
||||||
|
data_point=data_point,
|
||||||
|
annotation_attribute=annotation_attribute
|
||||||
|
))
|
||||||
if output_pdf_folder is not None and len(output_pdf_folder) > 0:
|
if output_pdf_folder is not None and len(output_pdf_folder) > 0:
|
||||||
os.makedirs(output_pdf_folder, exist_ok=True)
|
os.makedirs(output_pdf_folder, exist_ok=True)
|
||||||
pdf_file_path = self.save_annotated_pdf(pdf_doc=pdf_doc,
|
pdf_file_path = self.save_annotated_pdf(pdf_doc=pdf_doc,
|
||||||
|
|
@ -362,12 +370,20 @@ class PDFUtil:
|
||||||
parent_text_block_search_text = parent_text_block_search.group()
|
parent_text_block_search_text = parent_text_block_search.group()
|
||||||
|
|
||||||
highlight_value_search_text = ""
|
highlight_value_search_text = ""
|
||||||
if highlight_value is not None:
|
|
||||||
highlight_value_regex = self.add_slash_to_text_as_regex(highlight_value)
|
if highlight_value is not None and len(highlight_value.strip()) > 0:
|
||||||
if len(highlight_value.strip().split()) == 1 and len(highlight_value.strip()) < 3:
|
pure_highlight_value = highlight_value.strip()
|
||||||
|
|
||||||
|
if len(pure_highlight_value.split()) == 1 and \
|
||||||
|
(len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
|
||||||
|
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
|
||||||
highlight_value_search = re.search(highlight_value_regex, page_text)
|
highlight_value_search = re.search(highlight_value_regex, page_text)
|
||||||
else:
|
else:
|
||||||
|
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
|
||||||
highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
|
highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
|
||||||
|
if highlight_value_search is None:
|
||||||
|
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
|
||||||
|
highlight_value_search = re.search(highlight_value_regex, page_text)
|
||||||
if highlight_value_search is not None:
|
if highlight_value_search is not None:
|
||||||
highlight_value_search_text = highlight_value_search.group()
|
highlight_value_search_text = highlight_value_search.group()
|
||||||
|
|
||||||
|
|
@ -417,7 +433,7 @@ class PDFUtil:
|
||||||
annotation_data["matching_val_area"] = bbox_list
|
annotation_data["matching_val_area"] = bbox_list
|
||||||
return annotation_data
|
return annotation_data
|
||||||
|
|
||||||
def add_slash_to_text_as_regex(self, text: str):
|
def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
|
||||||
if text is None or len(text) == 0:
|
if text is None or len(text) == 0:
|
||||||
return text
|
return text
|
||||||
special_char_iter = re.finditer("\W", text)
|
special_char_iter = re.finditer("\W", text)
|
||||||
|
|
@ -428,7 +444,10 @@ class PDFUtil:
|
||||||
if replace not in text:
|
if replace not in text:
|
||||||
text = re.sub(replace, r"\\W", text)
|
text = re.sub(replace, r"\\W", text)
|
||||||
text = re.sub(r"( ){2,}", " ", text)
|
text = re.sub(r"( ){2,}", " ", text)
|
||||||
text = text.replace(" ", r"\s*\W*")
|
if match_special_char_after_space:
|
||||||
|
text = text.replace(" ", r"\s*\W*")
|
||||||
|
else:
|
||||||
|
text = text.replace(" ", r"\s*")
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def highlight_matching_data(
|
def highlight_matching_data(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue