optimize drilldown algorithm

This commit is contained in:
Blade He 2024-11-08 15:00:34 -06:00
parent 81f855f725
commit c34e2e960e
2 changed files with 98 additions and 119 deletions

115
main.py
View File

@ -1119,44 +1119,10 @@ if __name__ == "__main__":
"514636990",
"514636993",
"514636994",
"539794746"
"539794746",
]
special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["532500349",
"535324239",
"532442891",
"543243650",
"528588598",
"532437639",
"527525440",
"534987291",
"534112055",
"533482585",
"544208174",
"534547266",
"544713166",
"526463547",
"534535569",
"534106067",
"532486560",
"532781760",
"533727067",
"527256381",
"533392425",
"532179676",
"534300608",
"539233950",
"532438414",
"533681744",
"537654645",
"533594905",
"537926443",
"533499655",
"533862814",
"544918611",
"539087870",
"536343790",
"479742284",
special_doc_id_list = ["479742284",
"501380497",
"501380553",
"501380775",
@ -1186,52 +1152,37 @@ if __name__ == "__main__":
"514636993",
"514636994",
"539794746",
"292989214",
"316237292",
"321733631",
"323390570",
"327956364",
"332223498",
"333207452",
"334718372",
"344636875",
"362246081",
"366179419",
"380945052",
"382366116",
"387202452",
"389171486",
"391456740",
"391736837",
"394778487",
"401684600",
"402113224",
"402181770",
"402397014",
"405803396",
"445102363",
"445256897",
"448265376",
"449555622",
"449623976",
"458291624",
"458359181",
"463081566",
"469138353",
"471641628",
"476492237",
"478585901",
"478586066",
"479042264",
"479042269",
"479793787",
"481475385",
"483617247",
"486378555",
"486383912",
"492121213",
"497497599",
"502693599"]
"546483469",
"546375582",
"546375575",
"546375576",
"546375577",
"546375568",
"546371033",
"546632761",
"546632544",
"546632464",
"546724583",
"546724552",
"546694677",
"546660422",
"546638908",
"546632845",
"546105299",
"546085481",
"546078693",
"546078650",
"546289930",
"546289910",
"542967371",
"542798238",
"546048730",
"546048143",
"546047619",
"546047528",
"546046730",
"546919329"
]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False

View File

@ -313,9 +313,35 @@ class PDFUtil:
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
if len(matching_val_area) == 0:
matching_val_area = page.search_for(text_block.replace('-\n', ''))
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
new_matching_val_area = []
for area in matching_val_area:
# read the page text inside this match box, widened by 10 on the left and right
pure_text_block = text_block.strip()
copy_area = deepcopy(area)
copy_area.x0 -= 10
copy_area.x1 += 10
text = page.get_text("text", clip=copy_area).strip()
if text == pure_text_block:
new_matching_val_area.append(area)
else:
# get start and end index of the pure_text_block in text
start_index = text.find(pure_text_block)
if start_index > 0:
previous_char = text[start_index - 1]
if previous_char not in [" ", "("]:
continue
end_index = start_index + len(pure_text_block)
if end_index < len(text):
next_char = text[end_index]
if next_char not in [" ", "%", ")"]:
continue
new_matching_val_area.append(area)
matching_val_area = new_matching_val_area
if (
highlight_text_inside_block is not None
and len(highlight_text_inside_block) > 0
and len(matching_val_area) > 0
):
highlight_bbox_list = []
merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
@ -332,14 +358,14 @@ class PDFUtil:
else:
pure_number_match = pure_number_regex.match(highlight_text_inside_block)
if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
for text_bbox in text_bbox_area:
for area in text_bbox_area:
# read the page text inside this box, widened by 10 on the left and right
copy_text_bbox = deepcopy(text_bbox)
copy_text_bbox.x0 -= 10
copy_text_bbox.x1 += 10
text = page.get_text("text", clip=copy_text_bbox).strip()
copy_area = deepcopy(area)
copy_area.x0 -= 10
copy_area.x1 += 10
text = page.get_text("text", clip=copy_area).strip()
if text == highlight_text_inside_block:
highlight_bbox_list.append(text_bbox)
highlight_bbox_list.append(area)
else:
# get start and end index of the highlight_text_inside_block in text
start_index = text.find(highlight_text_inside_block)
@ -352,7 +378,7 @@ class PDFUtil:
next_char = text[end_index]
if next_char not in [" ", "%", ")"]:
continue
highlight_bbox_list.append(text_bbox)
highlight_bbox_list.append(area)
else:
highlight_bbox_list.extend(text_bbox_area)
if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
@ -365,6 +391,8 @@ class PDFUtil:
matching_val_area = [matching_val_area[0]]
if matching_val_area is not None and len(matching_val_area) > 0:
if (highlight_text_inside_block is not None and len(highlight_text_inside_block.strip().split()) > 1) or \
(highlight_text_inside_block is None and len(text_block.strip().split()) > 1):
matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
if exact_match:
matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)