optimize drilldown algorithm

This commit is contained in:
Blade He 2024-11-08 15:00:34 -06:00
parent 81f855f725
commit c34e2e960e
2 changed files with 98 additions and 119 deletions

115
main.py
View File

@ -1119,44 +1119,10 @@ if __name__ == "__main__":
"514636990", "514636990",
"514636993", "514636993",
"514636994", "514636994",
"539794746" "539794746",
] ]
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["532500349", special_doc_id_list = ["479742284",
"535324239",
"532442891",
"543243650",
"528588598",
"532437639",
"527525440",
"534987291",
"534112055",
"533482585",
"544208174",
"534547266",
"544713166",
"526463547",
"534535569",
"534106067",
"532486560",
"532781760",
"533727067",
"527256381",
"533392425",
"532179676",
"534300608",
"539233950",
"532438414",
"533681744",
"537654645",
"533594905",
"537926443",
"533499655",
"533862814",
"544918611",
"539087870",
"536343790",
"479742284",
"501380497", "501380497",
"501380553", "501380553",
"501380775", "501380775",
@ -1186,52 +1152,37 @@ if __name__ == "__main__":
"514636993", "514636993",
"514636994", "514636994",
"539794746", "539794746",
"292989214", "546483469",
"316237292", "546375582",
"321733631", "546375575",
"323390570", "546375576",
"327956364", "546375577",
"332223498", "546375568",
"333207452", "546371033",
"334718372", "546632761",
"344636875", "546632544",
"362246081", "546632464",
"366179419", "546724583",
"380945052", "546724552",
"382366116", "546694677",
"387202452", "546660422",
"389171486", "546638908",
"391456740", "546632845",
"391736837", "546105299",
"394778487", "546085481",
"401684600", "546078693",
"402113224", "546078650",
"402181770", "546289930",
"402397014", "546289910",
"405803396", "542967371",
"445102363", "542798238",
"445256897", "546048730",
"448265376", "546048143",
"449555622", "546047619",
"449623976", "546047528",
"458291624", "546046730",
"458359181", "546919329"
"463081566", ]
"469138353",
"471641628",
"476492237",
"478585901",
"478586066",
"479042264",
"479042269",
"479793787",
"481475385",
"483617247",
"486378555",
"486383912",
"492121213",
"497497599",
"502693599"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False

View File

@ -313,9 +313,35 @@ class PDFUtil:
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', '')) matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
if len(matching_val_area) == 0: if len(matching_val_area) == 0:
matching_val_area = page.search_for(text_block.replace('-\n', '')) matching_val_area = page.search_for(text_block.replace('-\n', ''))
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
new_matching_val_area = []
for area in matching_val_area:
# get text by text_bbox
pure_text_block = text_block.strip()
copy_area = deepcopy(area)
copy_area.x0 -= 10
copy_area.x1 += 10
text = page.get_text("text", clip=copy_area).strip()
if text == pure_text_block:
new_matching_val_area.append(area)
else:
# get start and end index of the pure_text_block in text
start_index = text.find(pure_text_block)
if start_index > 0:
previous_char = text[start_index - 1]
if previous_char not in [" ", "("]:
continue
end_index = start_index + len(pure_text_block)
if end_index < len(text):
next_char = text[end_index]
if next_char not in [" ", "%", ")"]:
continue
new_matching_val_area.append(area)
matching_val_area = new_matching_val_area
if ( if (
highlight_text_inside_block is not None highlight_text_inside_block is not None
and len(highlight_text_inside_block) > 0 and len(highlight_text_inside_block) > 0
and len(matching_val_area) > 0
): ):
highlight_bbox_list = [] highlight_bbox_list = []
merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
@ -332,14 +358,14 @@ class PDFUtil:
else: else:
pure_number_match = pure_number_regex.match(highlight_text_inside_block) pure_number_match = pure_number_regex.match(highlight_text_inside_block)
if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block: if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
for text_bbox in text_bbox_area: for area in text_bbox_area:
# get text by text_bbox # get text by text_bbox
copy_text_bbox = deepcopy(text_bbox) copy_area = deepcopy(area)
copy_text_bbox.x0 -= 10 copy_area.x0 -= 10
copy_text_bbox.x1 += 10 copy_area.x1 += 10
text = page.get_text("text", clip=copy_text_bbox).strip() text = page.get_text("text", clip=copy_area).strip()
if text == highlight_text_inside_block: if text == highlight_text_inside_block:
highlight_bbox_list.append(text_bbox) highlight_bbox_list.append(area)
else: else:
# get start and end index of the highlight_text_inside_block in text # get start and end index of the highlight_text_inside_block in text
start_index = text.find(highlight_text_inside_block) start_index = text.find(highlight_text_inside_block)
@ -352,7 +378,7 @@ class PDFUtil:
next_char = text[end_index] next_char = text[end_index]
if next_char not in [" ", "%", ")"]: if next_char not in [" ", "%", ")"]:
continue continue
highlight_bbox_list.append(text_bbox) highlight_bbox_list.append(area)
else: else:
highlight_bbox_list.extend(text_bbox_area) highlight_bbox_list.extend(text_bbox_area)
if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2: if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
@ -365,6 +391,8 @@ class PDFUtil:
matching_val_area = [matching_val_area[0]] matching_val_area = [matching_val_area[0]]
if matching_val_area is not None and len(matching_val_area) > 0: if matching_val_area is not None and len(matching_val_area) > 0:
if (highlight_text_inside_block is not None and len(highlight_text_inside_block.strip().split()) > 1) or \
(highlight_text_inside_block is None and len(text_block.strip().split()) > 1):
matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
if exact_match: if exact_match:
matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block) matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)