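Optimize drilldown algorithm: filter single-word matches by their surrounding characters and merge match areas only for multi-word text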

This commit is contained in:
Blade He 2024-11-08 15:00:34 -06:00
parent 81f855f725
commit c34e2e960e
2 changed files with 98 additions and 119 deletions

173
main.py
View File

@ -1119,119 +1119,70 @@ if __name__ == "__main__":
"514636990", "514636990",
"514636993", "514636993",
"514636994", "514636994",
"539794746" "539794746",
] ]
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["532500349", special_doc_id_list = ["479742284",
"535324239", "501380497",
"532442891", "501380553",
"543243650", "501380775",
"528588598", "501380801",
"532437639", "501600428",
"527525440", "501600429",
"534987291", "501600541",
"534112055", "501600549",
"533482585", "503659548",
"544208174", "506326520",
"534547266", "507720522",
"544713166", "507928179",
"526463547", "508981020",
"534535569", "509133771",
"534106067", "509743502",
"532486560", "514636951",
"532781760", "514636952",
"533727067", "514636953",
"527256381", "514636954",
"533392425", "514636955",
"532179676", "514636957",
"534300608", "514636958",
"539233950", "514636959",
"532438414", "514636985",
"533681744", "514636988",
"537654645", "514636990",
"533594905", "514636993",
"537926443", "514636994",
"533499655", "539794746",
"533862814", "546483469",
"544918611", "546375582",
"539087870", "546375575",
"536343790", "546375576",
"479742284", "546375577",
"501380497", "546375568",
"501380553", "546371033",
"501380775", "546632761",
"501380801", "546632544",
"501600428", "546632464",
"501600429", "546724583",
"501600541", "546724552",
"501600549", "546694677",
"503659548", "546660422",
"506326520", "546638908",
"507720522", "546632845",
"507928179", "546105299",
"508981020", "546085481",
"509133771", "546078693",
"509743502", "546078650",
"514636951", "546289930",
"514636952", "546289910",
"514636953", "542967371",
"514636954", "542798238",
"514636955", "546048730",
"514636957", "546048143",
"514636958", "546047619",
"514636959", "546047528",
"514636985", "546046730",
"514636988", "546919329"
"514636990", ]
"514636993",
"514636994",
"539794746",
"292989214",
"316237292",
"321733631",
"323390570",
"327956364",
"332223498",
"333207452",
"334718372",
"344636875",
"362246081",
"366179419",
"380945052",
"382366116",
"387202452",
"389171486",
"391456740",
"391736837",
"394778487",
"401684600",
"402113224",
"402181770",
"402397014",
"405803396",
"445102363",
"445256897",
"448265376",
"449555622",
"449623976",
"458291624",
"458359181",
"463081566",
"469138353",
"471641628",
"476492237",
"478585901",
"478586066",
"479042264",
"479042269",
"479793787",
"481475385",
"483617247",
"486378555",
"486383912",
"492121213",
"497497599",
"502693599"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False

View File

@ -313,9 +313,35 @@ class PDFUtil:
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', '')) matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
if len(matching_val_area) == 0: if len(matching_val_area) == 0:
matching_val_area = page.search_for(text_block.replace('-\n', '')) matching_val_area = page.search_for(text_block.replace('-\n', ''))
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
new_matching_val_area = []
for area in matching_val_area:
# get text by text_bbox
pure_text_block = text_block.strip()
copy_area = deepcopy(area)
copy_area.x0 -= 10
copy_area.x1 += 10
text = page.get_text("text", clip=copy_area).strip()
if text == pure_text_block:
new_matching_val_area.append(area)
else:
# get start and end index of the pure_text_block in text
start_index = text.find(pure_text_block)
if start_index > 0:
previous_char = text[start_index - 1]
if previous_char not in [" ", "("]:
continue
end_index = start_index + len(pure_text_block)
if end_index < len(text):
next_char = text[end_index]
if next_char not in [" ", "%", ")"]:
continue
new_matching_val_area.append(area)
matching_val_area = new_matching_val_area
if ( if (
highlight_text_inside_block is not None highlight_text_inside_block is not None
and len(highlight_text_inside_block) > 0 and len(highlight_text_inside_block) > 0
and len(matching_val_area) > 0
): ):
highlight_bbox_list = [] highlight_bbox_list = []
merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
@ -332,14 +358,14 @@ class PDFUtil:
else: else:
pure_number_match = pure_number_regex.match(highlight_text_inside_block) pure_number_match = pure_number_regex.match(highlight_text_inside_block)
if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block: if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
for text_bbox in text_bbox_area: for area in text_bbox_area:
# get text by text_bbox # get text by text_bbox
copy_text_bbox = deepcopy(text_bbox) copy_area = deepcopy(area)
copy_text_bbox.x0 -= 10 copy_area.x0 -= 10
copy_text_bbox.x1 += 10 copy_area.x1 += 10
text = page.get_text("text", clip=copy_text_bbox).strip() text = page.get_text("text", clip=copy_area).strip()
if text == highlight_text_inside_block: if text == highlight_text_inside_block:
highlight_bbox_list.append(text_bbox) highlight_bbox_list.append(area)
else: else:
# get start and end index of the highlight_text_inside_block in text # get start and end index of the highlight_text_inside_block in text
start_index = text.find(highlight_text_inside_block) start_index = text.find(highlight_text_inside_block)
@ -352,7 +378,7 @@ class PDFUtil:
next_char = text[end_index] next_char = text[end_index]
if next_char not in [" ", "%", ")"]: if next_char not in [" ", "%", ")"]:
continue continue
highlight_bbox_list.append(text_bbox) highlight_bbox_list.append(area)
else: else:
highlight_bbox_list.extend(text_bbox_area) highlight_bbox_list.extend(text_bbox_area)
if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2: if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
@ -365,7 +391,9 @@ class PDFUtil:
matching_val_area = [matching_val_area[0]] matching_val_area = [matching_val_area[0]]
if matching_val_area is not None and len(matching_val_area) > 0: if matching_val_area is not None and len(matching_val_area) > 0:
matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) if (highlight_text_inside_block is not None and len(highlight_text_inside_block.strip().split()) > 1) or \
(highlight_text_inside_block is None and len(text_block.strip().split()) > 1):
matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
if exact_match: if exact_match:
matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block) matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)
# matching_val_area = self.merge_matching_val_area(matching_val_area) # matching_val_area = self.merge_matching_val_area(matching_val_area)