optimize drilldown algorithm
This commit is contained in:
parent
81f855f725
commit
c34e2e960e
115
main.py
115
main.py
|
|
@@ -1119,44 +1119,10 @@ if __name__ == "__main__":
|
|||
"514636990",
|
||||
"514636993",
|
||||
"514636994",
|
||||
"539794746"
|
||||
"539794746",
|
||||
]
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
special_doc_id_list = ["532500349",
|
||||
"535324239",
|
||||
"532442891",
|
||||
"543243650",
|
||||
"528588598",
|
||||
"532437639",
|
||||
"527525440",
|
||||
"534987291",
|
||||
"534112055",
|
||||
"533482585",
|
||||
"544208174",
|
||||
"534547266",
|
||||
"544713166",
|
||||
"526463547",
|
||||
"534535569",
|
||||
"534106067",
|
||||
"532486560",
|
||||
"532781760",
|
||||
"533727067",
|
||||
"527256381",
|
||||
"533392425",
|
||||
"532179676",
|
||||
"534300608",
|
||||
"539233950",
|
||||
"532438414",
|
||||
"533681744",
|
||||
"537654645",
|
||||
"533594905",
|
||||
"537926443",
|
||||
"533499655",
|
||||
"533862814",
|
||||
"544918611",
|
||||
"539087870",
|
||||
"536343790",
|
||||
"479742284",
|
||||
special_doc_id_list = ["479742284",
|
||||
"501380497",
|
||||
"501380553",
|
||||
"501380775",
|
||||
|
|
@@ -1186,52 +1152,37 @@ if __name__ == "__main__":
|
|||
"514636993",
|
||||
"514636994",
|
||||
"539794746",
|
||||
"292989214",
|
||||
"316237292",
|
||||
"321733631",
|
||||
"323390570",
|
||||
"327956364",
|
||||
"332223498",
|
||||
"333207452",
|
||||
"334718372",
|
||||
"344636875",
|
||||
"362246081",
|
||||
"366179419",
|
||||
"380945052",
|
||||
"382366116",
|
||||
"387202452",
|
||||
"389171486",
|
||||
"391456740",
|
||||
"391736837",
|
||||
"394778487",
|
||||
"401684600",
|
||||
"402113224",
|
||||
"402181770",
|
||||
"402397014",
|
||||
"405803396",
|
||||
"445102363",
|
||||
"445256897",
|
||||
"448265376",
|
||||
"449555622",
|
||||
"449623976",
|
||||
"458291624",
|
||||
"458359181",
|
||||
"463081566",
|
||||
"469138353",
|
||||
"471641628",
|
||||
"476492237",
|
||||
"478585901",
|
||||
"478586066",
|
||||
"479042264",
|
||||
"479042269",
|
||||
"479793787",
|
||||
"481475385",
|
||||
"483617247",
|
||||
"486378555",
|
||||
"486383912",
|
||||
"492121213",
|
||||
"497497599",
|
||||
"502693599"]
|
||||
"546483469",
|
||||
"546375582",
|
||||
"546375575",
|
||||
"546375576",
|
||||
"546375577",
|
||||
"546375568",
|
||||
"546371033",
|
||||
"546632761",
|
||||
"546632544",
|
||||
"546632464",
|
||||
"546724583",
|
||||
"546724552",
|
||||
"546694677",
|
||||
"546660422",
|
||||
"546638908",
|
||||
"546632845",
|
||||
"546105299",
|
||||
"546085481",
|
||||
"546078693",
|
||||
"546078650",
|
||||
"546289930",
|
||||
"546289910",
|
||||
"542967371",
|
||||
"542798238",
|
||||
"546048730",
|
||||
"546048143",
|
||||
"546047619",
|
||||
"546047528",
|
||||
"546046730",
|
||||
"546919329"
|
||||
]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
|
|
|
|||
|
|
@@ -313,9 +313,35 @@ class PDFUtil:
|
|||
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
|
||||
if len(matching_val_area) == 0:
|
||||
matching_val_area = page.search_for(text_block.replace('-\n', ''))
|
||||
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
|
||||
new_matching_val_area = []
|
||||
for area in matching_val_area:
|
||||
# get text by text_bbox
|
||||
pure_text_block = text_block.strip()
|
||||
copy_area = deepcopy(area)
|
||||
copy_area.x0 -= 10
|
||||
copy_area.x1 += 10
|
||||
text = page.get_text("text", clip=copy_area).strip()
|
||||
if text == pure_text_block:
|
||||
new_matching_val_area.append(area)
|
||||
else:
|
||||
# get start and end index of the pure_text_block in text
|
||||
start_index = text.find(pure_text_block)
|
||||
if start_index > 0:
|
||||
previous_char = text[start_index - 1]
|
||||
if previous_char not in [" ", "("]:
|
||||
continue
|
||||
end_index = start_index + len(pure_text_block)
|
||||
if end_index < len(text):
|
||||
next_char = text[end_index]
|
||||
if next_char not in [" ", "%", ")"]:
|
||||
continue
|
||||
new_matching_val_area.append(area)
|
||||
matching_val_area = new_matching_val_area
|
||||
if (
|
||||
highlight_text_inside_block is not None
|
||||
and len(highlight_text_inside_block) > 0
|
||||
and len(matching_val_area) > 0
|
||||
):
|
||||
highlight_bbox_list = []
|
||||
merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
|
||||
|
|
@@ -332,14 +358,14 @@ class PDFUtil:
|
|||
else:
|
||||
pure_number_match = pure_number_regex.match(highlight_text_inside_block)
|
||||
if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
|
||||
for text_bbox in text_bbox_area:
|
||||
for area in text_bbox_area:
|
||||
# get text by text_bbox
|
||||
copy_text_bbox = deepcopy(text_bbox)
|
||||
copy_text_bbox.x0 -= 10
|
||||
copy_text_bbox.x1 += 10
|
||||
text = page.get_text("text", clip=copy_text_bbox).strip()
|
||||
copy_area = deepcopy(area)
|
||||
copy_area.x0 -= 10
|
||||
copy_area.x1 += 10
|
||||
text = page.get_text("text", clip=copy_area).strip()
|
||||
if text == highlight_text_inside_block:
|
||||
highlight_bbox_list.append(text_bbox)
|
||||
highlight_bbox_list.append(area)
|
||||
else:
|
||||
# get start and end index of the highlight_text_inside_block in text
|
||||
start_index = text.find(highlight_text_inside_block)
|
||||
|
|
@@ -352,7 +378,7 @@ class PDFUtil:
|
|||
next_char = text[end_index]
|
||||
if next_char not in [" ", "%", ")"]:
|
||||
continue
|
||||
highlight_bbox_list.append(text_bbox)
|
||||
highlight_bbox_list.append(area)
|
||||
else:
|
||||
highlight_bbox_list.extend(text_bbox_area)
|
||||
if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
|
||||
|
|
@@ -365,6 +391,8 @@ class PDFUtil:
|
|||
matching_val_area = [matching_val_area[0]]
|
||||
|
||||
if matching_val_area is not None and len(matching_val_area) > 0:
|
||||
if (highlight_text_inside_block is not None and len(highlight_text_inside_block.strip().split()) > 1) or \
|
||||
(highlight_text_inside_block is None and len(text_block.strip().split()) > 1):
|
||||
matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
|
||||
if exact_match:
|
||||
matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)
|
||||
|
|
|
|||
Loading…
Reference in New Issue