optimize drilldown algorithm
This commit is contained in:
parent
81f855f725
commit
c34e2e960e
115
main.py
115
main.py
|
|
@ -1119,44 +1119,10 @@ if __name__ == "__main__":
|
||||||
"514636990",
|
"514636990",
|
||||||
"514636993",
|
"514636993",
|
||||||
"514636994",
|
"514636994",
|
||||||
"539794746"
|
"539794746",
|
||||||
]
|
]
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["532500349",
|
special_doc_id_list = ["479742284",
|
||||||
"535324239",
|
|
||||||
"532442891",
|
|
||||||
"543243650",
|
|
||||||
"528588598",
|
|
||||||
"532437639",
|
|
||||||
"527525440",
|
|
||||||
"534987291",
|
|
||||||
"534112055",
|
|
||||||
"533482585",
|
|
||||||
"544208174",
|
|
||||||
"534547266",
|
|
||||||
"544713166",
|
|
||||||
"526463547",
|
|
||||||
"534535569",
|
|
||||||
"534106067",
|
|
||||||
"532486560",
|
|
||||||
"532781760",
|
|
||||||
"533727067",
|
|
||||||
"527256381",
|
|
||||||
"533392425",
|
|
||||||
"532179676",
|
|
||||||
"534300608",
|
|
||||||
"539233950",
|
|
||||||
"532438414",
|
|
||||||
"533681744",
|
|
||||||
"537654645",
|
|
||||||
"533594905",
|
|
||||||
"537926443",
|
|
||||||
"533499655",
|
|
||||||
"533862814",
|
|
||||||
"544918611",
|
|
||||||
"539087870",
|
|
||||||
"536343790",
|
|
||||||
"479742284",
|
|
||||||
"501380497",
|
"501380497",
|
||||||
"501380553",
|
"501380553",
|
||||||
"501380775",
|
"501380775",
|
||||||
|
|
@ -1186,52 +1152,37 @@ if __name__ == "__main__":
|
||||||
"514636993",
|
"514636993",
|
||||||
"514636994",
|
"514636994",
|
||||||
"539794746",
|
"539794746",
|
||||||
"292989214",
|
"546483469",
|
||||||
"316237292",
|
"546375582",
|
||||||
"321733631",
|
"546375575",
|
||||||
"323390570",
|
"546375576",
|
||||||
"327956364",
|
"546375577",
|
||||||
"332223498",
|
"546375568",
|
||||||
"333207452",
|
"546371033",
|
||||||
"334718372",
|
"546632761",
|
||||||
"344636875",
|
"546632544",
|
||||||
"362246081",
|
"546632464",
|
||||||
"366179419",
|
"546724583",
|
||||||
"380945052",
|
"546724552",
|
||||||
"382366116",
|
"546694677",
|
||||||
"387202452",
|
"546660422",
|
||||||
"389171486",
|
"546638908",
|
||||||
"391456740",
|
"546632845",
|
||||||
"391736837",
|
"546105299",
|
||||||
"394778487",
|
"546085481",
|
||||||
"401684600",
|
"546078693",
|
||||||
"402113224",
|
"546078650",
|
||||||
"402181770",
|
"546289930",
|
||||||
"402397014",
|
"546289910",
|
||||||
"405803396",
|
"542967371",
|
||||||
"445102363",
|
"542798238",
|
||||||
"445256897",
|
"546048730",
|
||||||
"448265376",
|
"546048143",
|
||||||
"449555622",
|
"546047619",
|
||||||
"449623976",
|
"546047528",
|
||||||
"458291624",
|
"546046730",
|
||||||
"458359181",
|
"546919329"
|
||||||
"463081566",
|
]
|
||||||
"469138353",
|
|
||||||
"471641628",
|
|
||||||
"476492237",
|
|
||||||
"478585901",
|
|
||||||
"478586066",
|
|
||||||
"479042264",
|
|
||||||
"479042269",
|
|
||||||
"479793787",
|
|
||||||
"481475385",
|
|
||||||
"483617247",
|
|
||||||
"486378555",
|
|
||||||
"486383912",
|
|
||||||
"492121213",
|
|
||||||
"497497599",
|
|
||||||
"502693599"]
|
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
|
|
|
||||||
|
|
@ -313,9 +313,35 @@ class PDFUtil:
|
||||||
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
|
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
|
||||||
if len(matching_val_area) == 0:
|
if len(matching_val_area) == 0:
|
||||||
matching_val_area = page.search_for(text_block.replace('-\n', ''))
|
matching_val_area = page.search_for(text_block.replace('-\n', ''))
|
||||||
|
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
|
||||||
|
new_matching_val_area = []
|
||||||
|
for area in matching_val_area:
|
||||||
|
# get text by text_bbox
|
||||||
|
pure_text_block = text_block.strip()
|
||||||
|
copy_area = deepcopy(area)
|
||||||
|
copy_area.x0 -= 10
|
||||||
|
copy_area.x1 += 10
|
||||||
|
text = page.get_text("text", clip=copy_area).strip()
|
||||||
|
if text == pure_text_block:
|
||||||
|
new_matching_val_area.append(area)
|
||||||
|
else:
|
||||||
|
# get start and end index of the pure_text_block in text
|
||||||
|
start_index = text.find(pure_text_block)
|
||||||
|
if start_index > 0:
|
||||||
|
previous_char = text[start_index - 1]
|
||||||
|
if previous_char not in [" ", "("]:
|
||||||
|
continue
|
||||||
|
end_index = start_index + len(pure_text_block)
|
||||||
|
if end_index < len(text):
|
||||||
|
next_char = text[end_index]
|
||||||
|
if next_char not in [" ", "%", ")"]:
|
||||||
|
continue
|
||||||
|
new_matching_val_area.append(area)
|
||||||
|
matching_val_area = new_matching_val_area
|
||||||
if (
|
if (
|
||||||
highlight_text_inside_block is not None
|
highlight_text_inside_block is not None
|
||||||
and len(highlight_text_inside_block) > 0
|
and len(highlight_text_inside_block) > 0
|
||||||
|
and len(matching_val_area) > 0
|
||||||
):
|
):
|
||||||
highlight_bbox_list = []
|
highlight_bbox_list = []
|
||||||
merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
|
merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
|
||||||
|
|
@ -332,14 +358,14 @@ class PDFUtil:
|
||||||
else:
|
else:
|
||||||
pure_number_match = pure_number_regex.match(highlight_text_inside_block)
|
pure_number_match = pure_number_regex.match(highlight_text_inside_block)
|
||||||
if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
|
if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
|
||||||
for text_bbox in text_bbox_area:
|
for area in text_bbox_area:
|
||||||
# get text by text_bbox
|
# get text by text_bbox
|
||||||
copy_text_bbox = deepcopy(text_bbox)
|
copy_area = deepcopy(area)
|
||||||
copy_text_bbox.x0 -= 10
|
copy_area.x0 -= 10
|
||||||
copy_text_bbox.x1 += 10
|
copy_area.x1 += 10
|
||||||
text = page.get_text("text", clip=copy_text_bbox).strip()
|
text = page.get_text("text", clip=copy_area).strip()
|
||||||
if text == highlight_text_inside_block:
|
if text == highlight_text_inside_block:
|
||||||
highlight_bbox_list.append(text_bbox)
|
highlight_bbox_list.append(area)
|
||||||
else:
|
else:
|
||||||
# get start and end index of the highlight_text_inside_block in text
|
# get start and end index of the highlight_text_inside_block in text
|
||||||
start_index = text.find(highlight_text_inside_block)
|
start_index = text.find(highlight_text_inside_block)
|
||||||
|
|
@ -352,7 +378,7 @@ class PDFUtil:
|
||||||
next_char = text[end_index]
|
next_char = text[end_index]
|
||||||
if next_char not in [" ", "%", ")"]:
|
if next_char not in [" ", "%", ")"]:
|
||||||
continue
|
continue
|
||||||
highlight_bbox_list.append(text_bbox)
|
highlight_bbox_list.append(area)
|
||||||
else:
|
else:
|
||||||
highlight_bbox_list.extend(text_bbox_area)
|
highlight_bbox_list.extend(text_bbox_area)
|
||||||
if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
|
if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
|
||||||
|
|
@ -365,6 +391,8 @@ class PDFUtil:
|
||||||
matching_val_area = [matching_val_area[0]]
|
matching_val_area = [matching_val_area[0]]
|
||||||
|
|
||||||
if matching_val_area is not None and len(matching_val_area) > 0:
|
if matching_val_area is not None and len(matching_val_area) > 0:
|
||||||
|
if (highlight_text_inside_block is not None and len(highlight_text_inside_block.strip().split()) > 1) or \
|
||||||
|
(highlight_text_inside_block is None and len(text_block.strip().split()) > 1):
|
||||||
matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
|
matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
|
||||||
if exact_match:
|
if exact_match:
|
||||||
matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)
|
matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue