diff --git a/main.py b/main.py index 27b363d..2cbd4c3 100644 --- a/main.py +++ b/main.py @@ -1119,119 +1119,70 @@ if __name__ == "__main__": "514636990", "514636993", "514636994", - "539794746" + "539794746", ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["532500349", - "535324239", - "532442891", - "543243650", - "528588598", - "532437639", - "527525440", - "534987291", - "534112055", - "533482585", - "544208174", - "534547266", - "544713166", - "526463547", - "534535569", - "534106067", - "532486560", - "532781760", - "533727067", - "527256381", - "533392425", - "532179676", - "534300608", - "539233950", - "532438414", - "533681744", - "537654645", - "533594905", - "537926443", - "533499655", - "533862814", - "544918611", - "539087870", - "536343790", - "479742284", - "501380497", - "501380553", - "501380775", - "501380801", - "501600428", - "501600429", - "501600541", - "501600549", - "503659548", - "506326520", - "507720522", - "507928179", - "508981020", - "509133771", - "509743502", - "514636951", - "514636952", - "514636953", - "514636954", - "514636955", - "514636957", - "514636958", - "514636959", - "514636985", - "514636988", - "514636990", - "514636993", - "514636994", - "539794746", - "292989214", - "316237292", - "321733631", - "323390570", - "327956364", - "332223498", - "333207452", - "334718372", - "344636875", - "362246081", - "366179419", - "380945052", - "382366116", - "387202452", - "389171486", - "391456740", - "391736837", - "394778487", - "401684600", - "402113224", - "402181770", - "402397014", - "405803396", - "445102363", - "445256897", - "448265376", - "449555622", - "449623976", - "458291624", - "458359181", - "463081566", - "469138353", - "471641628", - "476492237", - "478585901", - "478586066", - "479042264", - "479042269", - "479793787", - "481475385", - "483617247", - "486378555", - "486383912", - "492121213", - "497497599", - "502693599"] + special_doc_id_list = ["479742284", + "501380497", + "501380553", + "501380775", + "501380801", + "501600428", + "501600429", + "501600541", + "501600549", + "503659548", + "506326520", + "507720522", + "507928179", + "508981020", + "509133771", + "509743502", + "514636951", + "514636952", + "514636953", + "514636954", + "514636955", + "514636957", + "514636958", + "514636959", + "514636985", + "514636988", + "514636990", + "514636993", + "514636994", + "539794746", + "546483469", + "546375582", + "546375575", + "546375576", + "546375577", + "546375568", + "546371033", + "546632761", + "546632544", + "546632464", + "546724583", + "546724552", + "546694677", + "546660422", + "546638908", + "546632845", + "546105299", + "546085481", + "546078693", + "546078650", + "546289930", + "546289910", + "542967371", + "542798238", + "546048730", + "546048143", + "546047619", + "546047528", + "546046730", + "546919329" + ] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/pdf_util.py b/utils/pdf_util.py index f9d6503..d643d40 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -313,9 +313,35 @@ class PDFUtil: matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', '')) if len(matching_val_area) == 0: matching_val_area = page.search_for(text_block.replace('-\n', '')) + if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1: + new_matching_val_area = [] + for area in matching_val_area: + # get text by text_bbox + pure_text_block = text_block.strip() + copy_area = deepcopy(area) + copy_area.x0 -= 10 + copy_area.x1 += 10 + text = page.get_text("text", clip=copy_area).strip() + if text == pure_text_block: + new_matching_val_area.append(area) + else: + # get start and end index of the pure_text_block in text + start_index = text.find(pure_text_block) + if start_index > 0: + previous_char = text[start_index - 1] + if previous_char not in [" ", "("]: + continue + end_index = start_index + len(pure_text_block) + if end_index < len(text): + next_char = text[end_index] + if next_char not in [" ", "%", ")"]: + continue + new_matching_val_area.append(area) + matching_val_area = new_matching_val_area if ( highlight_text_inside_block is not None and len(highlight_text_inside_block) > 0 + and len(matching_val_area) > 0 ): highlight_bbox_list = [] merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) @@ -332,14 +358,14 @@ class PDFUtil: else: pure_number_match = pure_number_regex.match(highlight_text_inside_block) if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block: - for text_bbox in text_bbox_area: + for area in text_bbox_area: # get text by text_bbox - copy_text_bbox = deepcopy(text_bbox) - copy_text_bbox.x0 -= 10 - copy_text_bbox.x1 += 10 - text = page.get_text("text", clip=copy_text_bbox).strip() + copy_area = deepcopy(area) + copy_area.x0 -= 10 + copy_area.x1 += 10 + text = page.get_text("text", clip=copy_area).strip() if text == highlight_text_inside_block: - highlight_bbox_list.append(text_bbox) + highlight_bbox_list.append(area) else: # get start and end index of the highlight_text_inside_block in text start_index = text.find(highlight_text_inside_block) @@ -352,7 +378,7 @@ class PDFUtil: next_char = text[end_index] if next_char not in [" ", "%", ")"]: continue - highlight_bbox_list.append(text_bbox) + highlight_bbox_list.append(area) else: highlight_bbox_list.extend(text_bbox_area) if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2: @@ -365,7 +391,9 @@ class PDFUtil: matching_val_area = [matching_val_area[0]] if matching_val_area is not None and len(matching_val_area) > 0: - matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) + if (highlight_text_inside_block is not None and len(highlight_text_inside_block.strip().split()) > 1) or \ + (highlight_text_inside_block is None and len(text_block.strip().split()) > 1): + matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) if exact_match: matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block) # matching_val_area = self.merge_matching_val_area(matching_val_area)