diff --git a/main.py b/main.py index 8ee0a7a..71ecd84 100644 --- a/main.py +++ b/main.py @@ -937,113 +937,78 @@ if __name__ == "__main__": # "471641628", # ] # English documents with ground truth - check_db_mapping_doc_id_list = [ - "292989214", - "316237292", - "321733631", - "323390570", - "327956364", - "332223498", - "333207452", - "334718372", - "344636875", - "362246081", - "366179419", - "380945052", - "382366116", - "387202452", - "389171486", - "391456740", - "391736837", - "394778487", - "401684600", - "402113224", - "402181770", - "402397014", - "405803396", - "445102363", - "445256897", - "448265376", - "449555622", - "449623976", - "458291624", - "458359181", - "463081566", - "469138353", - "471641628", - "476492237", - "478585901", - "478586066", - "479042264", - "479042269", - "479793787", - "481475385", - "483617247", - "486378555", - "486383912", - "492121213", - "497497599", - "502693599" - ] - - # Sample documents with special cases # check_db_mapping_doc_id_list = [ - # "334584772", - # "406913630", - # "407275419", - # "337937633", - # "337293427", - # "334584772", - # "404712928", - # "451063582", - # "451878128", - # "425595958", - # "536344026", - # "532422548", - # "423418540", - # "423418395", - # "532998065", - # "540307575", - # "423395975", - # "508704368", - # "481482392", - # "466580448", - # "423365707", - # "423364758", - # "422761666", - # "422760156", - # "422760148", - # "422686965", - # "492029971", - # "510300817", - # "512745032", - # "514213638", - # "527525440", - # "534535767" + # "292989214", + # "316237292", + # "321733631", + # "323390570", + # "327956364", + # "332223498", + # "333207452", + # "334718372", + # "344636875", + # "362246081", + # "366179419", + # "380945052", + # "382366116", + # "387202452", + # "389171486", + # "391456740", + # "391736837", + # "394778487", + # "401684600", + # "402113224", + # "402181770", + # "402397014", + # "405803396", + # "445102363", + # "445256897", + # "448265376", + # "449555622", + # "449623976", + # "458291624", + # "458359181", + # "463081566", + # "469138353", + # "471641628", + # "476492237", + # "478585901", + # "478586066", + # "479042264", + # "479042269", + # "479793787", + # "481475385", + # "483617247", + # "486378555", + # "486383912", + # "492121213", + # "497497599", + # "502693599" # ] + # Documents in EMEA Case 1.docx - check_db_mapping_doc_id_list = [ - "424976833", - "425480144", - "427637151", - "429564034", - "429950833", - "430240853", - "431073795", - "434710819", - "434851173", - "434902020", - "434924914", - "435128656", - "440029306", - "466371135", - "466528487", - "466859621", - "466860852", - "467595142", - "467788879", - "470515549" - ] + # check_db_mapping_doc_id_list = [ + # "424976833", + # "425480144", + # "427637151", + # "429564034", + # "429950833", + # "430240853", + # "431073795", + # "434710819", + # "434851173", + # "434902020", + # "434924914", + # "435128656", + # "440029306", + # "466371135", + # "466528487", + # "466859621", + # "466860852", + # "467595142", + # "467788879", + # "470515549" + # ] # documents in New EMEA Documents sample.xlsx as typical documents # """ @@ -1091,43 +1056,43 @@ if __name__ == "__main__": # ] # documents in Final list of EMEA documents.xlsx as typical documents - check_db_mapping_doc_id_list = [ - "532500349", - "535324239", - "532442891", - "543243650", - "528588598", - "532437639", - "527525440", - "534987291", - "534112055", - "533482585", - "544208174", - "534547266", - "544713166", - "526463547", - "534535569", - "534106067", - "532486560", - "532781760", - "533727067", - "527256381", - "533392425", - "532179676", - "534300608", - "539233950", - # "533727908", - "532438414", - "533681744", - "537654645", - "533594905", - "537926443", - "533499655", - "533862814", - "544918611", - "539087870", - "536343790" - ] + # check_db_mapping_doc_id_list = [ + # "532500349", + # "535324239", + # "532442891", + # "543243650", + # "528588598", + # "532437639", + # "527525440", + # "534987291", + # "534112055", + # "533482585", + # "544208174", + # "534547266", + # "544713166", + # "526463547", + # "534535569", + # "534106067", + # "532486560", + # "532781760", + # "533727067", + # "527256381", + # "533392425", + # "532179676", + # "534300608", + # "539233950", + # # "533727908", + # "532438414", + # "533681744", + # "537654645", + # "533594905", + # "537926443", + # "533499655", + # "533862814", + # "544918611", + # "539087870", + # "536343790" + # ] # document samples 2024-11-06 # check_db_mapping_doc_id_list = ["546483469", @@ -1162,122 +1127,74 @@ if __name__ == "__main__": # "546919329"] # document samples: 30 documents, all with 4 data points - check_db_mapping_doc_id_list = ["479742284", - "501380497", - "501380553", - "501380775", - "501380801", - "501600428", - "501600429", - "501600541", - "501600549", - "503659548", - "506326520", - "507720522", - "507928179", - "508981020", - "509133771", - "509743502", - "514636951", - "514636952", - "514636953", - "514636954", - "514636955", - "514636957", - "514636958", - "514636959", - "514636985", - "514636988", - "514636990", - "514636993", - "514636994", - "539794746", + # check_db_mapping_doc_id_list = ["479742284", + # "501380497", + # "501380553", + # "501380775", + # "501380801", + # "501600428", + # "501600429", + # "501600541", + # "501600549", + # "503659548", + # "506326520", + # "507720522", + # "507928179", + # "508981020", + # "509133771", + # "509743502", + # "514636951", + # "514636952", + # "514636953", + # "514636954", + # "514636955", + # "514636957", + # "514636958", + # "514636959", + # "514636985", + # "514636988", + # "514636990", + # "514636993", + # "514636994", + # "539794746", + # ] + # Sample documents with special cases + check_db_mapping_doc_id_list = [ + "334584772", + "406913630", + "407275419", + "337937633", + "337293427", + "334584772", + "404712928", + "451063582", + "451878128", + "425595958", + "536344026", + "532422548", + "423418540", + "423418395", + "532998065", + "540307575", + "423395975", + "508704368", + "481482392", + "466580448", + "423365707", + "423364758", + "422761666", + "422760156", + "422760148", + "422686965", + "492029971", + "510300817", + "512745032", + "514213638", + "527525440", + "534535767" ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["479742284", - "501380497", - "501380553", - "501380775", - "501380801", - "501600428", - "501600429", - "501600541", - "501600549", - "503659548", - "506326520", - "507720522", - "507928179", - "508981020", - "509133771", - "509743502", - "514636951", - "514636952", - "514636953", - "514636954", - "514636955", - "514636957", - "514636958", - "514636959", - "514636985", - "514636988", - "514636990", - "514636993", - "514636994", - "539794746", - "546483469", - "546375582", - "546375575", - "546375576", - "546375577", - "546375568", - "546371033", - "546632761", - "546632544", - "546632464", - "546724583", - "546724552", - "546694677", - "546660422", - "546638908", - "546632845", - "546105299", - "546085481", - "546078693", - "546078650", - "546289930", - "546289910", - "542967371", - "542798238", - "546048730", - "546048143", - "546047619", - "546047528", - "546046730", - "546919329" - ] - special_doc_id_list = ["514636958", - "514636959", - "514636985", - "514636988", - "514636990", - "514636994", - "514636957", - "514636954", - "514636953", - "514636952", - "501600549", - "501600429", - "501380553", - "501380497", - "514636959", - "508981020"] - special_doc_id_list = ["501380801", - "501600541", - "507720522", - "509133771", - "514636951", - "514636955", - "514636993"] + # special_doc_id_list = ["334584772"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = True diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 6312d0b..6745754 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -11,6 +11,7 @@ from tqdm import tqdm import base64 from copy import deepcopy from utils.similarity import Similarity +from utils.biz_utils import total_currency_list from utils.logger import logger @@ -385,6 +386,21 @@ class PDFUtil: if highlight_value_search_text is None: highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True) highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False) + if highlight_value_search_text is None: + pure_highlight_value_splits = pure_highlight_value.split() + if pure_highlight_value_splits[-1].upper() in total_currency_list: + highlight_value_regex = self.add_slash_to_text_as_regex(' '.join(pure_highlight_value_splits[0:-1]), match_special_char_after_space=False) + highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False) + if highlight_value_search_text is None: + # If still can't find the highlight value, search in the previous page + previous_page_index = page_index - 1 + if previous_page_index >= 0: + previous_page = pdf_doc[previous_page_index] + previous_page_text = previous_page.get_text() + highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, previous_page_text, ignore_case=False) + if highlight_value_search_text is not None: + page_index = previous_page_index + page = previous_page annotation_data = {"pdf_file": self.simple_pdf_file, "page_index": page_index, @@ -611,7 +627,19 @@ class PDFUtil: if exact_match: matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block) # matching_val_area = self.merge_matching_val_area(matching_val_area) + + # get annotation list from current page + xrefs = [annot.xref for annot in page.annots()] + annotated_list = [] + for xref in xrefs: + annotated = page.load_annot(xref) + real_bbox_tuple = annotated.vertices[0] + annotated.vertices[3] + bbox = fitz.Rect(real_bbox_tuple) + annotated_list.append(bbox) + for area in matching_val_area: + if area in annotated_list: + continue highlight = page.add_highlight_annot([area]) bbox_list = [area.x0, area.y0, area.x1, area.y1] content["bbox"] = bbox_list