Optimize drilldown algorithm

Share class names with currency
Reason
The currency in the document is not always next to the share class name.
Solution
If the relevant text cannot be found in the PDF page contents and the last word of the share class name is a currency, remove the currency from the share class name and try again.
After implementing this solution, recall improves from 95% to 96%.
Cannot find relevant text in the current PDF page text
Reason
The relevant text may actually come from the previous page, so try to merge the previous page text into the current page.
Solution
Get the previous page and search it for the relevant value.
After implementing this solution, recall improves from 96% to 98%.
This commit is contained in:
Blade He 2024-11-26 16:35:07 -06:00
parent a09778d9d1
commit 276ff93a1d
2 changed files with 200 additions and 255 deletions

427
main.py
View File

@ -937,113 +937,78 @@ if __name__ == "__main__":
# "471641628", # "471641628",
# ] # ]
# English documents with ground truth # English documents with ground truth
check_db_mapping_doc_id_list = [
"292989214",
"316237292",
"321733631",
"323390570",
"327956364",
"332223498",
"333207452",
"334718372",
"344636875",
"362246081",
"366179419",
"380945052",
"382366116",
"387202452",
"389171486",
"391456740",
"391736837",
"394778487",
"401684600",
"402113224",
"402181770",
"402397014",
"405803396",
"445102363",
"445256897",
"448265376",
"449555622",
"449623976",
"458291624",
"458359181",
"463081566",
"469138353",
"471641628",
"476492237",
"478585901",
"478586066",
"479042264",
"479042269",
"479793787",
"481475385",
"483617247",
"486378555",
"486383912",
"492121213",
"497497599",
"502693599"
]
# Sample documents with special cases
# check_db_mapping_doc_id_list = [ # check_db_mapping_doc_id_list = [
# "334584772", # "292989214",
# "406913630", # "316237292",
# "407275419", # "321733631",
# "337937633", # "323390570",
# "337293427", # "327956364",
# "334584772", # "332223498",
# "404712928", # "333207452",
# "451063582", # "334718372",
# "451878128", # "344636875",
# "425595958", # "362246081",
# "536344026", # "366179419",
# "532422548", # "380945052",
# "423418540", # "382366116",
# "423418395", # "387202452",
# "532998065", # "389171486",
# "540307575", # "391456740",
# "423395975", # "391736837",
# "508704368", # "394778487",
# "481482392", # "401684600",
# "466580448", # "402113224",
# "423365707", # "402181770",
# "423364758", # "402397014",
# "422761666", # "405803396",
# "422760156", # "445102363",
# "422760148", # "445256897",
# "422686965", # "448265376",
# "492029971", # "449555622",
# "510300817", # "449623976",
# "512745032", # "458291624",
# "514213638", # "458359181",
# "527525440", # "463081566",
# "534535767" # "469138353",
# "471641628",
# "476492237",
# "478585901",
# "478586066",
# "479042264",
# "479042269",
# "479793787",
# "481475385",
# "483617247",
# "486378555",
# "486383912",
# "492121213",
# "497497599",
# "502693599"
# ] # ]
# Documents in EMEA Case 1.docx # Documents in EMEA Case 1.docx
check_db_mapping_doc_id_list = [ # check_db_mapping_doc_id_list = [
"424976833", # "424976833",
"425480144", # "425480144",
"427637151", # "427637151",
"429564034", # "429564034",
"429950833", # "429950833",
"430240853", # "430240853",
"431073795", # "431073795",
"434710819", # "434710819",
"434851173", # "434851173",
"434902020", # "434902020",
"434924914", # "434924914",
"435128656", # "435128656",
"440029306", # "440029306",
"466371135", # "466371135",
"466528487", # "466528487",
"466859621", # "466859621",
"466860852", # "466860852",
"467595142", # "467595142",
"467788879", # "467788879",
"470515549" # "470515549"
] # ]
# documents in New EMEA Documents sample.xlsx as typical documents # documents in New EMEA Documents sample.xlsx as typical documents
# """ # """
@ -1091,43 +1056,43 @@ if __name__ == "__main__":
# ] # ]
# documents in Final list of EMEA documents.xlsx as typical documents # documents in Final list of EMEA documents.xlsx as typical documents
check_db_mapping_doc_id_list = [ # check_db_mapping_doc_id_list = [
"532500349", # "532500349",
"535324239", # "535324239",
"532442891", # "532442891",
"543243650", # "543243650",
"528588598", # "528588598",
"532437639", # "532437639",
"527525440", # "527525440",
"534987291", # "534987291",
"534112055", # "534112055",
"533482585", # "533482585",
"544208174", # "544208174",
"534547266", # "534547266",
"544713166", # "544713166",
"526463547", # "526463547",
"534535569", # "534535569",
"534106067", # "534106067",
"532486560", # "532486560",
"532781760", # "532781760",
"533727067", # "533727067",
"527256381", # "527256381",
"533392425", # "533392425",
"532179676", # "532179676",
"534300608", # "534300608",
"539233950", # "539233950",
# "533727908", # # "533727908",
"532438414", # "532438414",
"533681744", # "533681744",
"537654645", # "537654645",
"533594905", # "533594905",
"537926443", # "537926443",
"533499655", # "533499655",
"533862814", # "533862814",
"544918611", # "544918611",
"539087870", # "539087870",
"536343790" # "536343790"
] # ]
# document samples 2024-11-06 # document samples 2024-11-06
# check_db_mapping_doc_id_list = ["546483469", # check_db_mapping_doc_id_list = ["546483469",
@ -1162,122 +1127,74 @@ if __name__ == "__main__":
# "546919329"] # "546919329"]
# document samples: 30 documents, all with 4 data points # document samples: 30 documents, all with 4 data points
check_db_mapping_doc_id_list = ["479742284", # check_db_mapping_doc_id_list = ["479742284",
"501380497", # "501380497",
"501380553", # "501380553",
"501380775", # "501380775",
"501380801", # "501380801",
"501600428", # "501600428",
"501600429", # "501600429",
"501600541", # "501600541",
"501600549", # "501600549",
"503659548", # "503659548",
"506326520", # "506326520",
"507720522", # "507720522",
"507928179", # "507928179",
"508981020", # "508981020",
"509133771", # "509133771",
"509743502", # "509743502",
"514636951", # "514636951",
"514636952", # "514636952",
"514636953", # "514636953",
"514636954", # "514636954",
"514636955", # "514636955",
"514636957", # "514636957",
"514636958", # "514636958",
"514636959", # "514636959",
"514636985", # "514636985",
"514636988", # "514636988",
"514636990", # "514636990",
"514636993", # "514636993",
"514636994", # "514636994",
"539794746", # "539794746",
# ]
# Sample documents with special cases
check_db_mapping_doc_id_list = [
"334584772",
"406913630",
"407275419",
"337937633",
"337293427",
"334584772",
"404712928",
"451063582",
"451878128",
"425595958",
"536344026",
"532422548",
"423418540",
"423418395",
"532998065",
"540307575",
"423395975",
"508704368",
"481482392",
"466580448",
"423365707",
"423364758",
"422761666",
"422760156",
"422760148",
"422686965",
"492029971",
"510300817",
"512745032",
"514213638",
"527525440",
"534535767"
] ]
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["479742284", # special_doc_id_list = ["334584772"]
"501380497",
"501380553",
"501380775",
"501380801",
"501600428",
"501600429",
"501600541",
"501600549",
"503659548",
"506326520",
"507720522",
"507928179",
"508981020",
"509133771",
"509743502",
"514636951",
"514636952",
"514636953",
"514636954",
"514636955",
"514636957",
"514636958",
"514636959",
"514636985",
"514636988",
"514636990",
"514636993",
"514636994",
"539794746",
"546483469",
"546375582",
"546375575",
"546375576",
"546375577",
"546375568",
"546371033",
"546632761",
"546632544",
"546632464",
"546724583",
"546724552",
"546694677",
"546660422",
"546638908",
"546632845",
"546105299",
"546085481",
"546078693",
"546078650",
"546289930",
"546289910",
"542967371",
"542798238",
"546048730",
"546048143",
"546047619",
"546047528",
"546046730",
"546919329"
]
special_doc_id_list = ["514636958",
"514636959",
"514636985",
"514636988",
"514636990",
"514636994",
"514636957",
"514636954",
"514636953",
"514636952",
"501600549",
"501600429",
"501380553",
"501380497",
"514636959",
"508981020"]
special_doc_id_list = ["501380801",
"501600541",
"507720522",
"509133771",
"514636951",
"514636955",
"514636993"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = True

View File

@ -11,6 +11,7 @@ from tqdm import tqdm
import base64 import base64
from copy import deepcopy from copy import deepcopy
from utils.similarity import Similarity from utils.similarity import Similarity
from utils.biz_utils import total_currency_list
from utils.logger import logger from utils.logger import logger
@ -385,6 +386,21 @@ class PDFUtil:
if highlight_value_search_text is None: if highlight_value_search_text is None:
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True) highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False) highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
if highlight_value_search_text is None:
pure_highlight_value_splits = pure_highlight_value.split()
if pure_highlight_value_splits[-1].upper() in total_currency_list:
highlight_value_regex = self.add_slash_to_text_as_regex(' '.join(pure_highlight_value_splits[0:-1]), match_special_char_after_space=False)
highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
if highlight_value_search_text is None:
# If still can't find the highlight value, search in the previous page
previous_page_index = page_index - 1
if previous_page_index >= 0:
previous_page = pdf_doc[previous_page_index]
previous_page_text = previous_page.get_text()
highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, previous_page_text, ignore_case=False)
if highlight_value_search_text is not None:
page_index = previous_page_index
page = previous_page
annotation_data = {"pdf_file": self.simple_pdf_file, annotation_data = {"pdf_file": self.simple_pdf_file,
"page_index": page_index, "page_index": page_index,
@ -611,7 +627,19 @@ class PDFUtil:
if exact_match: if exact_match:
matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block) matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)
# matching_val_area = self.merge_matching_val_area(matching_val_area) # matching_val_area = self.merge_matching_val_area(matching_val_area)
# get annotation list from current page
xrefs = [annot.xref for annot in page.annots()]
annotated_list = []
for xref in xrefs:
annotated = page.load_annot(xref)
real_bbox_tuple = annotated.vertices[0] + annotated.vertices[3]
bbox = fitz.Rect(real_bbox_tuple)
annotated_list.append(bbox)
for area in matching_val_area: for area in matching_val_area:
if area in annotated_list:
continue
highlight = page.add_highlight_annot([area]) highlight = page.add_highlight_annot([area])
bbox_list = [area.x0, area.y0, area.x1, area.y1] bbox_list = [area.x0, area.y0, area.x1, area.y1]
content["bbox"] = bbox_list content["bbox"] = bbox_list