diff --git a/main.py b/main.py index 1ca1d8f..27b363d 100644 --- a/main.py +++ b/main.py @@ -4,9 +4,15 @@ import pandas as pd from glob import glob from tqdm import tqdm import time +import fitz +import re +from io import BytesIO +from traceback import print_exc from utils.logger import logger from utils.pdf_download import download_pdf_from_documents_warehouse from utils.sql_query_util import query_document_fund_mapping +from utils.pdf_util import PDFUtil +from utils.biz_utils import add_slash_to_text_as_regex from core.page_filter import FilterPages from core.data_extraction import DataExtraction from core.data_mapping import DataMapping @@ -21,6 +27,7 @@ class EMEA_AR_Parsing: output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/", extract_way: str = "text", + drilldown_folder: str = r"/data/emea_ar/output/drilldown/", ) -> None: self.doc_id = doc_id self.pdf_folder = pdf_folder @@ -66,7 +73,12 @@ class EMEA_AR_Parsing: self.page_text_dict = self.filter_pages.page_text_dict self.datapoint_page_info, self.result_details = self.get_datapoint_page_info() self.datapoints = self.get_datapoints_from_datapoint_page_info() - + + if drilldown_folder is None or len(drilldown_folder) == 0: + drilldown_folder = r"/data/emea_ar/output/drilldown/" + os.makedirs(drilldown_folder, exist_ok=True) + self.drilldown_folder = drilldown_folder + def download_pdf(self) -> str: pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id) return pdf_file @@ -85,6 +97,7 @@ class EMEA_AR_Parsing: self, re_run: bool = False, ) -> list: + found_data = False if not re_run: output_data_json_folder = os.path.join( self.output_extract_data_folder, "json/" @@ -97,26 +110,147 @@ class EMEA_AR_Parsing: ) with open(json_file, "r", encoding="utf-8") as f: data_from_gpt = json.load(f) - return data_from_gpt - - try: - data_extraction = DataExtraction( - self.doc_id, - self.pdf_file, - 
def drilldown_pdf_document(self, data_from_gpt: list) -> "str | None":
    """
    Highlight the extracted values on their source pages and save an annotated PDF.

    data_from_gpt: list of per-page extraction results; each item is read for
        "page_index" and "extract_data" ("data" and "dp_reported_name").
    return: path of the annotated PDF, or None when nothing was highlighted
        or the file could not be saved.
    """
    # The extraction error path hands over {"data": []} (a dict). Iterating a
    # dict yields its keys, so anything that is not a non-empty list is skipped
    # before the PDF is even opened.
    if not isinstance(data_from_gpt, list) or not data_from_gpt:
        return None

    logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}")
    pdf_util = PDFUtil(self.pdf_file)
    pdf_doc = self.get_pdf_doc(self.pdf_file)
    # These data points are never highlighted by the drilldown.
    skipped_data_points = ("ter", "ogc", "performance_fee")
    highlight_annotation = False
    try:
        for data in data_from_gpt:
            if not isinstance(data, dict):
                continue
            page_index = data.get("page_index", -1)
            if page_index == -1:
                continue
            extract_data = data.get("extract_data") or {}

            # Collect (data_point, value) pairs in the original order:
            # extracted values first, then the reported names.
            candidates = []
            for extract_item in extract_data.get("data", []):
                for data_point, value in extract_item.items():
                    if data_point not in skipped_data_points:
                        candidates.append((data_point, value))
            for data_point, reported_name in extract_data.get("dp_reported_name", {}).items():
                candidates.append((f"{data_point}_reported_name", reported_name))

            # A value is searched at most once per page; a list (not a set) is
            # used because values are not guaranteed to be hashable.
            highlighted_value_list = []
            for data_point, value in candidates:
                if value in highlighted_value_list:
                    continue
                drilldown_data = self.highlight_pdf_doc(pdf_doc=pdf_doc,
                                                        page_index=page_index,
                                                        highlight_value=value,
                                                        data_point=data_point,
                                                        pdf_util=pdf_util)
                if drilldown_data.get("matching_val_area"):
                    highlight_annotation = True
                highlighted_value_list.append(value)
    except Exception:
        # Do not leak the open document when highlighting fails.
        pdf_doc.close()
        raise

    if not highlight_annotation:
        pdf_doc.close()
        return None
    # save_annotated_pdf closes the document itself.
    return self.save_annotated_pdf(pdf_doc)


def highlight_pdf_doc(self,
                      pdf_doc: "fitz.Document",
                      page_index: int,
                      highlight_value: str,
                      data_point: str = None,
                      pdf_util: "PDFUtil" = None,):
    """
    Highlight one value on one page and report where it was found.

    pdf_doc: opened document to annotate
    page_index: index of the page to search
    highlight_value: value to look for; converted with str(), None is treated as empty
    data_point: name of the data point, used as annotation title and content
    pdf_util: PDFUtil used for highlighting; created from self.pdf_file when omitted
    return: dict with "DocumentId", "page_index", "data_point", "value" and
        "matching_val_area" (list of [x0, y0, x1, y1], ordered top-to-bottom,
        left-to-right; empty when nothing was highlighted)
    """
    # str(None) would make the search look for the literal text "None".
    highlight_value = "" if highlight_value is None else str(highlight_value)
    drilldown_data = {"DocumentId": self.doc_id,
                      "page_index": page_index,
                      "data_point": data_point,
                      "value": highlight_value,
                      "matching_val_area": []}
    if not highlight_value.strip():
        return drilldown_data

    page = pdf_doc[page_index]
    page_text = page.get_text()
    try:
        highlight_value_regex = add_slash_to_text_as_regex(highlight_value)
        highlight_value_search = re.search(highlight_value_regex, page_text)
    except re.error as e:
        # A value that cannot be turned into a valid pattern is simply not highlighted.
        logger.error(f"Invalid drilldown regex for value {highlight_value!r}: {e}")
        return drilldown_data
    if highlight_value_search is None:
        return drilldown_data

    if pdf_util is None:
        pdf_util = PDFUtil(self.pdf_file)
    content = {
        "data_point": data_point,
        "data_value": highlight_value
    }
    matching_val_area = pdf_util.highlight_matching_data(
        page=page,
        text_block=highlight_value_search.group(),
        content=content,
        title=data_point,
        only_hightlight_first=False,
        merge_nearby_lines=False
    )

    # order bbox_list by y0, x0, y1, x1
    drilldown_data["matching_val_area"] = sorted(
        ([area.x0, area.y0, area.x1, area.y1] for area in (matching_val_area or [])),
        key=lambda bbox: (bbox[1], bbox[0], bbox[3], bbox[2]),
    )
    return drilldown_data


def get_pdf_doc(self, pdf_file):
    """
    Open a PDF and, when it is encrypted, authenticate with an empty password.

    pdf_file: path of the PDF file
    return: the opened fitz document
    """
    pdf_doc = fitz.open(pdf_file)
    # The attribute is spelled is_encrypted or isEncrypted depending on the
    # installed PyMuPDF version; look it up explicitly instead of using a bare except.
    pdf_encrypted = getattr(pdf_doc, "is_encrypted", None)
    if pdf_encrypted is None:
        pdf_encrypted = getattr(pdf_doc, "isEncrypted", False)
    if pdf_encrypted:
        pdf_doc.authenticate("")
    return pdf_doc


def save_annotated_pdf(self, pdf_doc: "fitz.Document"):
    """
    Write the annotated document to <drilldown_folder>/pdf/<name>_annotated<ext> and close it.

    pdf_doc: annotated document; it is closed before returning
    return: path of the written file, or None when there is nothing to save
        or saving failed (the error is logged)
    """
    # "or", not "and": a missing document and a closed document are both "nothing to save".
    if pdf_doc is None or pdf_doc.is_closed:
        return None
    try:
        # splitext only touches the real extension, unlike str.replace(".pdf", ...).
        file_stem, file_ext = os.path.splitext(os.path.basename(self.pdf_file))
        pdf_file_name = f"{file_stem}_annotated{file_ext or '.pdf'}"
        output_pdf_dir = os.path.join(self.drilldown_folder, "pdf/")
        os.makedirs(output_pdf_dir, exist_ok=True)
        pdf_file_path = os.path.join(output_pdf_dir, pdf_file_name)
        # Render into memory first so a failing save never leaves a truncated file behind.
        output_buffer = BytesIO()
        pdf_doc.save(output_buffer)

        # Save the output buffer to the output file
        with open(pdf_file_path, mode="wb") as f:
            f.write(output_buffer.getbuffer())
        logger.info(f"File saved to {pdf_file_path}")
        return pdf_file_path
    except Exception as e:
        print_exc()
        logger.error(f"Error when save output file: {e}")
        return None
    finally:
        pdf_doc.close()
"514636954", + "514636955", + "514636957", + "514636958", + "514636959", + "514636985", + "514636988", + "514636990", + "514636993", + "514636994", + "539794746", + "292989214", + "316237292", + "321733631", + "323390570", + "327956364", + "332223498", + "333207452", + "334718372", + "344636875", + "362246081", + "366179419", + "380945052", + "382366116", + "387202452", + "389171486", + "391456740", + "391736837", + "394778487", + "401684600", + "402113224", + "402181770", + "402397014", + "405803396", + "445102363", + "445256897", + "448265376", + "449555622", + "449623976", + "458291624", + "458359181", + "463081566", + "469138353", + "471641628", + "476492237", + "478585901", + "478586066", + "479042264", + "479042269", + "479793787", + "481475385", + "483617247", + "486378555", + "486383912", + "492121213", + "497497599", + "502693599"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False re_run_mapping_data = False - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_ways = ["text"] diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 2ade806..ecdb4ab 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -65,8 +65,9 @@ def add_slash_to_text_as_regex(text: str): continue replace = r"\{0}".format(special_iter.group()) if replace not in text: - text = re.sub(replace, replace, text) - text = re.sub(r"\s+", r"\\s+", text) + text = re.sub(replace, r"\\W", text) + text = re.sub(r"( ){2,}", " ", text) + text = text.replace(" ", r"\s*") return text diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 1221b29..f9d6503 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -9,6 +9,7 @@ import json from traceback import print_exc from tqdm import tqdm import base64 +from copy import deepcopy from utils.similarity import Similarity from utils.logger import logger @@ -276,9 +277,20 @@ class 
PDFUtil: title: str = "", only_hightlight_first: bool = False, exact_match: bool = False, + merge_nearby_lines: bool = False, ): """ Highlight matching values + page: page object in fitz.Document + within_bbox: bounding box to search for the text + text_block: text to search for in page text + highlight_text_inside_block: text to highlight inside parameter: text_block + content: content as JSON format to add to the highlight annotation, + please customize according to relevant business logic + title: title of the highlight annotation + only_hightlight_first: only highlight the first match + exact_match: exact match or not + merge_nearby_lines: merge nearby lines or not """ # logger.info(f"Highlighting matching values in {self.pdf_file}") if within_bbox is not None: @@ -295,6 +307,8 @@ class PDFUtil: matching_val_area = page.search_for(text_block) else: matching_val_area = page.search_for(text_block) + if len(matching_val_area) == 0: + matching_val_area = page.search_for(text_block.strip()) if len(matching_val_area) == 0: matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', '')) if len(matching_val_area) == 0: @@ -304,7 +318,9 @@ class PDFUtil: and len(highlight_text_inside_block) > 0 ): highlight_bbox_list = [] - for area in matching_val_area: + merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) + pure_number_regex = re.compile(r"^\d+$") + for area in merged_matching_val_area: text_bbox_area = page.search_for( highlight_text_inside_block, clip=[area.x0, area.y0, area.x1, area.y1], @@ -314,14 +330,42 @@ class PDFUtil: highlight_bbox_list.append(text_bbox_area[0]) break else: - highlight_bbox_list.extend(text_bbox_area) + pure_number_match = pure_number_regex.match(highlight_text_inside_block) + if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block: + for text_bbox in text_bbox_area: + # get text by text_bbox + copy_text_bbox = deepcopy(text_bbox) + 
copy_text_bbox.x0 -= 10 + copy_text_bbox.x1 += 10 + text = page.get_text("text", clip=copy_text_bbox).strip() + if text == highlight_text_inside_block: + highlight_bbox_list.append(text_bbox) + else: + # get start and end index of the highlight_text_inside_block in text + start_index = text.find(highlight_text_inside_block) + if start_index > 0: + previous_char = text[start_index - 1] + if previous_char not in [" ", "("]: + continue + end_index = start_index + len(highlight_text_inside_block) + if end_index < len(text): + next_char = text[end_index] + if next_char not in [" ", "%", ")"]: + continue + highlight_bbox_list.append(text_bbox) + else: + highlight_bbox_list.extend(text_bbox_area) + if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2: + highlight_bbox_list = text_bbox_area = page.search_for( + highlight_text_inside_block + ) matching_val_area = highlight_bbox_list else: if only_hightlight_first: matching_val_area = [matching_val_area[0]] if matching_val_area is not None and len(matching_val_area) > 0: - matching_val_area = self.merge_matching_val_area(matching_val_area) + matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines) if exact_match: matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block) # matching_val_area = self.merge_matching_val_area(matching_val_area) @@ -329,6 +373,11 @@ class PDFUtil: highlight = page.add_highlight_annot([area]) bbox_list = [area.x0, area.y0, area.x1, area.y1] content["bbox"] = bbox_list + normalized_bbox = self.get_bbox_normalized(page, [bbox_list]) + if len(normalized_bbox) > 0: + content["normalized_bbox"] = normalized_bbox[0] + else: + content["normalized_bbox"] = [] content_text = json.dumps(content) highlight.set_info(content=content_text, title=title) highlight.update() @@ -358,7 +407,7 @@ class PDFUtil: pass return results - def merge_matching_val_area(self, matching_val_area): + def merge_matching_val_area(self, 
def remove_small_pitches(self, matching_val_area, x_mini_threshold=5):
    """
    Drop the areas that are too narrow to be a meaningful match.

    matching_val_area: areas exposing x0 and x1
    x_mini_threshold: an area is kept only when its width is strictly greater than this
    return: list of the kept areas, in the original order
    """
    return [
        area for area in matching_val_area
        if area.x1 - area.x0 > x_mini_threshold
    ]


def merge_nearby_lines(self, matching_val_area, x_mini_threshold=5, y_threshold=15, x_threshold=10):
    """
    Merge areas that lie close to each other into bigger rectangles.

    matching_val_area: areas exposing x0, y0, x1 and y1
    x_mini_threshold: areas whose width is not greater than this are ignored
    y_threshold: maximum vertical distance for two areas to count as nearby
    x_threshold: maximum horizontal gap for two areas to count as nearby
    return: list of fitz.Rect, ordered by (y0, x0, y1, x1) of their first member;
        empty when no area is wide enough
    """
    # order bbox_list by y0, x0, y1, x1, ignoring the slivers
    bbox_list = sorted(
        (
            (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
            for bbox in matching_val_area
            if bbox.x1 - bbox.x0 > x_mini_threshold
        ),
        key=lambda bbox: (bbox[1], bbox[0], bbox[3], bbox[2]),
    )
    # Nothing left (empty input or only slivers): there is no rectangle to build.
    if not bbox_list:
        return []

    new_matching_val_area = []
    last_x0, last_y0, last_x1, last_y1 = bbox_list[0]
    for x0, y0, x1, y1 in bbox_list[1:]:
        # Overlap of the integer-truncated horizontal extents, without
        # materialising the ranges.
        x_intersected = max(int(last_x0), int(x0)) < min(int(last_x1), int(x1))
        x_nearby = x_intersected or abs(x0 - last_x1) <= x_threshold

        # nearby line in vertical direction: horizontally intersected or close
        below_last = abs(y0 - last_y1) <= y_threshold and x_nearby
        # nearby line in horizontal direction: the last area is the begin of the current one
        continues_last = abs(y0 - last_y0) <= y_threshold and abs(x0 - last_x1) <= x_threshold
        # both areas end on the same horizontal line and are intersected or close
        same_bottom = abs(y1 - last_y1) <= y_threshold and x_nearby

        if below_last or continues_last or same_bottom:
            last_x0 = min(last_x0, x0)
            last_x1 = max(last_x1, x1)
            last_y0 = min(last_y0, y0)
            last_y1 = max(last_y1, y1)
        else:
            new_matching_val_area.append(fitz.Rect(last_x0, last_y0, last_x1, last_y1))
            last_x0, last_y0, last_x1, last_y1 = x0, y0, x1, y1
    new_matching_val_area.append(fitz.Rect(last_x0, last_y0, last_x1, last_y1))
    return new_matching_val_area