support drilldown data to PDF
This commit is contained in:
parent
0349033eaf
commit
81f855f725
main.py | 285
@@ -4,9 +4,15 @@ import pandas as pd
 from glob import glob
 from tqdm import tqdm
 import time
+import fitz
+import re
+from io import BytesIO
+from traceback import print_exc
 from utils.logger import logger
 from utils.pdf_download import download_pdf_from_documents_warehouse
 from utils.sql_query_util import query_document_fund_mapping
+from utils.pdf_util import PDFUtil
+from utils.biz_utils import add_slash_to_text_as_regex
 from core.page_filter import FilterPages
 from core.data_extraction import DataExtraction
 from core.data_mapping import DataMapping
@@ -21,6 +27,7 @@ class EMEA_AR_Parsing:
         output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
         output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
         extract_way: str = "text",
+        drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     ) -> None:
         self.doc_id = doc_id
         self.pdf_folder = pdf_folder
@@ -66,7 +73,12 @@ class EMEA_AR_Parsing:
         self.page_text_dict = self.filter_pages.page_text_dict
         self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
         self.datapoints = self.get_datapoints_from_datapoint_page_info()

+        if drilldown_folder is None or len(drilldown_folder) == 0:
+            drilldown_folder = r"/data/emea_ar/output/drilldown/"
+        os.makedirs(drilldown_folder, exist_ok=True)
+        self.drilldown_folder = drilldown_folder
+
     def download_pdf(self) -> str:
         pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
         return pdf_file
@@ -85,6 +97,7 @@ class EMEA_AR_Parsing:
         self,
         re_run: bool = False,
     ) -> list:
+        found_data = False
         if not re_run:
             output_data_json_folder = os.path.join(
                 self.output_extract_data_folder, "json/"
@@ -97,26 +110,147 @@ class EMEA_AR_Parsing:
             )
             with open(json_file, "r", encoding="utf-8") as f:
                 data_from_gpt = json.load(f)
-                return data_from_gpt
+                found_data = True

-        try:
-            data_extraction = DataExtraction(
-                self.doc_id,
-                self.pdf_file,
-                self.output_extract_data_folder,
-                self.page_text_dict,
-                self.datapoint_page_info,
-                self.datapoints,
-                self.document_mapping_info_df,
-                extract_way=self.extract_way,
-                output_image_folder=self.output_extract_image_folder,
-            )
-            data_from_gpt = data_extraction.extract_data()
-        except Exception as e:
-            logger.error(f"Error: {e}")
-            data_from_gpt = {"data": []}
+        if not found_data:
+            try:
+                data_extraction = DataExtraction(
+                    self.doc_id,
+                    self.pdf_file,
+                    self.output_extract_data_folder,
+                    self.page_text_dict,
+                    self.datapoint_page_info,
+                    self.datapoints,
+                    self.document_mapping_info_df,
+                    extract_way=self.extract_way,
+                    output_image_folder=self.output_extract_image_folder,
+                )
+                data_from_gpt = data_extraction.extract_data()
+            except Exception as e:
+                logger.error(f"Error: {e}")
+                data_from_gpt = {"data": []}
+
+        # Drilldown data to the relevant PDF document
+        self.drilldown_pdf_document(data_from_gpt)
         return data_from_gpt

+    def drilldown_pdf_document(self, data_from_gpt: list) -> list:
+        logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}")
+        pdf_util = PDFUtil(self.pdf_file)
+        pdf_doc = self.get_pdf_doc(self.pdf_file)
+        highlight_annotation = False
+        for data in data_from_gpt:
+            page_index = data.get("page_index", -1)
+            if page_index == -1:
+                continue
+            extract_data_list = data.get("extract_data", {}).get("data", [])
+            dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
+            highlighted_value_list = []
+            for extract_data in extract_data_list:
+                for data_point, value in extract_data.items():
+                    if value in highlighted_value_list:
+                        continue
+                    if data_point in ["ter", "ogc", "performance_fee"]:
+                        continue
+                    drilldown_data = self.highlight_pdf_doc(pdf_doc=pdf_doc,
+                                                            page_index=page_index,
+                                                            highlight_value=value,
+                                                            data_point=data_point,
+                                                            pdf_util=pdf_util)
+                    if len(drilldown_data.get("matching_val_area", [])) > 0:
+                        highlight_annotation = True
+                        highlighted_value_list.append(value)
+
+            for data_point, reported_name in dp_reported_name_dict.items():
+                if reported_name in highlighted_value_list:
+                    continue
+                data_point = f"{data_point}_reported_name"
+                drilldown_data = self.highlight_pdf_doc(pdf_doc=pdf_doc,
+                                                        page_index=page_index,
+                                                        highlight_value=reported_name,
+                                                        data_point=data_point,
+                                                        pdf_util=pdf_util)
+                if len(drilldown_data.get("matching_val_area", [])) > 0:
+                    highlight_annotation = True
+                    highlighted_value_list.append(reported_name)
+        if highlight_annotation:
+            annotated_pdf_file = self.save_annotated_pdf(pdf_doc)
+            return annotated_pdf_file
+
+    def highlight_pdf_doc(self,
+                          pdf_doc: fitz.Document,
+                          page_index: int,
+                          highlight_value: str,
+                          data_point: str = None,
+                          pdf_util: PDFUtil = None):
+        page = pdf_doc[page_index]
+        page_text = page.get_text()
+        highlight_value = str(highlight_value)
+        highlight_value_regex = add_slash_to_text_as_regex(highlight_value)
+        highlight_value_search = re.search(highlight_value_regex, page_text)
+        highlight_value_search_text = None
+        if highlight_value_search is not None:
+            highlight_value_search_text = highlight_value_search.group()
+        drilldown_data = {"DocumentId": self.doc_id,
+                          "page_index": page_index,
+                          "data_point": data_point,
+                          "value": highlight_value,
+                          "matching_val_area": []}
+        if highlight_value_search_text is not None:
+            content = {
+                "data_point": data_point,
+                "data_value": highlight_value
+            }
+            matching_val_area = pdf_util.highlight_matching_data(
+                page=page,
+                text_block=highlight_value_search_text,
+                content=content,
+                title=data_point,
+                only_hightlight_first=False,
+                merge_nearby_lines=False
+            )
+
+            bbox_list = []
+            for area in matching_val_area:
+                bbox = [area.x0, area.y0, area.x1, area.y1]
+                bbox_list.append(bbox)
+            # order bbox_list by y0, x0, y1, x1
+            bbox_list = sorted(bbox_list, key=lambda x: (x[1], x[0], x[3], x[2]))
+            drilldown_data["matching_val_area"] = bbox_list
+        return drilldown_data
+
+    def get_pdf_doc(self, pdf_file):
+        pdf_doc = fitz.open(pdf_file)
+        try:
+            pdf_encrypted = pdf_doc.isEncrypted
+        except AttributeError:
+            pdf_encrypted = pdf_doc.is_encrypted
+        if pdf_encrypted:
+            pdf_doc.authenticate("")
+        return pdf_doc
+
+    def save_annotated_pdf(self, pdf_doc: fitz.Document):
+        try:
+            if pdf_doc is None or pdf_doc.is_closed:
+                return
+            pdf_file_name = os.path.basename(self.pdf_file)
+            pdf_file_name = pdf_file_name.replace(".pdf", "_annotated.pdf")
+            output_pdf_dir = os.path.join(self.drilldown_folder, "pdf/")
+            os.makedirs(output_pdf_dir, exist_ok=True)
+            pdf_file_path = os.path.join(output_pdf_dir, pdf_file_name)
+            output_buffer = BytesIO()
+            pdf_doc.save(output_buffer)
+
+            # Save the output buffer to the output file
+            with open(pdf_file_path, mode="wb") as f:
+                f.write(output_buffer.getbuffer())
+            pdf_doc.close()
+            logger.info(f"File saved to {pdf_file_path}")
+            return pdf_file_path
+        except Exception as e:
+            print_exc()
+            logger.error(f"Error when saving output file: {e}")
+
     def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
         if not re_run:
             output_data_json_folder = os.path.join(
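The drilldown path added above boils down to: regex-locate each extracted value on its source page, let PDFUtil.highlight_matching_data attach highlight annotations whose content field carries JSON metadata, and write an *_annotated.pdf copy. A minimal standalone sketch of that round trip using only the public PyMuPDF API (the file name, searched value, and data-point name are hypothetical placeholders):

import json

import fitz  # PyMuPDF

doc = fitz.open("sample.pdf")  # hypothetical input document
page = doc[0]
# search_for returns a list of fitz.Rect hits for the given text
for rect in page.search_for("Global Equity Fund"):
    annot = page.add_highlight_annot([rect])
    # carry machine-readable drilldown metadata on the annotation itself
    annot.set_info(
        content=json.dumps({"data_point": "fund_name", "bbox": list(rect)}),
        title="fund_name",
    )
    annot.update()
doc.save("sample_annotated.pdf")
doc.close()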
@@ -988,12 +1122,121 @@ if __name__ == "__main__":
         "539794746"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = []
+    special_doc_id_list = ["532500349",
+                           "535324239",
+                           "532442891",
+                           "543243650",
+                           "528588598",
+                           "532437639",
+                           "527525440",
+                           "534987291",
+                           "534112055",
+                           "533482585",
+                           "544208174",
+                           "534547266",
+                           "544713166",
+                           "526463547",
+                           "534535569",
+                           "534106067",
+                           "532486560",
+                           "532781760",
+                           "533727067",
+                           "527256381",
+                           "533392425",
+                           "532179676",
+                           "534300608",
+                           "539233950",
+                           "532438414",
+                           "533681744",
+                           "537654645",
+                           "533594905",
+                           "537926443",
+                           "533499655",
+                           "533862814",
+                           "544918611",
+                           "539087870",
+                           "536343790",
+                           "479742284",
+                           "501380497",
+                           "501380553",
+                           "501380775",
+                           "501380801",
+                           "501600428",
+                           "501600429",
+                           "501600541",
+                           "501600549",
+                           "503659548",
+                           "506326520",
+                           "507720522",
+                           "507928179",
+                           "508981020",
+                           "509133771",
+                           "509743502",
+                           "514636951",
+                           "514636952",
+                           "514636953",
+                           "514636954",
+                           "514636955",
+                           "514636957",
+                           "514636958",
+                           "514636959",
+                           "514636985",
+                           "514636988",
+                           "514636990",
+                           "514636993",
+                           "514636994",
+                           "539794746",
+                           "292989214",
+                           "316237292",
+                           "321733631",
+                           "323390570",
+                           "327956364",
+                           "332223498",
+                           "333207452",
+                           "334718372",
+                           "344636875",
+                           "362246081",
+                           "366179419",
+                           "380945052",
+                           "382366116",
+                           "387202452",
+                           "389171486",
+                           "391456740",
+                           "391736837",
+                           "394778487",
+                           "401684600",
+                           "402113224",
+                           "402181770",
+                           "402397014",
+                           "405803396",
+                           "445102363",
+                           "445256897",
+                           "448265376",
+                           "449555622",
+                           "449623976",
+                           "458291624",
+                           "458359181",
+                           "463081566",
+                           "469138353",
+                           "471641628",
+                           "476492237",
+                           "478585901",
+                           "478586066",
+                           "479042264",
+                           "479042269",
+                           "479793787",
+                           "481475385",
+                           "483617247",
+                           "486378555",
+                           "486383912",
+                           "492121213",
+                           "497497599",
+                           "502693599"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = False
     re_run_mapping_data = False
-    force_save_total_data = True
+    force_save_total_data = False
     calculate_metrics = False

     extract_ways = ["text"]
utils/biz_utils.py
@@ -65,8 +65,9 @@ def add_slash_to_text_as_regex(text: str):
             continue
         replace = r"\{0}".format(special_iter.group())
         if replace not in text:
-            text = re.sub(replace, replace, text)
-    text = re.sub(r"\s+", r"\\s+", text)
+            text = re.sub(replace, r"\\W", text)
+    text = re.sub(r"( ){2,}", " ", text)
+    text = text.replace(" ", r"\s*")
     return text
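The change above fixes what was previously a no-op (re.sub(replace, replace, text) substituted each escaped special character with itself) and loosens the generated pattern: escaped punctuation now matches any non-word character (\W) and every space matches optional whitespace (\s*), which tolerates the spacing and punctuation drift common in PDF text layers. A simplified, self-contained approximation of the resulting behavior (the helper name is illustrative, not the committed implementation):

import re

def tolerant_pattern(text: str) -> str:
    # illustrative re-derivation: alphanumerics stay literal, punctuation
    # matches any non-word character, spacing becomes optional whitespace
    pattern = "".join(ch if ch.isalnum() or ch == " " else r"\W" for ch in text)
    pattern = re.sub(r"( ){2,}", " ", pattern)  # collapse runs of spaces
    return pattern.replace(" ", r"\s*")

print(bool(re.search(tolerant_pattern("Fund A (EUR)"), "Fund  A  [EUR]")))  # True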
utils/pdf_util.py
@@ -9,6 +9,7 @@ import json
 from traceback import print_exc
 from tqdm import tqdm
 import base64
+from copy import deepcopy
 from utils.similarity import Similarity

 from utils.logger import logger
@@ -276,9 +277,20 @@ class PDFUtil:
         title: str = "",
         only_hightlight_first: bool = False,
         exact_match: bool = False,
+        merge_nearby_lines: bool = False,
     ):
         """
         Highlight matching values
+        page: page object in a fitz.Document
+        within_bbox: bounding box to search for the text
+        text_block: text to search for in the page text
+        highlight_text_inside_block: text to highlight inside text_block
+        content: content in JSON format to add to the highlight annotation;
+                 customize according to the relevant business logic
+        title: title of the highlight annotation
+        only_hightlight_first: only highlight the first match
+        exact_match: whether to require an exact match
+        merge_nearby_lines: whether to merge nearby lines
+        """
         # logger.info(f"Highlighting matching values in {self.pdf_file}")
         if within_bbox is not None:
@@ -295,6 +307,8 @@ class PDFUtil:
             matching_val_area = page.search_for(text_block)
         else:
             matching_val_area = page.search_for(text_block)
+            if len(matching_val_area) == 0:
+                matching_val_area = page.search_for(text_block.strip())
             if len(matching_val_area) == 0:
                 matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
             if len(matching_val_area) == 0:
@@ -304,7 +318,9 @@ class PDFUtil:
             and len(highlight_text_inside_block) > 0
         ):
             highlight_bbox_list = []
-            for area in matching_val_area:
+            merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
+            pure_number_regex = re.compile(r"^\d+$")
+            for area in merged_matching_val_area:
                 text_bbox_area = page.search_for(
                     highlight_text_inside_block,
                     clip=[area.x0, area.y0, area.x1, area.y1],
@@ -314,14 +330,42 @@ class PDFUtil:
                     highlight_bbox_list.append(text_bbox_area[0])
                     break
                 else:
-                    highlight_bbox_list.extend(text_bbox_area)
+                    pure_number_match = pure_number_regex.match(highlight_text_inside_block)
+                    if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
+                        for text_bbox in text_bbox_area:
+                            # get the text around text_bbox
+                            copy_text_bbox = deepcopy(text_bbox)
+                            copy_text_bbox.x0 -= 10
+                            copy_text_bbox.x1 += 10
+                            text = page.get_text("text", clip=copy_text_bbox).strip()
+                            if text == highlight_text_inside_block:
+                                highlight_bbox_list.append(text_bbox)
+                            else:
+                                # get the start and end index of highlight_text_inside_block in text
+                                start_index = text.find(highlight_text_inside_block)
+                                if start_index > 0:
+                                    previous_char = text[start_index - 1]
+                                    if previous_char not in [" ", "("]:
+                                        continue
+                                end_index = start_index + len(highlight_text_inside_block)
+                                if end_index < len(text):
+                                    next_char = text[end_index]
+                                    if next_char not in [" ", "%", ")"]:
+                                        continue
+                                highlight_bbox_list.append(text_bbox)
+                    else:
+                        highlight_bbox_list.extend(text_bbox_area)
+            if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
+                highlight_bbox_list = text_bbox_area = page.search_for(
+                    highlight_text_inside_block
+                )
             matching_val_area = highlight_bbox_list
         else:
             if only_hightlight_first:
                 matching_val_area = [matching_val_area[0]]

         if matching_val_area is not None and len(matching_val_area) > 0:
-            matching_val_area = self.merge_matching_val_area(matching_val_area)
+            matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
             if exact_match:
                 matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)
                 # matching_val_area = self.merge_matching_val_area(matching_val_area)
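The pure-number branch above guards against substring hits: when the highlight target is all digits, each candidate box is widened by 10 points on both sides, the surrounding text is re-read, and the hit is kept only if the neighboring characters are delimiters. The same boundary test on a plain string, as a standalone illustration (the function is hypothetical, mirroring the commit's allowed delimiter sets):

def is_standalone_number(text: str, value: str) -> bool:
    # accept a hit only when bounded by the delimiters the commit allows,
    # so a search for "123" cannot land inside "5,123" or "5123"
    start = text.find(value)
    if start == -1:
        return False
    if start > 0 and text[start - 1] not in (" ", "("):
        return False
    end = start + len(value)
    if end < len(text) and text[end] not in (" ", "%", ")"):
        return False
    return True

assert is_standalone_number("fee (123) applies", "123")
assert not is_standalone_number("code 5123", "123")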
@@ -329,6 +373,11 @@ class PDFUtil:
             highlight = page.add_highlight_annot([area])
             bbox_list = [area.x0, area.y0, area.x1, area.y1]
             content["bbox"] = bbox_list
+            normalized_bbox = self.get_bbox_normalized(page, [bbox_list])
+            if len(normalized_bbox) > 0:
+                content["normalized_bbox"] = normalized_bbox[0]
+            else:
+                content["normalized_bbox"] = []
             content_text = json.dumps(content)
             highlight.set_info(content=content_text, title=title)
             highlight.update()
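get_bbox_normalized itself is not part of this diff; a plausible sketch, assuming it scales absolute PDF coordinates into [0, 1] by the page dimensions so a viewer can re-project the drilldown bbox at any render size:

import fitz

def normalize_bbox(page: fitz.Page, bbox: list) -> list:
    # assumption: divide absolute points by page width/height
    w, h = page.rect.width, page.rect.height
    x0, y0, x1, y1 = bbox
    return [x0 / w, y0 / h, x1 / w, y1 / h]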
@@ -358,7 +407,7 @@ class PDFUtil:
                 pass
         return results

-    def merge_matching_val_area(self, matching_val_area):
+    def merge_matching_val_area(self, matching_val_area, merge_nearby_lines=False):
         """
         Merge the matching val areas which with same y0 and y1,
         the x0 is the min x0, x1 is the max x1
@@ -401,6 +450,91 @@ class PDFUtil:
             min_x0 = min(x0_list)
             max_x1 = max(x1_list)
             new_matching_val_area.append(fitz.Rect(min_x0, y0, max_x1, y1))
+        if merge_nearby_lines and len(new_matching_val_area) > 1:
+            new_matching_val_area = self.merge_nearby_lines(new_matching_val_area)
+            # merge again
+            if len(new_matching_val_area) > 1:
+                new_matching_val_area = self.merge_nearby_lines(new_matching_val_area)
+        elif len(new_matching_val_area) > 1:
+            new_matching_val_area = self.remove_small_pitches(new_matching_val_area)
+        else:
+            pass
+        return new_matching_val_area
+
+    def remove_small_pitches(self, matching_val_area):
+        x_mini_threshold = 5
+        new_matching_val_area = []
+        for area in matching_val_area:
+            if area.x1 - area.x0 > x_mini_threshold:
+                new_matching_val_area.append(area)
+        return new_matching_val_area
+
+    def merge_nearby_lines(self, matching_val_area):
+        bbox_list = []
+        for bbox in matching_val_area:
+            bbox = [bbox.x0, bbox.y0, bbox.x1, bbox.y1]
+            bbox_list.append(bbox)
+        # order bbox_list by y0, x0, y1, x1
+        bbox_list = sorted(bbox_list, key=lambda x: (x[1], x[0], x[3], x[2]))
+        new_matching_val_area = []
+
+        last_x0 = None
+        last_x1 = None
+        last_y0 = None
+        last_y1 = None
+        x_mini_threshold = 5
+        y_threshold = 15
+        x_threshold = 10
+        for index, bbox in enumerate(bbox_list):
+            if bbox[2] - bbox[0] <= x_mini_threshold:
+                continue
+
+            if index == 0 or last_x0 is None:
+                last_x0 = bbox[0]
+                last_y0 = bbox[1]
+                last_x1 = bbox[2]
+                last_y1 = bbox[3]
+                continue
+
+            x0 = bbox[0]
+            y0 = bbox[1]
+            x1 = bbox[2]
+            y1 = bbox[3]
+
+            last_x0_x1_range = [i for i in range(int(last_x0), int(last_x1))]
+            x0_x1_range = [i for i in range(int(x0), int(x1))]
+            x_intersection = list(set(last_x0_x1_range).intersection(set(x0_x1_range)))
+
+            # abs(y0 - last_y1) <= y_threshold and (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold):
+            # a nearby line exists in the vertical direction and the horizontal
+            # coordinates intersect or are close to each other
+
+            # abs(y0 - last_y0) <= y_threshold and abs(x0 - last_x1) <= x_threshold:
+            # a nearby line exists in the horizontal direction and the last
+            # sentence is the beginning of the current sentence
+
+            # abs(y1 - last_y1) <= y_threshold and (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold):
+            # the last sentence and the current sentence are on the same horizontal
+            # line and their horizontal coordinates intersect or are close to each other
+
+            if (abs(y0 - last_y1) <= y_threshold and
+                    (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold)) or \
+                    (abs(y0 - last_y0) <= y_threshold and abs(x0 - last_x1) <= x_threshold) or \
+                    (abs(y1 - last_y1) <= y_threshold and
+                     (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold)):
+                last_x0 = min(last_x0, x0)
+                last_x1 = max(last_x1, x1)
+                last_y0 = min(last_y0, y0)
+                last_y1 = max(last_y1, y1)
+            else:
+                new_matching_val_area.append(fitz.Rect(last_x0, last_y0, last_x1, last_y1))
+                last_x0 = x0
+                last_x1 = x1
+                last_y0 = y0
+                last_y1 = y1
+        new_matching_val_area.append(fitz.Rect(last_x0, last_y0, last_x1, last_y1))
         return new_matching_val_area

     def highlight_matching_paragraph_text(
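The sweep in merge_nearby_lines sorts the boxes top-to-bottom, then grows a running rectangle while the next box sits within y_threshold vertically and overlaps (or sits within x_threshold) horizontally; anything else starts a new rectangle. A toy run of the same idea (thresholds taken from the commit, sample boxes invented):

import fitz

boxes = [fitz.Rect(10, 100, 60, 112),    # two lines of one wrapped value...
         fitz.Rect(12, 113, 58, 125),
         fitz.Rect(300, 500, 360, 512)]  # ...and one unrelated hit
merged, cur = [], boxes[0]
for r in boxes[1:]:
    if abs(r.y0 - cur.y1) <= 15 and (r.x0 <= cur.x1 + 10 and r.x1 >= cur.x0 - 10):
        cur |= r  # rectangle union: smallest rect containing both
    else:
        merged.append(cur)
        cur = r
merged.append(cur)
print(merged)  # [Rect(10.0, 100.0, 60.0, 125.0), Rect(300.0, 500.0, 360.0, 512.0)]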