support drilldown data to PDF
This commit is contained in:
parent
0349033eaf
commit
81f855f725
main.py | 285
@@ -4,9 +4,15 @@ import pandas as pd
 from glob import glob
 from tqdm import tqdm
 import time
+import fitz
+import re
+from io import BytesIO
+from traceback import print_exc
 from utils.logger import logger
 from utils.pdf_download import download_pdf_from_documents_warehouse
 from utils.sql_query_util import query_document_fund_mapping
+from utils.pdf_util import PDFUtil
+from utils.biz_utils import add_slash_to_text_as_regex
 from core.page_filter import FilterPages
 from core.data_extraction import DataExtraction
 from core.data_mapping import DataMapping
@@ -21,6 +27,7 @@ class EMEA_AR_Parsing:
         output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
         output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
         extract_way: str = "text",
+        drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     ) -> None:
         self.doc_id = doc_id
         self.pdf_folder = pdf_folder
@@ -66,7 +73,12 @@ class EMEA_AR_Parsing:
         self.page_text_dict = self.filter_pages.page_text_dict
         self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
         self.datapoints = self.get_datapoints_from_datapoint_page_info()

+        if drilldown_folder is None or len(drilldown_folder) == 0:
+            drilldown_folder = r"/data/emea_ar/output/drilldown/"
+        os.makedirs(drilldown_folder, exist_ok=True)
+        self.drilldown_folder = drilldown_folder
+
     def download_pdf(self) -> str:
         pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
         return pdf_file
@@ -85,6 +97,7 @@ class EMEA_AR_Parsing:
         self,
         re_run: bool = False,
     ) -> list:
+        found_data = False
         if not re_run:
             output_data_json_folder = os.path.join(
                 self.output_extract_data_folder, "json/"
@@ -97,26 +110,147 @@ class EMEA_AR_Parsing:
             )
             with open(json_file, "r", encoding="utf-8") as f:
                 data_from_gpt = json.load(f)
-                return data_from_gpt
+                found_data = True

-        try:
-            data_extraction = DataExtraction(
-                self.doc_id,
-                self.pdf_file,
-                self.output_extract_data_folder,
-                self.page_text_dict,
-                self.datapoint_page_info,
-                self.datapoints,
-                self.document_mapping_info_df,
-                extract_way=self.extract_way,
-                output_image_folder=self.output_extract_image_folder,
-            )
-            data_from_gpt = data_extraction.extract_data()
-        except Exception as e:
-            logger.error(f"Error: {e}")
-            data_from_gpt = {"data": []}
+        if not found_data:
+            try:
+                data_extraction = DataExtraction(
+                    self.doc_id,
+                    self.pdf_file,
+                    self.output_extract_data_folder,
+                    self.page_text_dict,
+                    self.datapoint_page_info,
+                    self.datapoints,
+                    self.document_mapping_info_df,
+                    extract_way=self.extract_way,
+                    output_image_folder=self.output_extract_image_folder,
+                )
+                data_from_gpt = data_extraction.extract_data()
+            except Exception as e:
+                logger.error(f"Error: {e}")
+                data_from_gpt = {"data": []}
+
+        # Drilldown data to the relevant PDF document
+        self.drilldown_pdf_document(data_from_gpt)
         return data_from_gpt

+    def drilldown_pdf_document(self, data_from_gpt: list) -> list:
+        logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}")
+        pdf_util = PDFUtil(self.pdf_file)
+        pdf_doc = self.get_pdf_doc(self.pdf_file)
+        highlight_annotation = False
+        for data in data_from_gpt:
+            page_index = data.get("page_index", -1)
+            if page_index == -1:
+                continue
+            extract_data_list = data.get("extract_data", {}).get("data", [])
+            dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
+            highlighted_value_list = []
+            for extract_data in extract_data_list:
+                for data_point, value in extract_data.items():
+                    if value in highlighted_value_list:
+                        continue
+                    if data_point in ["ter", "ogc", "performance_fee"]:
+                        continue
+                    drilldown_data = self.highlight_pdf_doc(pdf_doc=pdf_doc,
+                                                            page_index=page_index,
+                                                            highlight_value=value,
+                                                            data_point=data_point,
+                                                            pdf_util=pdf_util)
+                    if len(drilldown_data.get("matching_val_area", [])) > 0:
+                        highlight_annotation = True
+                        highlighted_value_list.append(value)
+
+            for data_point, reported_name in dp_reported_name_dict.items():
+                if reported_name in highlighted_value_list:
+                    continue
+                data_point = f"{data_point}_reported_name"
+                drilldown_data = self.highlight_pdf_doc(pdf_doc=pdf_doc,
+                                                        page_index=page_index,
+                                                        highlight_value=reported_name,
+                                                        data_point=data_point,
+                                                        pdf_util=pdf_util)
+                if len(drilldown_data.get("matching_val_area", [])) > 0:
+                    highlight_annotation = True
+                    highlighted_value_list.append(reported_name)
+        if highlight_annotation:
+            annotated_pdf_file = self.save_annotated_pdf(pdf_doc)
+            return annotated_pdf_file
+
+    def highlight_pdf_doc(self,
+                          pdf_doc: fitz.Document,
+                          page_index: int,
+                          highlight_value: str,
+                          data_point: str = None,
+                          pdf_util: PDFUtil = None):
+        page = pdf_doc[page_index]
+        page_text = page.get_text()
+        highlight_value = str(highlight_value)
+        highlight_value_regex = add_slash_to_text_as_regex(highlight_value)
+        highlight_value_search = re.search(highlight_value_regex, page_text)
+        highlight_value_search_text = None
+        if highlight_value_search is not None:
+            highlight_value_search_text = highlight_value_search.group()
+        drilldown_data = {"DocumentId": self.doc_id,
+                          "page_index": page_index,
+                          "data_point": data_point,
+                          "value": highlight_value,
+                          "matching_val_area": []}
+        if highlight_value_search_text is not None:
+            content = {
+                "data_point": data_point,
+                "data_value": highlight_value
+            }
+            matching_val_area = pdf_util.highlight_matching_data(
+                page=page,
+                text_block=highlight_value_search_text,
+                content=content,
+                title=data_point,
+                only_hightlight_first=False,
+                merge_nearby_lines=False
+            )
+
+            bbox_list = []
+            for area in matching_val_area:
+                bbox = [area.x0, area.y0, area.x1, area.y1]
+                bbox_list.append(bbox)
+            # order bbox_list by y0, x0, y1, x1
+            bbox_list = sorted(bbox_list, key=lambda x: (x[1], x[0], x[3], x[2]))
+            drilldown_data["matching_val_area"] = bbox_list
+        return drilldown_data
+
+    def get_pdf_doc(self, pdf_file):
+        pdf_doc = fitz.open(pdf_file)
+        try:
+            pdf_encrypted = pdf_doc.isEncrypted
+        except AttributeError:
+            pdf_encrypted = pdf_doc.is_encrypted
+        if pdf_encrypted:
+            pdf_doc.authenticate("")
+        return pdf_doc
+
+    def save_annotated_pdf(self, pdf_doc: fitz.Document):
+        try:
+            if pdf_doc is None or pdf_doc.is_closed:
+                return
+            pdf_file_name = os.path.basename(self.pdf_file)
+            pdf_file_name = pdf_file_name.replace(".pdf", "_annotated.pdf")
+            output_pdf_dir = os.path.join(self.drilldown_folder, "pdf/")
+            os.makedirs(output_pdf_dir, exist_ok=True)
+            pdf_file_path = os.path.join(output_pdf_dir, pdf_file_name)
+            output_buffer = BytesIO()
+            pdf_doc.save(output_buffer)
+
+            # Save the output buffer to the output file
+            with open(pdf_file_path, mode="wb") as f:
+                f.write(output_buffer.getbuffer())
+            pdf_doc.close()
+            logger.info(f"File saved to {pdf_file_path}")
+            return pdf_file_path
+        except Exception as e:
+            print_exc()
+            logger.error(f"Error when saving output file: {e}")
+
     def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
         if not re_run:
             output_data_json_folder = os.path.join(
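The drilldown path added above boils down to: regex-locate each extracted value on its source page, let PDFUtil.highlight_matching_data attach highlight annotations whose content field carries JSON metadata, and write an *_annotated.pdf copy. A minimal standalone sketch of that round trip using only the public PyMuPDF API (the file name, searched value, and data-point name are hypothetical placeholders):

import json

import fitz  # PyMuPDF

doc = fitz.open("sample.pdf")  # hypothetical input document
page = doc[0]
# search_for returns a list of fitz.Rect hits for the given text
for rect in page.search_for("Global Equity Fund"):
    annot = page.add_highlight_annot([rect])
    # carry machine-readable drilldown metadata on the annotation itself
    annot.set_info(
        content=json.dumps({"data_point": "fund_name", "bbox": list(rect)}),
        title="fund_name",
    )
    annot.update()
doc.save("sample_annotated.pdf")
doc.close()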
@@ -988,12 +1122,121 @@ if __name__ == "__main__":
         "539794746"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = []
+    special_doc_id_list = ["532500349",
+                           "535324239",
+                           "532442891",
+                           "543243650",
+                           "528588598",
+                           "532437639",
+                           "527525440",
+                           "534987291",
+                           "534112055",
+                           "533482585",
+                           "544208174",
+                           "534547266",
+                           "544713166",
+                           "526463547",
+                           "534535569",
+                           "534106067",
+                           "532486560",
+                           "532781760",
+                           "533727067",
+                           "527256381",
+                           "533392425",
+                           "532179676",
+                           "534300608",
+                           "539233950",
+                           "532438414",
+                           "533681744",
+                           "537654645",
+                           "533594905",
+                           "537926443",
+                           "533499655",
+                           "533862814",
+                           "544918611",
+                           "539087870",
+                           "536343790",
+                           "479742284",
+                           "501380497",
+                           "501380553",
+                           "501380775",
+                           "501380801",
+                           "501600428",
+                           "501600429",
+                           "501600541",
+                           "501600549",
+                           "503659548",
+                           "506326520",
+                           "507720522",
+                           "507928179",
+                           "508981020",
+                           "509133771",
+                           "509743502",
+                           "514636951",
+                           "514636952",
+                           "514636953",
+                           "514636954",
+                           "514636955",
+                           "514636957",
+                           "514636958",
+                           "514636959",
+                           "514636985",
+                           "514636988",
+                           "514636990",
+                           "514636993",
+                           "514636994",
+                           "539794746",
+                           "292989214",
+                           "316237292",
+                           "321733631",
+                           "323390570",
+                           "327956364",
+                           "332223498",
+                           "333207452",
+                           "334718372",
+                           "344636875",
+                           "362246081",
+                           "366179419",
+                           "380945052",
+                           "382366116",
+                           "387202452",
+                           "389171486",
+                           "391456740",
+                           "391736837",
+                           "394778487",
+                           "401684600",
+                           "402113224",
+                           "402181770",
+                           "402397014",
+                           "405803396",
+                           "445102363",
+                           "445256897",
+                           "448265376",
+                           "449555622",
+                           "449623976",
+                           "458291624",
+                           "458359181",
+                           "463081566",
+                           "469138353",
+                           "471641628",
+                           "476492237",
+                           "478585901",
+                           "478586066",
+                           "479042264",
+                           "479042269",
+                           "479793787",
+                           "481475385",
+                           "483617247",
+                           "486378555",
+                           "486383912",
+                           "492121213",
+                           "497497599",
+                           "502693599"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = False
     re_run_mapping_data = False
-    force_save_total_data = True
+    force_save_total_data = False
     calculate_metrics = False

     extract_ways = ["text"]
utils/biz_utils.py
@@ -65,8 +65,9 @@ def add_slash_to_text_as_regex(text: str):
             continue
         replace = r"\{0}".format(special_iter.group())
         if replace not in text:
-            text = re.sub(replace, replace, text)
-    text = re.sub(r"\s+", r"\\s+", text)
+            text = re.sub(replace, r"\\W", text)
+    text = re.sub(r"( ){2,}", " ", text)
+    text = text.replace(" ", r"\s*")
     return text
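The change above fixes what was previously a no-op (re.sub(replace, replace, text) substituted each escaped special character with itself) and loosens the generated pattern: escaped punctuation now matches any non-word character (\W) and every space matches optional whitespace (\s*), which tolerates the spacing and punctuation drift common in PDF text layers. A simplified, self-contained approximation of the resulting behavior (the helper name is illustrative, not the committed implementation):

import re

def tolerant_pattern(text: str) -> str:
    # illustrative re-derivation: alphanumerics stay literal, punctuation
    # matches any non-word character, spacing becomes optional whitespace
    pattern = "".join(ch if ch.isalnum() or ch == " " else r"\W" for ch in text)
    pattern = re.sub(r"( ){2,}", " ", pattern)  # collapse runs of spaces
    return pattern.replace(" ", r"\s*")

print(bool(re.search(tolerant_pattern("Fund A (EUR)"), "Fund  A  [EUR]")))  # True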
utils/pdf_util.py
@@ -9,6 +9,7 @@ import json
 from traceback import print_exc
 from tqdm import tqdm
 import base64
+from copy import deepcopy
 from utils.similarity import Similarity

 from utils.logger import logger
@@ -276,9 +277,20 @@ class PDFUtil:
         title: str = "",
         only_hightlight_first: bool = False,
         exact_match: bool = False,
+        merge_nearby_lines: bool = False,
     ):
         """
         Highlight matching values
+        page: page object in a fitz.Document
+        within_bbox: bounding box to search for the text
+        text_block: text to search for in the page text
+        highlight_text_inside_block: text to highlight inside text_block
+        content: content in JSON format to add to the highlight annotation;
+                 customize according to the relevant business logic
+        title: title of the highlight annotation
+        only_hightlight_first: only highlight the first match
+        exact_match: whether to require an exact match
+        merge_nearby_lines: whether to merge nearby lines
+        """
         # logger.info(f"Highlighting matching values in {self.pdf_file}")
         if within_bbox is not None:
@@ -295,6 +307,8 @@ class PDFUtil:
             matching_val_area = page.search_for(text_block)
         else:
             matching_val_area = page.search_for(text_block)
+            if len(matching_val_area) == 0:
+                matching_val_area = page.search_for(text_block.strip())
             if len(matching_val_area) == 0:
                 matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
             if len(matching_val_area) == 0:
@@ -304,7 +318,9 @@ class PDFUtil:
             and len(highlight_text_inside_block) > 0
         ):
             highlight_bbox_list = []
-            for area in matching_val_area:
+            merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
+            pure_number_regex = re.compile(r"^\d+$")
+            for area in merged_matching_val_area:
                 text_bbox_area = page.search_for(
                     highlight_text_inside_block,
                     clip=[area.x0, area.y0, area.x1, area.y1],
@@ -314,14 +330,42 @@ class PDFUtil:
                     highlight_bbox_list.append(text_bbox_area[0])
                     break
                 else:
-                    highlight_bbox_list.extend(text_bbox_area)
+                    pure_number_match = pure_number_regex.match(highlight_text_inside_block)
+                    if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
+                        for text_bbox in text_bbox_area:
+                            # get the text around text_bbox
+                            copy_text_bbox = deepcopy(text_bbox)
+                            copy_text_bbox.x0 -= 10
+                            copy_text_bbox.x1 += 10
+                            text = page.get_text("text", clip=copy_text_bbox).strip()
+                            if text == highlight_text_inside_block:
+                                highlight_bbox_list.append(text_bbox)
+                            else:
+                                # get the start and end index of highlight_text_inside_block in text
+                                start_index = text.find(highlight_text_inside_block)
+                                if start_index > 0:
+                                    previous_char = text[start_index - 1]
+                                    if previous_char not in [" ", "("]:
+                                        continue
+                                end_index = start_index + len(highlight_text_inside_block)
+                                if end_index < len(text):
+                                    next_char = text[end_index]
+                                    if next_char not in [" ", "%", ")"]:
+                                        continue
+                                highlight_bbox_list.append(text_bbox)
+                    else:
+                        highlight_bbox_list.extend(text_bbox_area)
+            if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
+                highlight_bbox_list = text_bbox_area = page.search_for(
+                    highlight_text_inside_block
+                )
             matching_val_area = highlight_bbox_list
         else:
             if only_hightlight_first:
                 matching_val_area = [matching_val_area[0]]

         if matching_val_area is not None and len(matching_val_area) > 0:
-            matching_val_area = self.merge_matching_val_area(matching_val_area)
+            matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
             if exact_match:
                 matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)
                 # matching_val_area = self.merge_matching_val_area(matching_val_area)
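The pure-number branch above guards against substring hits: when the highlight target is all digits, each candidate box is widened by 10 points on both sides, the surrounding text is re-read, and the hit is kept only if the neighboring characters are delimiters. The same boundary test on a plain string, as a standalone illustration (the function is hypothetical, mirroring the commit's allowed delimiter sets):

def is_standalone_number(text: str, value: str) -> bool:
    # accept a hit only when bounded by the delimiters the commit allows,
    # so a search for "123" cannot land inside "5,123" or "5123"
    start = text.find(value)
    if start == -1:
        return False
    if start > 0 and text[start - 1] not in (" ", "("):
        return False
    end = start + len(value)
    if end < len(text) and text[end] not in (" ", "%", ")"):
        return False
    return True

assert is_standalone_number("fee (123) applies", "123")
assert not is_standalone_number("code 5123", "123")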
@@ -329,6 +373,11 @@ class PDFUtil:
             highlight = page.add_highlight_annot([area])
             bbox_list = [area.x0, area.y0, area.x1, area.y1]
             content["bbox"] = bbox_list
+            normalized_bbox = self.get_bbox_normalized(page, [bbox_list])
+            if len(normalized_bbox) > 0:
+                content["normalized_bbox"] = normalized_bbox[0]
+            else:
+                content["normalized_bbox"] = []
             content_text = json.dumps(content)
             highlight.set_info(content=content_text, title=title)
             highlight.update()
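get_bbox_normalized itself is not part of this diff; a plausible sketch, assuming it scales absolute PDF coordinates into [0, 1] by the page dimensions so a viewer can re-project the drilldown bbox at any render size:

import fitz

def normalize_bbox(page: fitz.Page, bbox: list) -> list:
    # assumption: divide absolute points by page width/height
    w, h = page.rect.width, page.rect.height
    x0, y0, x1, y1 = bbox
    return [x0 / w, y0 / h, x1 / w, y1 / h]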
@@ -358,7 +407,7 @@ class PDFUtil:
                 pass
         return results

-    def merge_matching_val_area(self, matching_val_area):
+    def merge_matching_val_area(self, matching_val_area, merge_nearby_lines=False):
         """
         Merge the matching val areas which with same y0 and y1,
         the x0 is the min x0, x1 is the max x1
@@ -401,6 +450,91 @@ class PDFUtil:
             min_x0 = min(x0_list)
             max_x1 = max(x1_list)
             new_matching_val_area.append(fitz.Rect(min_x0, y0, max_x1, y1))
+        if merge_nearby_lines and len(new_matching_val_area) > 1:
+            new_matching_val_area = self.merge_nearby_lines(new_matching_val_area)
+            # merge again
+            if len(new_matching_val_area) > 1:
+                new_matching_val_area = self.merge_nearby_lines(new_matching_val_area)
+        elif len(new_matching_val_area) > 1:
+            new_matching_val_area = self.remove_small_pitches(new_matching_val_area)
+        else:
+            pass
+        return new_matching_val_area
+
+    def remove_small_pitches(self, matching_val_area):
+        x_mini_threshold = 5
+        new_matching_val_area = []
+        for area in matching_val_area:
+            if area.x1 - area.x0 > x_mini_threshold:
+                new_matching_val_area.append(area)
+        return new_matching_val_area
+
+    def merge_nearby_lines(self, matching_val_area):
+        bbox_list = []
+        for bbox in matching_val_area:
+            bbox = [bbox.x0, bbox.y0, bbox.x1, bbox.y1]
+            bbox_list.append(bbox)
+        # order bbox_list by y0, x0, y1, x1
+        bbox_list = sorted(bbox_list, key=lambda x: (x[1], x[0], x[3], x[2]))
+        new_matching_val_area = []
+
+        last_x0 = None
+        last_x1 = None
+        last_y0 = None
+        last_y1 = None
+        x_mini_threshold = 5
+        y_threshold = 15
+        x_threshold = 10
+        for index, bbox in enumerate(bbox_list):
+            if bbox[2] - bbox[0] <= x_mini_threshold:
+                continue
+
+            if index == 0 or last_x0 is None:
+                last_x0 = bbox[0]
+                last_y0 = bbox[1]
+                last_x1 = bbox[2]
+                last_y1 = bbox[3]
+                continue
+
+            x0 = bbox[0]
+            y0 = bbox[1]
+            x1 = bbox[2]
+            y1 = bbox[3]
+
+            last_x0_x1_range = [i for i in range(int(last_x0), int(last_x1))]
+            x0_x1_range = [i for i in range(int(x0), int(x1))]
+            x_intersection = list(set(last_x0_x1_range).intersection(set(x0_x1_range)))
+
+            # abs(y0 - last_y1) <= y_threshold and (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold):
+            # a nearby line exists in the vertical direction and the horizontal
+            # coordinates intersect or are close to each other
+
+            # abs(y0 - last_y0) <= y_threshold and abs(x0 - last_x1) <= x_threshold:
+            # a nearby line exists in the horizontal direction and the last
+            # sentence is the beginning of the current sentence
+
+            # abs(y1 - last_y1) <= y_threshold and (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold):
+            # the last sentence and the current sentence are on the same horizontal
+            # line and their horizontal coordinates intersect or are close to each other
+
+            if (abs(y0 - last_y1) <= y_threshold and
+                    (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold)) or \
+                    (abs(y0 - last_y0) <= y_threshold and abs(x0 - last_x1) <= x_threshold) or \
+                    (abs(y1 - last_y1) <= y_threshold and
+                     (len(x_intersection) > 0 or abs(x0 - last_x1) <= x_threshold)):
+                last_x0 = min(last_x0, x0)
+                last_x1 = max(last_x1, x1)
+                last_y0 = min(last_y0, y0)
+                last_y1 = max(last_y1, y1)
+            else:
+                new_matching_val_area.append(fitz.Rect(last_x0, last_y0, last_x1, last_y1))
+                last_x0 = x0
+                last_x1 = x1
+                last_y0 = y0
+                last_y1 = y1
+        new_matching_val_area.append(fitz.Rect(last_x0, last_y0, last_x1, last_y1))
         return new_matching_val_area

     def highlight_matching_paragraph_text(
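The sweep in merge_nearby_lines sorts the boxes top-to-bottom, then grows a running rectangle while the next box sits within y_threshold vertically and overlaps (or sits within x_threshold) horizontally; anything else starts a new rectangle. A toy run of the same idea (thresholds taken from the commit, sample boxes invented):

import fitz

boxes = [fitz.Rect(10, 100, 60, 112),    # two lines of one wrapped value...
         fitz.Rect(12, 113, 58, 125),
         fitz.Rect(300, 500, 360, 512)]  # ...and one unrelated hit
merged, cur = [], boxes[0]
for r in boxes[1:]:
    if abs(r.y0 - cur.y1) <= 15 and (r.x0 <= cur.x1 + 10 and r.x1 >= cur.x0 - 10):
        cur |= r  # rectangle union: smallest rect containing both
    else:
        merged.append(cur)
        cur = r
merged.append(cur)
print(merged)  # [Rect(10.0, 100.0, 60.0, 125.0), Rect(300.0, 500.0, 360.0, 512.0)]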