import os
import json
import base64

import json_repair

from utils.pdf_util import PDFUtil
from utils.logger import logger
from utils.gpt_utils import chat


def get_base64_pdf_image_list(
    pdf_file: str, pdf_page_index_list: list, output_folder: str = None
) -> dict:
    """Extract page images from a PDF via ``PDFUtil.extract_images``.

    Args:
        pdf_file: Path to the PDF file; must exist on disk.
        pdf_page_index_list: Page indexes to extract. When ``None`` or empty,
            every page reported by ``PDFUtil.get_page_count()`` is used.
        output_folder: Optional folder handed to ``extract_images``; it is
            created first when a non-empty value is given.

    Returns:
        Whatever ``PDFUtil.extract_images`` returns, or ``None`` when
        ``pdf_file`` is missing or does not exist.
        NOTE(review): ``chat_with_image`` consumes the result as a mapping of
        page index -> dict with ``"img_file"`` / ``"img_base64"`` keys;
        confirm against ``PDFUtil``.
    """
    if not pdf_file or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None

    pdf_util = PDFUtil(pdf_file)
    if not pdf_page_index_list:
        # No explicit selection: fall back to every page in the document.
        pdf_page_index_list = list(range(pdf_util.get_page_count()))
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    return pdf_util.extract_images(
        pdf_page_index_list=pdf_page_index_list, output_folder=output_folder
    )


def encode_image(image_path: str):
    """Return the base64 text of the file at ``image_path``.

    Args:
        image_path: Path to the file to encode.

    Returns:
        The file content as a base64 string decoded as UTF-8, or ``None``
        when the path is ``None``, empty, or does not exist.
    """
    if not image_path or not os.path.exists(image_path):
        return None
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def _strip_pdf_suffix(file_name: str) -> str:
    """Drop one trailing ``.pdf`` extension (any letter case) from a name."""
    if file_name.lower().endswith(".pdf"):
        return file_name[: -len(".pdf")]
    return file_name


def _parse_json_response(response):
    """Parse ``response`` as JSON, using ``json_repair`` if strict parsing fails."""
    try:
        return json.loads(response)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers a response
        # that is not str/bytes. Both get the lenient parser, as before, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return json_repair.loads(response)


def chat_with_image(
    pdf_file: str, pdf_page_index_list: list, image_folder: str, gpt_folder: str
):
    """Send each extracted page image to ``chat`` and save the JSON replies.

    For every page returned by ``get_base64_pdf_image_list`` the prompt read
    from ``./instructions/table_extraction_image_prompts.txt`` is sent with
    the page image, and the parsed reply is written to
    ``<gpt_folder>/<pdf base name>_<page index>.json``.

    Args:
        pdf_file: Path to the PDF file; must exist on disk.
        pdf_page_index_list: Page indexes to process (``None``/empty = all).
        image_folder: Folder passed on to the image extraction step.
        gpt_folder: Folder for the JSON responses; created if missing.

    Returns:
        ``None``. Results are written to disk only.

    Raises:
        OSError: If the instructions file cannot be read or an output file
            cannot be written.
    """
    if not pdf_file or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None

    pdf_image_info = get_base64_pdf_image_list(
        pdf_file, pdf_page_index_list, image_folder
    )
    if pdf_image_info is None:
        # Previously this fell through to ``None.items()`` and raised.
        logger.error(f"No image information extracted from {pdf_file}")
        return None

    image_instructions_file = r'./instructions/table_extraction_image_prompts.txt'
    with open(image_instructions_file, "r", encoding="utf-8") as file:
        image_instructions = file.read()

    os.makedirs(gpt_folder, exist_ok=True)
    # Only remove a trailing extension; ``str.replace`` would also remove
    # ".pdf" occurring in the middle of the file name.
    pdf_base_name = _strip_pdf_suffix(os.path.basename(pdf_file))

    for page_index, data in pdf_image_info.items():
        logger.info(f"Processing image in page {page_index}")
        image_base64 = data.get("img_base64", None)
        if not image_base64:
            # Fall back to the image saved on disk when no inline payload
            # was provided for this page.
            image_base64 = encode_image(data.get("img_file", None))
        if not image_base64:
            logger.error(f"No image data available for page {page_index}")
            continue

        response, error = chat(prompt=image_instructions, image_base64=image_base64)
        if error:
            logger.error(f"Error in processing image in page {page_index}: {error}")
            continue

        response_json = _parse_json_response(response)
        response_json_file = os.path.join(
            gpt_folder, f"{pdf_base_name}_{page_index}.json"
        )
        with open(response_json_file, "w", encoding="utf-8") as file:
            json.dump(response_json, file, indent=4)
        logger.info(f"Response for image in page {page_index}: {response}")
    logger.info("Done")


if __name__ == "__main__":
    pdf_file = r"/data/emea_ar/small_pdf/382366116.pdf"
    pdf_page_index_list = [29, 35, 71, 77, 83, 89, 97, 103, 112, 121, 130, 140, 195, 250, 305]
    image_output_folder = r"/data/emea_ar/small_pdf_image/"
    gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/"
    chat_with_image(pdf_file, pdf_page_index_list, image_output_folder, gpt_output_folder)