import json
import os
import re

import fitz
import json_repair
import pandas as pd

from utils.gpt_utils import chat
from utils.logger import logger
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider


class Translate_PDF:
    """Translate the text of a PDF page by page and save it as a text file.

    Page text is obtained through ``PDFUtil.extract_text`` and each non-blank
    page is sent to ``chat`` with a prompt asking for a JSON object of the
    form ``{"translated_text": ...}``. The translated pages are concatenated,
    each preceded by a page separator line, and written to
    ``<output_folder>/<pdf name without extension>_translated.txt``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input file and make sure the output folder exists.

        Args:
            pdf_file: Path of the PDF to translate.
            output_folder: Folder that receives the translated text file; it
                is created when it does not exist yet.
            target_language: Language name inserted into the prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        self.pdf_file = pdf_file
        if not os.path.exists(self.pdf_file):
            # FileNotFoundError is an Exception subclass, so callers that
            # catch Exception keep working.
            raise FileNotFoundError(f"File {self.pdf_file} not found")
        if not os.path.exists(output_folder):
            # exist_ok closes the race between the check above and creation.
            os.makedirs(output_folder, exist_ok=True)
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self) -> None:
        """Translate every page and write the combined result to disk.

        Raises:
            Exception: Any error raised while extracting, translating or
                writing is logged and then re-raised with its original type.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()
            sections = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                sections.append(f"----------------- Page {page_num} -----------------\n")
                # Blank (or missing) pages are not sent for translation.
                if not page_text or page_text.strip() == "":
                    sections.append("\n\n")
                    continue
                sections.append(self.translate_text(page_text) + "\n")
            total_text = "".join(sections)

            if self.output_folder:
                output_file = self._output_path()
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(total_text)
        except Exception as e:
            logger.error(f"Error: {e}")
            # Bare raise keeps the original exception type and traceback.
            raise

    def get_pdf_page_text_dict(self) -> dict:
        """Return the third element of ``PDFUtil(self.pdf_file).extract_text()``.

        ``start_job`` iterates the result with ``.items()`` and treats each
        key as a page number and each value as that page's text.
        """
        pdf_util = PDFUtil(self.pdf_file)
        # NOTE(review): the first element looks like a success flag and is
        # not checked here - confirm whether a failure should raise.
        _success, _text, page_text_dict = pdf_util.extract_text()
        return page_text_dict

    def translate_text(self, text: str) -> str:
        """Ask ``chat`` to translate ``text`` into ``self.target_language``.

        Args:
            text: Source text to translate.

        Returns:
            The ``translated_text`` value of the JSON reply, or ``""`` when
            the reply cannot be parsed or does not contain that key.
        """
        # NOTE(review): "contex" below looks like a typo for "context"; it is
        # left untouched because editing the prompt changes the model input.
        instructions = (
            f"Context: \n{text}\n\n"
            f"Instructions: Translate the contex in {self.target_language}. \n"
            "Please output the translated text in the following JSON format: "
            '{"translated_text": "translated text"} \n\n'
            "Answer: \n"
        )
        # NOTE(review): the second return value of chat() is ignored, as in
        # the original code - presumably an error indicator; confirm.
        response, _with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        return self._parse_translation(response)

    def _output_path(self) -> str:
        """Build the path of the translated text file inside the output folder."""
        # splitext handles any extension case (".PDF") and never touches a
        # ".pdf" that appears in the middle of the file name.
        stem, _ext = os.path.splitext(os.path.basename(self.pdf_file))
        return os.path.join(self.output_folder, f"{stem}_translated.txt")

    @staticmethod
    def _parse_translation(response) -> str:
        """Extract ``translated_text`` from a JSON reply, tolerating bad JSON."""
        try:
            data = json.loads(response)
        except (ValueError, TypeError):
            # Strict parsing failed (malformed JSON or a non-string reply);
            # fall back to the lenient parser before giving up.
            try:
                data = json_repair.loads(response)
            except Exception:
                data = {}

        # A repaired payload is not guaranteed to be a JSON object.
        if not isinstance(data, dict):
            return ""
        translated = data.get("translated_text", "")
        if translated is None:
            return ""
        return translated if isinstance(translated, str) else str(translated)