66 lines
2.5 KiB
Python
66 lines
2.5 KiB
Python
import os
|
|
import json
|
|
import json_repair
|
|
import re
|
|
import fitz
|
|
import pandas as pd
|
|
from utils.gpt_utils import chat
|
|
from utils.pdf_util import PDFUtil
|
|
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
|
|
from utils.logger import logger
|
|
|
|
|
|
class Translate_PDF:
    """Translate the text of a PDF page by page and save the result as text.

    The page texts come from ``PDFUtil.extract_text``; each non-blank page is
    sent to ``chat`` with a translation prompt, and the translated pages are
    written to ``<pdf name>_translated.txt`` inside ``output_folder``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input path and prepare the output folder.

        Args:
            pdf_file: Path of the PDF to translate; must exist.
            output_folder: Folder for the translated text file. It is created
                when missing. A falsy value (``""``/``None``) is accepted and
                means ``start_job`` writes no file.
            target_language: Language name inserted into the prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        if not os.path.exists(pdf_file):
            raise FileNotFoundError(f"File {pdf_file} not found")
        self.pdf_file = pdf_file

        # start_job() tolerates a falsy output folder, so the constructor must
        # not crash on one. exist_ok avoids the exists()/makedirs() race.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self):
        """Translate every page and write the combined text to the output file.

        Each page contributes a ``Page N`` separator line followed by either
        its translation or, for blank pages, two newlines. Nothing is written
        when ``self.output_folder`` is falsy.

        Raises:
            Exception: Any error raised while extracting, translating or
                writing is logged and re-raised with its original type.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()

            # Collect pieces and join once instead of repeated string +=.
            parts = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                parts.append(f"----------------- Page {page_num} -----------------\n")
                if not page_text or not page_text.strip():
                    parts.append("\n\n")
                    continue
                parts.append(self.translate_text(page_text) + "\n")

            if self.output_folder:
                with open(self._output_path(), "w", encoding="utf-8") as f:
                    f.write("".join(parts))
        except Exception as e:
            logger.error(f"Error: {e}")
            # Bare raise keeps the original exception type and traceback.
            raise

    def _output_path(self) -> str:
        """Return the path of the translated text file for ``self.pdf_file``."""
        # splitext handles any extension case (".PDF") and only touches the
        # final suffix, so the result can never equal the input file name.
        stem, _ext = os.path.splitext(os.path.basename(self.pdf_file))
        return os.path.join(self.output_folder, f"{stem}_translated.txt")

    def get_pdf_page_text_dict(self) -> dict:
        """Return the per-page text mapping produced by ``PDFUtil.extract_text``.

        ``extract_text`` returns a 3-tuple; only the last element is returned.
        """
        pdf_util = PDFUtil(self.pdf_file)
        success, _text, page_text_dict = pdf_util.extract_text()
        # NOTE(review): the first element is presumably a success flag --
        # confirm against PDFUtil. It is only logged here, not acted upon.
        if not success:
            logger.warning(f"Text extraction reported failure for {self.pdf_file}")
        return page_text_dict

    def translate_text(self, text: str):
        """Ask the chat model to translate ``text`` into the target language.

        Args:
            text: Source text placed in the prompt as context.

        Returns:
            The ``translated_text`` value from the model's JSON answer, or an
            empty string when the answer cannot be parsed into that shape.
        """
        instructions = f"Context: \n{text}\n\nInstructions: Translate the contex in {self.target_language}. \n"
        instructions += "Please output the translated text in the following JSON format: {\"translated_text\": \"translated text\"} \n\n"
        instructions += "Answer: \n"
        response, with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        # NOTE(review): the second element is presumably an error indicator --
        # confirm against utils.gpt_utils.chat. Parsing is still attempted.
        if with_error:
            logger.warning("chat() reported an error while translating")
        return self._parse_translation(response)

    @staticmethod
    def _parse_translation(response) -> str:
        """Extract ``translated_text`` from a JSON response, tolerating bad JSON."""
        try:
            data = json.loads(response)
        except (TypeError, ValueError):
            # Malformed or non-string response: fall back to lenient parsing.
            try:
                data = json_repair.loads(response)
            except Exception:
                data = None

        # The parsers may return a list, string or number; only a JSON object
        # can carry the expected key.
        if not isinstance(data, dict):
            return ""
        translated = data.get("translated_text", "")
        if translated is None:
            return ""
        # Callers concatenate the result with "\n", so always hand back a str.
        return translated if isinstance(translated, str) else str(translated)
|
|
|
|
|