dc-ml-emea-ar/core/data_translate.py

66 lines
2.5 KiB
Python
Raw Normal View History

2024-10-28 20:15:55 +00:00
import os
import json
import json_repair
import re
import fitz
import pandas as pd
from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
from utils.logger import logger
class Translate_PDF:
    """Translate the text of a PDF page by page and save it as a text file.

    Page text comes from ``PDFUtil.extract_text`` and each non-blank page is
    sent to ``chat`` with a prompt asking for a JSON object of the form
    ``{"translated_text": ...}``. The translated pages are concatenated,
    each preceded by a ``Page N`` separator line, and written to
    ``<output_folder>/<pdf stem>_translated.txt``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input file and prepare the output folder.

        Args:
            pdf_file: Path of the PDF to translate; must already exist.
            output_folder: Folder that receives the translated text file.
                It is created when missing. A falsy value (empty string or
                ``None``) means "do not write any output".
            target_language: Language name inserted into the prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist. This is an
                ``Exception`` subclass, so ``except Exception`` callers
                keep working.
        """
        self.pdf_file = pdf_file
        if not os.path.exists(self.pdf_file):
            raise FileNotFoundError(f"File {self.pdf_file} not found")
        # start_job() treats a falsy output folder as "no output", so the
        # constructor must not crash on it either.
        if output_folder and not os.path.exists(output_folder):
            # exist_ok guards against another process creating the folder
            # between the check above and this call.
            os.makedirs(output_folder, exist_ok=True)
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self) -> None:
        """Translate every page and write the combined result to disk.

        Blank pages are not sent for translation; they contribute only
        their separator line followed by two newlines.

        Raises:
            Exception: Wraps any error raised while extracting, translating
                or writing; the original error is chained as ``__cause__``.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()
            parts = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                parts.append(f"----------------- Page {page_num} -----------------\n")
                if page_text.strip() == "":
                    parts.append("\n\n")
                    continue
                parts.append(self.translate_text(page_text) + "\n")
            total_text = "".join(parts)
            if self.output_folder:
                with open(self._translated_file_path(), "w", encoding="utf-8") as f:
                    f.write(total_text)
        except Exception as e:
            logger.error(f"Error: {e}")
            raise Exception(e) from e

    def _translated_file_path(self) -> str:
        """Return the output path ``<output_folder>/<pdf stem>_translated.txt``.

        Only the final extension is dropped, whatever its letter case. A
        plain ``str.replace(".pdf", ...)`` would leave ``X.PDF`` unchanged
        (so the output could overwrite the source PDF) and would mangle
        names that contain ``.pdf`` more than once.
        """
        stem, _ext = os.path.splitext(os.path.basename(self.pdf_file))
        return os.path.join(self.output_folder, f"{stem}_translated.txt")

    def get_pdf_page_text_dict(self) -> dict:
        """Return the per-page text mapping produced by ``PDFUtil.extract_text``.

        An empty dict is returned when the extractor yields a falsy mapping,
        so callers can always iterate the result.
        """
        pdf_util = PDFUtil(self.pdf_file)
        success, _text, page_text_dict = pdf_util.extract_text()
        # NOTE(review): `success` is presumed to be a truthy-on-success flag;
        # confirm against PDFUtil.extract_text.
        if not success:
            logger.error(f"Failed to extract text from {self.pdf_file}")
        return page_text_dict or {}

    def translate_text(self, text: str) -> str:
        """Translate ``text`` into ``self.target_language`` via ``chat``.

        Returns:
            The value of the ``translated_text`` key in the model's JSON
            reply, or an empty string when the reply cannot be parsed into
            a JSON object carrying that key.
        """
        instructions = f"Context: \n{text}\n\nInstructions: Translate the contex in {self.target_language}. \n"
        instructions += "Please output the translated text in the following JSON format: {\"translated_text\": \"translated text\"} \n\n"
        instructions += "Answer: \n"
        # The second element of chat()'s result is not used: an unusable
        # response simply falls through to the empty-string fallback.
        response, _with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        return self._extract_translated_text(response)

    @staticmethod
    def _extract_translated_text(response) -> str:
        """Pull ``translated_text`` out of a raw model response.

        Strict JSON parsing is tried first, then ``json_repair`` as a
        lenient fallback. Anything that does not end up as a JSON object
        yields an empty string.
        """
        try:
            data = json.loads(response)
        except (TypeError, ValueError):
            # TypeError: response is not str/bytes (e.g. None);
            # ValueError: malformed JSON (JSONDecodeError subclasses it).
            try:
                data = json_repair.loads(response)
            except Exception:
                # Best effort: the repair library's failure modes are not
                # visible here, so any error means "no translation".
                data = {"translated_text": ""}
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, number).
            return ""
        translated = data.get("translated_text", "")
        if translated is None:
            return ""
        # Callers concatenate the result with str, so always hand back str.
        return translated if isinstance(translated, str) else str(translated)