class Translate_PDF:
    """Translate the text of a PDF page by page and save it as a text file.

    The page texts come from ``PDFUtil.extract_text()``; every non-empty page
    is sent to ``chat`` with a translation prompt, and the translated pages
    are written to ``<output_folder>/<pdf name>_translated.txt``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input file and make sure the output folder exists.

        Args:
            pdf_file: Path of the PDF to translate.
            output_folder: Folder that receives the translated text file; it
                is created when missing.
            target_language: Language name inserted into the prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` is not an existing file. This
                is a subclass of ``Exception``, so callers that catch
                ``Exception`` keep working.
        """
        if not os.path.isfile(pdf_file):
            raise FileNotFoundError(f"File {pdf_file} not found")
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)
        self.pdf_file = pdf_file
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self) -> None:
        """Translate every page and write the combined result to disk.

        Each page contributes a ``Page N`` separator line followed by its
        translation; pages without text contribute the separator and two
        newlines only.

        Raises:
            Exception: Any error raised while extracting, translating or
                writing is logged and re-raised with its original type.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()

            parts = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                parts.append(f"----------------- Page {page_num} -----------------\n")
                if not page_text or not page_text.strip():
                    parts.append("\n\n")
                    continue
                parts.append(self.translate_text(page_text) + "\n")

            if self.output_folder:
                with open(self._output_file_path(), "w", encoding="utf-8") as f:
                    f.write("".join(parts))
        except Exception as e:
            logger.error(f"Error: {e}")
            # Bare raise keeps the original exception type and traceback.
            raise

    def get_pdf_page_text_dict(self) -> dict:
        """Return the per-page text mapping extracted from the PDF.

        ``PDFUtil.extract_text()`` is unpacked as a 3-tuple and only its third
        element is used. A falsy result is normalised to an empty dict so
        that callers can always iterate it.
        """
        pdf_util = PDFUtil(self.pdf_file)
        # NOTE(review): the first element looks like a success flag -- confirm
        # whether a failed extraction should raise instead of yielding no pages.
        _success, _text, page_text_dict = pdf_util.extract_text()
        return page_text_dict or {}

    def translate_text(self, text: str) -> str:
        """Translate ``text`` into ``self.target_language`` through ``chat``.

        Returns:
            The translated text, or an empty string when the reply cannot be
            parsed or carries no ``translated_text`` value.
        """
        # NOTE(review): the second element returned by chat() is not acted
        # on here -- confirm whether it should be logged or raised.
        response, _with_error = chat(
            self._build_prompt(text), response_format={"type": "json_object"}
        )
        return self._parse_translation(response)

    def _build_prompt(self, text: str) -> str:
        """Build the translation prompt sent to the chat model."""
        return (
            f"Context: \n{text}\n\n"
            f"Instructions: Translate the context in {self.target_language}. \n"
            "Please output the translated text in the following JSON format: "
            "{\"translated_text\": \"translated text\"} \n\n"
            "Answer: \n"
        )

    def _output_file_path(self) -> str:
        """Return the path of the translated text file for ``self.pdf_file``.

        Only the final extension is stripped, whatever its case, so the
        result can never be identical to the name of the source PDF.
        """
        stem = os.path.splitext(os.path.basename(self.pdf_file))[0]
        return os.path.join(self.output_folder, f"{stem}_translated.txt")

    @staticmethod
    def _parse_translation(response) -> str:
        """Extract ``translated_text`` from a chat reply.

        Strict JSON parsing is tried first and ``json_repair`` second. Any
        reply that is not a JSON object, or whose value is missing or null,
        yields an empty string.
        """
        try:
            data = json.loads(response)
        except (TypeError, ValueError):
            # TypeError covers a None reply; JSONDecodeError is a ValueError.
            try:
                data = json_repair.loads(response)
            except Exception:
                data = {}
        if not isinstance(data, dict):
            return ""
        translated = data.get("translated_text", "")
        if translated is None:
            return ""
        return translated if isinstance(translated, str) else str(translated)
- "536299372", - "539566148", - "539604165", - "540056900", - "541343431", - "541669780", - "541669996", - "541670397", - "541923319", - "542335994", - "543243585", - "543243654", - "543244170", - "543519140", - "543519615", - "543628379", - "543809340", - "543944737" - ] - # special_doc_id_list = check_mapping_doc_id_list + "532500349", + "535324239", + "532442891", + "543243650", + "528588598", + "532437639", + "527525440", + "534987291", + "534112055", + "533482585", + "544208174", + "534547266", + "544713166", + "526463547", + "534535569", + "534106067", + "532486560", + "532781760", + "533727067", + "527256381", + "533392425", + "532179676", + "534300608", + "539233950", + "533727908", + "532438414", + "533681744", + "537654645", + "533594905", + "537926443", + "533499655", + "533862814", + "544918611", + "539087870", + "536343790" + ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["423418540"] + # special_doc_id_list = ["425595958", + # "451063582", + # "451878128", + # "466580448", + # "481482392", + # "492029971", + # "508704368", + # "510300817", + # "512745032", + # "514213638", + # "527525440", + # "532422548", + # "532998065", + # "534535767", + # "536344026", + # "540307575"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True calculate_metrics = False extract_ways = ["text"] diff --git a/prepare_data.py b/prepare_data.py index cfed2ce..5da0a56 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -997,12 +997,12 @@ if __name__ == "__main__": r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx" ) - doc_provider_file_path = r"/data/emea_ar/basic_information/English/emea_sample_doc_from_Kshitij.xlsx" + doc_provider_file_path = 
r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx" pdf_folder = r"/data/emea_ar/pdf/" download_pdf( doc_provider_file_path=doc_provider_file_path, - sheet_name="Sample EMEA Docs", - doc_id_column="Document ID", + sheet_name="Sheet1", + doc_id_column="Document Id", pdf_path=pdf_folder) # output_pdf_page_text(pdf_folder, output_folder) @@ -1013,13 +1013,15 @@ if __name__ == "__main__": # download_pdf(latest_top_100_provider_ar_data_file, # 'latest_ar_document_most_mapping', # pdf_folder) - - output_data_folder = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/" + + + doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/doc_ar_data_for_final_list_emea_documents.xlsx" + output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/" # statistics_document(pdf_folder=pdf_folder, - # doc_mapping_file_path=latest_top_100_provider_ar_data_file, - # sheet_name="latest_doc_ar_data", + # doc_mapping_file_path=doc_mapping_file_path, + # sheet_name="doc_ar_data_in_db", # output_folder=output_data_folder, - # output_file="latest_doc_ar_mapping_statistics.xlsx") + # output_file="doc_ar_data_statistics.xlsx") # get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file,