A few small updates
This commit is contained in:
parent
fa763f4f14
commit
9d453c9fae
|
|
@ -0,0 +1,66 @@
|
|||
import os
|
||||
import json
|
||||
import json_repair
|
||||
import re
|
||||
import fitz
|
||||
import pandas as pd
|
||||
from utils.gpt_utils import chat
|
||||
from utils.pdf_util import PDFUtil
|
||||
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
|
||||
from utils.logger import logger
|
||||
|
||||
|
||||
class Translate_PDF:
    """Translate the text of a PDF page by page and save it as a text file.

    The text of every page is obtained through ``PDFUtil.extract_text``, sent
    to ``chat`` with a translation prompt, and the translated pages are
    written to ``<output_folder>/<pdf name>_translated.txt``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input file and prepare the output folder.

        Args:
            pdf_file: Path of the PDF to translate.
            output_folder: Folder that receives the translated text file. It
                is created when missing; a falsy value disables file output.
            target_language: Language named in the translation prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist. This is a
                subclass of ``Exception``, so ``except Exception`` callers
                keep working.
        """
        self.pdf_file = pdf_file
        if not os.path.exists(self.pdf_file):
            raise FileNotFoundError(f"File {self.pdf_file} not found")
        # start_job() tolerates a falsy output folder, so do the same here
        # instead of failing inside os.path.exists / os.makedirs.
        if output_folder and not os.path.exists(output_folder):
            # exist_ok guards against another process creating it in between.
            os.makedirs(output_folder, exist_ok=True)
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self):
        """Translate every page and write the combined text to the output file.

        Each page is introduced by a ``Page <n>`` separator line. Pages whose
        text is blank are not sent for translation.

        Raises:
            Exception: Any error raised while extracting, translating or
                writing is logged and re-raised with its original type.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()

            # Collect the pieces and join once instead of growing a string.
            parts = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                parts.append(f"----------------- Page {page_num} -----------------\n")
                if page_text.strip() == "":
                    parts.append("\n\n")
                    continue
                parts.append(self.translate_text(page_text) + "\n")
            total_text = "".join(parts)

            if self.output_folder:
                output_file = self._build_output_path()
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(total_text)
        except Exception as e:
            logger.error(f"Error: {e}")
            # Bare raise keeps the original exception type and traceback.
            raise

    def _build_output_path(self) -> str:
        """Return the path of the translated text file for ``self.pdf_file``."""
        # splitext drops only the final extension, whatever its letter case,
        # so "a.PDF" can no longer map onto its own file name.
        stem = os.path.splitext(os.path.basename(self.pdf_file))[0]
        return os.path.join(self.output_folder, f"{stem}_translated.txt")

    def get_pdf_page_text_dict(self) -> dict:
        """Return the third element of ``PDFUtil.extract_text()`` for the PDF.

        NOTE(review): presumably a mapping of page number to page text, as
        start_job() iterates it that way - confirm against PDFUtil.
        """
        pdf_util = PDFUtil(self.pdf_file)
        _success, _text, page_text_dict = pdf_util.extract_text()
        return page_text_dict

    def translate_text(self, text: str):
        """Ask the chat model to translate ``text`` into the target language.

        Args:
            text: Source text to translate.

        Returns:
            The translated text, or an empty string when the response cannot
            be parsed or has no usable ``translated_text`` value.
        """
        instructions = f"Context: \n{text}\n\nInstructions: Translate the contex in {self.target_language}. \n"
        instructions += "Please output the translated text in the following JSON format: {\"translated_text\": \"translated text\"} \n\n"
        instructions += "Answer: \n"
        # NOTE(review): the second return value looks like an error flag and
        # is currently ignored - confirm whether it should be handled.
        response, _with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        return self._parse_translation(response)

    @staticmethod
    def _parse_translation(response) -> str:
        """Extract ``translated_text`` from a JSON response string."""
        try:
            data = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # Strict parsing failed (malformed JSON or a non-string response);
            # fall back to the lenient parser.
            try:
                data = json_repair.loads(response)
            except Exception:
                # Best effort: json_repair's failure modes are not known here,
                # so any failure yields an empty translation.
                return ""
        # Valid JSON is not necessarily an object (it may be a list or string).
        if not isinstance(data, dict):
            return ""
        translated = data.get("translated_text", "")
        if translated is None:
            return ""
        # Callers concatenate the result, so always hand back a string.
        return translated if isinstance(translated, str) else str(translated)
|
||||
|
||||
|
||||
128
main.py
128
main.py
|
|
@ -622,9 +622,18 @@ def test_mapping_raw_name():
|
|||
process_cache=process_cache
|
||||
)
|
||||
print(mapping_info)
|
||||
|
||||
|
||||
def test_translate_pdf():
    """Run a translation job on one sample PDF as a manual smoke test."""
    # Imported here, as in the original, so the module is loaded only when
    # this helper is actually run.
    from core.data_translate import Translate_PDF

    Translate_PDF(
        r"/data/emea_ar/pdf/451063582.pdf",
        r"/data/translate/output/",
    ).start_job()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# test_translate_pdf()
|
||||
pdf_folder = r"/data/emea_ar/pdf/"
|
||||
page_filter_ground_truth_file = (
|
||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||
|
|
@ -821,45 +830,98 @@ if __name__ == "__main__":
|
|||
# 543243585,
|
||||
# 543243654
|
||||
# """
|
||||
# check_db_mapping_doc_id_list = [
|
||||
# "511052670",
|
||||
# "520733219",
|
||||
# "524306810",
|
||||
# "526747539",
|
||||
# "528783089",
|
||||
# "532422720",
|
||||
# "532438210",
|
||||
# "534112077",
|
||||
# "534538571",
|
||||
# "534538682",
|
||||
# "535798742",
|
||||
# "536299372",
|
||||
# "539566148",
|
||||
# "539604165",
|
||||
# "540056900",
|
||||
# "541343431",
|
||||
# "541669780",
|
||||
# "541669996",
|
||||
# "541670397",
|
||||
# "541923319",
|
||||
# "542335994",
|
||||
# "543243585",
|
||||
# "543243654",
|
||||
# "543244170",
|
||||
# "543519140",
|
||||
# "543519615",
|
||||
# "543628379",
|
||||
# "543809340",
|
||||
# "543944737"
|
||||
# ]
|
||||
|
||||
# documents in Final list of EMEA documents.xlsx as typical documents
|
||||
check_db_mapping_doc_id_list = [
|
||||
"511052670",
|
||||
"520733219",
|
||||
"524306810",
|
||||
"526747539",
|
||||
"528783089",
|
||||
"532422720",
|
||||
"532438210",
|
||||
"534112077",
|
||||
"534538571",
|
||||
"534538682",
|
||||
"535798742",
|
||||
"536299372",
|
||||
"539566148",
|
||||
"539604165",
|
||||
"540056900",
|
||||
"541343431",
|
||||
"541669780",
|
||||
"541669996",
|
||||
"541670397",
|
||||
"541923319",
|
||||
"542335994",
|
||||
"543243585",
|
||||
"543243654",
|
||||
"543244170",
|
||||
"543519140",
|
||||
"543519615",
|
||||
"543628379",
|
||||
"543809340",
|
||||
"543944737"
|
||||
]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
"532500349",
|
||||
"535324239",
|
||||
"532442891",
|
||||
"543243650",
|
||||
"528588598",
|
||||
"532437639",
|
||||
"527525440",
|
||||
"534987291",
|
||||
"534112055",
|
||||
"533482585",
|
||||
"544208174",
|
||||
"534547266",
|
||||
"544713166",
|
||||
"526463547",
|
||||
"534535569",
|
||||
"534106067",
|
||||
"532486560",
|
||||
"532781760",
|
||||
"533727067",
|
||||
"527256381",
|
||||
"533392425",
|
||||
"532179676",
|
||||
"534300608",
|
||||
"539233950",
|
||||
"533727908",
|
||||
"532438414",
|
||||
"533681744",
|
||||
"537654645",
|
||||
"533594905",
|
||||
"537926443",
|
||||
"533499655",
|
||||
"533862814",
|
||||
"544918611",
|
||||
"539087870",
|
||||
"536343790"
|
||||
]
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
special_doc_id_list = ["423418540"]
|
||||
# special_doc_id_list = ["425595958",
|
||||
# "451063582",
|
||||
# "451878128",
|
||||
# "466580448",
|
||||
# "481482392",
|
||||
# "492029971",
|
||||
# "508704368",
|
||||
# "510300817",
|
||||
# "512745032",
|
||||
# "514213638",
|
||||
# "527525440",
|
||||
# "532422548",
|
||||
# "532998065",
|
||||
# "534535767",
|
||||
# "536344026",
|
||||
# "540307575"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
extract_ways = ["text"]
|
||||
|
|
|
|||
|
|
@ -997,12 +997,12 @@ if __name__ == "__main__":
|
|||
r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
|
||||
)
|
||||
|
||||
doc_provider_file_path = r"/data/emea_ar/basic_information/English/emea_sample_doc_from_Kshitij.xlsx"
|
||||
doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
|
||||
pdf_folder = r"/data/emea_ar/pdf/"
|
||||
download_pdf(
|
||||
doc_provider_file_path=doc_provider_file_path,
|
||||
sheet_name="Sample EMEA Docs",
|
||||
doc_id_column="Document ID",
|
||||
sheet_name="Sheet1",
|
||||
doc_id_column="Document Id",
|
||||
pdf_path=pdf_folder)
|
||||
# output_pdf_page_text(pdf_folder, output_folder)
|
||||
|
||||
|
|
@ -1013,13 +1013,15 @@ if __name__ == "__main__":
|
|||
# download_pdf(latest_top_100_provider_ar_data_file,
|
||||
# 'latest_ar_document_most_mapping',
|
||||
# pdf_folder)
|
||||
|
||||
output_data_folder = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/"
|
||||
|
||||
|
||||
doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/doc_ar_data_for_final_list_emea_documents.xlsx"
|
||||
output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/"
|
||||
# statistics_document(pdf_folder=pdf_folder,
|
||||
# doc_mapping_file_path=latest_top_100_provider_ar_data_file,
|
||||
# sheet_name="latest_doc_ar_data",
|
||||
# doc_mapping_file_path=doc_mapping_file_path,
|
||||
# sheet_name="doc_ar_data_in_db",
|
||||
# output_folder=output_data_folder,
|
||||
# output_file="latest_doc_ar_mapping_statistics.xlsx")
|
||||
# output_file="doc_ar_data_statistics.xlsx")
|
||||
# get_document_extracted_share_diff_by_db()
|
||||
# statistics_provider_mapping(
|
||||
# provider_mapping_data_file=provider_mapping_data_file,
|
||||
|
|
|
|||
Loading…
Reference in New Issue