A few small updates
This commit is contained in:
parent
fa763f4f14
commit
9d453c9fae
|
|
@ -0,0 +1,66 @@
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import json_repair
|
||||||
|
import re
|
||||||
|
import fitz
|
||||||
|
import pandas as pd
|
||||||
|
from utils.gpt_utils import chat
|
||||||
|
from utils.pdf_util import PDFUtil
|
||||||
|
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
|
||||||
|
from utils.logger import logger
|
||||||
|
|
||||||
|
|
||||||
|
class Translate_PDF:
    """Translate the text of a PDF page by page and save the result as text.

    The page text comes from ``PDFUtil.extract_text`` and each non-blank page
    is sent to ``chat`` with a prompt asking for a JSON object that carries a
    ``translated_text`` field. The translated pages are concatenated, each
    preceded by a page-separator line, and written to
    ``<output_folder>/<pdf name without extension>_translated.txt``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input file and prepare the output folder.

        Args:
            pdf_file: Path of the PDF to translate; it must already exist.
            output_folder: Folder that receives the translated text file. It
                is created when missing. A falsy value disables writing.
            target_language: Language name inserted into the prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        self.pdf_file = pdf_file
        if not os.path.exists(self.pdf_file):
            # FileNotFoundError is an Exception subclass, so callers that
            # caught the previous generic Exception keep working.
            raise FileNotFoundError(f"File {self.pdf_file} not found")
        # start_job treats the folder as optional, so do not call
        # os.makedirs("") (which raises) when no folder was given.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self):
        """Translate every page and write the combined text to the output file.

        Any failure is logged through ``logger`` and then re-raised with its
        original type and traceback.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()

            # Collect the pieces and join once instead of growing a string.
            parts = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                parts.append(f"----------------- Page {page_num} -----------------\n")
                # Blank pages keep their separator but are not sent to chat.
                if not page_text or page_text.strip() == "":
                    parts.append("\n\n")
                    continue
                parts.append(self.translate_text(page_text) + "\n")
            total_text = "".join(parts)

            if self.output_folder:
                with open(self._output_file_path(), "w", encoding="utf-8") as f:
                    f.write(total_text)
        except Exception as e:
            logger.error(f"Error: {e}")
            # Bare raise keeps the original exception type and traceback.
            raise

    def _output_file_path(self) -> str:
        """Return the path of the translated text file inside the output folder."""
        # splitext only strips the real extension: it is case-insensitive
        # about ".PDF" and leaves any ".pdf" inside the file name untouched.
        stem = os.path.splitext(os.path.basename(self.pdf_file))[0]
        return os.path.join(self.output_folder, f"{stem}_translated.txt")

    def get_pdf_page_text_dict(self) -> dict:
        """Return the third value of ``PDFUtil.extract_text`` for this PDF.

        ``start_job`` iterates it as ``page number -> page text``. The first
        two returned values are not used here.
        """
        pdf_util = PDFUtil(self.pdf_file)
        _success, _text, page_text_dict = pdf_util.extract_text()
        return page_text_dict

    def translate_text(self, text: str) -> str:
        """Ask ``chat`` to translate ``text`` and return the translated string.

        Returns an empty string when the response cannot be parsed into a
        JSON object.
        """
        # NOTE: the prompt wording (including "contex") is kept byte-identical
        # so that the text sent to the model does not change.
        instructions = f"Context: \n{text}\n\nInstructions: Translate the contex in {self.target_language}. \n"
        instructions += "Please output the translated text in the following JSON format: {\"translated_text\": \"translated text\"} \n\n"
        instructions += "Answer: \n"
        # The second returned value is ignored, as before.
        response, _with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        return self._parse_translation(response)

    @staticmethod
    def _parse_translation(response) -> str:
        """Extract ``translated_text`` from a JSON response string.

        Strict JSON parsing is tried first, then ``json_repair``. Anything
        that does not end up as a JSON object yields an empty string.
        """
        try:
            data = json.loads(response)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # response that is not str/bytes (for example None).
            try:
                data = json_repair.loads(response)
            except Exception:
                data = None
        # A parsed list or plain string has no .get(); treat it as no result.
        if not isinstance(data, dict):
            return ""
        translated = data.get("translated_text", "")
        # start_job concatenates the result, so always hand back a str.
        return "" if translated is None else str(translated)
|
||||||
|
|
||||||
|
|
||||||
128
main.py
128
main.py
|
|
@ -624,7 +624,16 @@ def test_mapping_raw_name():
|
||||||
print(mapping_info)
|
print(mapping_info)
|
||||||
|
|
||||||
|
|
||||||
|
def test_translate_pdf():
    """Manual smoke run: translate one sample PDF into the output folder.

    No target language is passed, so the class default is used.
    """
    # Imported here, as in the original, so the module is only loaded
    # when this helper is actually run.
    from core.data_translate import Translate_PDF

    sample_pdf = r"/data/emea_ar/pdf/451063582.pdf"
    translated_dir = r"/data/translate/output/"
    Translate_PDF(sample_pdf, translated_dir).start_job()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# test_translate_pdf()
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
|
|
@ -821,45 +830,98 @@ if __name__ == "__main__":
|
||||||
# 543243585,
|
# 543243585,
|
||||||
# 543243654
|
# 543243654
|
||||||
# """
|
# """
|
||||||
|
# check_db_mapping_doc_id_list = [
|
||||||
|
# "511052670",
|
||||||
|
# "520733219",
|
||||||
|
# "524306810",
|
||||||
|
# "526747539",
|
||||||
|
# "528783089",
|
||||||
|
# "532422720",
|
||||||
|
# "532438210",
|
||||||
|
# "534112077",
|
||||||
|
# "534538571",
|
||||||
|
# "534538682",
|
||||||
|
# "535798742",
|
||||||
|
# "536299372",
|
||||||
|
# "539566148",
|
||||||
|
# "539604165",
|
||||||
|
# "540056900",
|
||||||
|
# "541343431",
|
||||||
|
# "541669780",
|
||||||
|
# "541669996",
|
||||||
|
# "541670397",
|
||||||
|
# "541923319",
|
||||||
|
# "542335994",
|
||||||
|
# "543243585",
|
||||||
|
# "543243654",
|
||||||
|
# "543244170",
|
||||||
|
# "543519140",
|
||||||
|
# "543519615",
|
||||||
|
# "543628379",
|
||||||
|
# "543809340",
|
||||||
|
# "543944737"
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# documents in Final list of EMEA documents.xlsx as typical documents
|
||||||
check_db_mapping_doc_id_list = [
|
check_db_mapping_doc_id_list = [
|
||||||
"511052670",
|
"532500349",
|
||||||
"520733219",
|
"535324239",
|
||||||
"524306810",
|
"532442891",
|
||||||
"526747539",
|
"543243650",
|
||||||
"528783089",
|
"528588598",
|
||||||
"532422720",
|
"532437639",
|
||||||
"532438210",
|
"527525440",
|
||||||
"534112077",
|
"534987291",
|
||||||
"534538571",
|
"534112055",
|
||||||
"534538682",
|
"533482585",
|
||||||
"535798742",
|
"544208174",
|
||||||
"536299372",
|
"534547266",
|
||||||
"539566148",
|
"544713166",
|
||||||
"539604165",
|
"526463547",
|
||||||
"540056900",
|
"534535569",
|
||||||
"541343431",
|
"534106067",
|
||||||
"541669780",
|
"532486560",
|
||||||
"541669996",
|
"532781760",
|
||||||
"541670397",
|
"533727067",
|
||||||
"541923319",
|
"527256381",
|
||||||
"542335994",
|
"533392425",
|
||||||
"543243585",
|
"532179676",
|
||||||
"543243654",
|
"534300608",
|
||||||
"543244170",
|
"539233950",
|
||||||
"543519140",
|
"533727908",
|
||||||
"543519615",
|
"532438414",
|
||||||
"543628379",
|
"533681744",
|
||||||
"543809340",
|
"537654645",
|
||||||
"543944737"
|
"533594905",
|
||||||
]
|
"537926443",
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
"533499655",
|
||||||
|
"533862814",
|
||||||
|
"544918611",
|
||||||
|
"539087870",
|
||||||
|
"536343790"
|
||||||
|
]
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["423418540"]
|
# special_doc_id_list = ["425595958",
|
||||||
|
# "451063582",
|
||||||
|
# "451878128",
|
||||||
|
# "466580448",
|
||||||
|
# "481482392",
|
||||||
|
# "492029971",
|
||||||
|
# "508704368",
|
||||||
|
# "510300817",
|
||||||
|
# "512745032",
|
||||||
|
# "514213638",
|
||||||
|
# "527525440",
|
||||||
|
# "532422548",
|
||||||
|
# "532998065",
|
||||||
|
# "534535767",
|
||||||
|
# "536344026",
|
||||||
|
# "540307575"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
|
|
@ -997,12 +997,12 @@ if __name__ == "__main__":
|
||||||
r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
|
r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
|
||||||
)
|
)
|
||||||
|
|
||||||
doc_provider_file_path = r"/data/emea_ar/basic_information/English/emea_sample_doc_from_Kshitij.xlsx"
|
doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
download_pdf(
|
download_pdf(
|
||||||
doc_provider_file_path=doc_provider_file_path,
|
doc_provider_file_path=doc_provider_file_path,
|
||||||
sheet_name="Sample EMEA Docs",
|
sheet_name="Sheet1",
|
||||||
doc_id_column="Document ID",
|
doc_id_column="Document Id",
|
||||||
pdf_path=pdf_folder)
|
pdf_path=pdf_folder)
|
||||||
# output_pdf_page_text(pdf_folder, output_folder)
|
# output_pdf_page_text(pdf_folder, output_folder)
|
||||||
|
|
||||||
|
|
@ -1014,12 +1014,14 @@ if __name__ == "__main__":
|
||||||
# 'latest_ar_document_most_mapping',
|
# 'latest_ar_document_most_mapping',
|
||||||
# pdf_folder)
|
# pdf_folder)
|
||||||
|
|
||||||
output_data_folder = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/"
|
|
||||||
|
doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/doc_ar_data_for_final_list_emea_documents.xlsx"
|
||||||
|
output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/"
|
||||||
# statistics_document(pdf_folder=pdf_folder,
|
# statistics_document(pdf_folder=pdf_folder,
|
||||||
# doc_mapping_file_path=latest_top_100_provider_ar_data_file,
|
# doc_mapping_file_path=doc_mapping_file_path,
|
||||||
# sheet_name="latest_doc_ar_data",
|
# sheet_name="doc_ar_data_in_db",
|
||||||
# output_folder=output_data_folder,
|
# output_folder=output_data_folder,
|
||||||
# output_file="latest_doc_ar_mapping_statistics.xlsx")
|
# output_file="doc_ar_data_statistics.xlsx")
|
||||||
# get_document_extracted_share_diff_by_db()
|
# get_document_extracted_share_diff_by_db()
|
||||||
# statistics_provider_mapping(
|
# statistics_provider_mapping(
|
||||||
# provider_mapping_data_file=provider_mapping_data_file,
|
# provider_mapping_data_file=provider_mapping_data_file,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue