A few small updates

This commit is contained in:
Blade He 2024-10-28 15:15:55 -05:00
parent fa763f4f14
commit 9d453c9fae
3 changed files with 171 additions and 41 deletions

66
core/data_translate.py Normal file
View File

@ -0,0 +1,66 @@
import os
import json
import json_repair
import re
import fitz
import pandas as pd
from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
from utils.logger import logger
class Translate_PDF:
    """Translate the text of a PDF page by page and save the result as text.

    The page texts come from ``PDFUtil.extract_text``; each non-blank page is
    sent to ``chat`` with a translation prompt, and the translated pages are
    concatenated (with a separator line per page) into
    ``<pdf name>_translated.txt`` inside ``output_folder``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input file and prepare the output folder.

        Args:
            pdf_file: Path of the PDF to translate; must already exist.
            output_folder: Folder that receives the translated text file. It
                is created when missing. A falsy value is accepted, in which
                case ``start_job`` translates but writes nothing.
            target_language: Language named in the translation prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        self.pdf_file = pdf_file
        if not os.path.exists(self.pdf_file):
            raise FileNotFoundError(f"File {self.pdf_file} not found")
        # start_job() already tolerates a falsy output folder, so do not crash
        # on it here; exist_ok avoids the check-then-create race.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self) -> None:
        """Translate every page and write the combined text to the output file.

        Blank pages are not sent for translation; they only contribute their
        separator line followed by two newlines.

        Raises:
            Exception: Any failure is logged and re-raised unchanged, so the
                original exception type and traceback are preserved.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()
            chunks = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                chunks.append(f"----------------- Page {page_num} -----------------\n")
                if not page_text.strip():
                    chunks.append("\n\n")
                    continue
                chunks.append(self.translate_text(page_text) + "\n")
            if self.output_folder:
                with open(self._output_file_path(), "w", encoding="utf-8") as f:
                    f.write("".join(chunks))
        except Exception as e:
            logger.error(f"Error: {e}")
            raise

    def get_pdf_page_text_dict(self) -> dict:
        """Return the third element of ``PDFUtil(...).extract_text()``.

        ``start_job`` iterates the result as a mapping of page number to page
        text.
        """
        pdf_util = PDFUtil(self.pdf_file)
        # NOTE(review): the first element looks like a success flag and the
        # second like the full document text; both are ignored here - confirm
        # whether a failed extraction should be reported to the caller.
        _success, _text, page_text_dict = pdf_util.extract_text()
        return page_text_dict

    def translate_text(self, text: str) -> str:
        """Ask ``chat`` to translate ``text`` into ``self.target_language``.

        Returns:
            The ``translated_text`` value of the JSON reply, or ``""`` when
            the reply cannot be parsed into an object with that key.
        """
        instructions = f"Context: \n{text}\n\nInstructions: Translate the contex in {self.target_language}. \n"
        instructions += "Please output the translated text in the following JSON format: {\"translated_text\": \"translated text\"} \n\n"
        instructions += "Answer: \n"
        # The second element of the returned pair is not used by this method.
        response, _with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        return self._parse_translation(response)

    def _output_file_path(self) -> str:
        """Build ``<output_folder>/<pdf stem>_translated.txt``."""
        # splitext only strips the real extension, whatever its case, unlike a
        # plain ".pdf" substring replacement.
        stem, _ext = os.path.splitext(os.path.basename(self.pdf_file))
        return os.path.join(self.output_folder, f"{stem}_translated.txt")

    @staticmethod
    def _parse_translation(response) -> str:
        """Extract ``translated_text`` from a JSON reply, ``""`` on failure."""
        try:
            data = json.loads(response)
        except (TypeError, ValueError):
            # Strict parsing failed; fall back to json_repair for malformed
            # JSON, and give up with an empty result if that fails as well.
            try:
                data = json_repair.loads(response)
            except Exception:
                data = {}
        if not isinstance(data, dict):
            return ""
        translated = data.get("translated_text", "")
        return "" if translated is None else str(translated)

128
main.py
View File

@ -622,9 +622,18 @@ def test_mapping_raw_name():
process_cache=process_cache
)
print(mapping_info)
def test_translate_pdf():
    """Manually run a translation job for one hard-coded sample PDF.

    Builds a ``Translate_PDF`` for the sample document with the default
    target language and starts the job immediately.
    """
    # Imported locally, as in the rest of this helper, so the module is only
    # loaded when this check is actually run.
    from core.data_translate import Translate_PDF

    source_pdf = r"/data/emea_ar/pdf/451063582.pdf"
    destination_dir = r"/data/translate/output/"
    Translate_PDF(source_pdf, destination_dir).start_job()
if __name__ == "__main__":
# test_translate_pdf()
pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -821,45 +830,98 @@ if __name__ == "__main__":
# 543243585,
# 543243654
# """
# check_db_mapping_doc_id_list = [
# "511052670",
# "520733219",
# "524306810",
# "526747539",
# "528783089",
# "532422720",
# "532438210",
# "534112077",
# "534538571",
# "534538682",
# "535798742",
# "536299372",
# "539566148",
# "539604165",
# "540056900",
# "541343431",
# "541669780",
# "541669996",
# "541670397",
# "541923319",
# "542335994",
# "543243585",
# "543243654",
# "543244170",
# "543519140",
# "543519615",
# "543628379",
# "543809340",
# "543944737"
# ]
# documents in Final list of EMEA documents.xlsx as typical documents
check_db_mapping_doc_id_list = [
"511052670",
"520733219",
"524306810",
"526747539",
"528783089",
"532422720",
"532438210",
"534112077",
"534538571",
"534538682",
"535798742",
"536299372",
"539566148",
"539604165",
"540056900",
"541343431",
"541669780",
"541669996",
"541670397",
"541923319",
"542335994",
"543243585",
"543243654",
"543244170",
"543519140",
"543519615",
"543628379",
"543809340",
"543944737"
]
# special_doc_id_list = check_mapping_doc_id_list
"532500349",
"535324239",
"532442891",
"543243650",
"528588598",
"532437639",
"527525440",
"534987291",
"534112055",
"533482585",
"544208174",
"534547266",
"544713166",
"526463547",
"534535569",
"534106067",
"532486560",
"532781760",
"533727067",
"527256381",
"533392425",
"532179676",
"534300608",
"539233950",
"533727908",
"532438414",
"533681744",
"537654645",
"533594905",
"537926443",
"533499655",
"533862814",
"544918611",
"539087870",
"536343790"
]
special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["423418540"]
# special_doc_id_list = ["425595958",
# "451063582",
# "451878128",
# "466580448",
# "481482392",
# "492029971",
# "508704368",
# "510300817",
# "512745032",
# "514213638",
# "527525440",
# "532422548",
# "532998065",
# "534535767",
# "536344026",
# "540307575"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = False
force_save_total_data = True
calculate_metrics = False
extract_ways = ["text"]

View File

@ -997,12 +997,12 @@ if __name__ == "__main__":
r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
)
doc_provider_file_path = r"/data/emea_ar/basic_information/English/emea_sample_doc_from_Kshitij.xlsx"
doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
pdf_folder = r"/data/emea_ar/pdf/"
download_pdf(
doc_provider_file_path=doc_provider_file_path,
sheet_name="Sample EMEA Docs",
doc_id_column="Document ID",
sheet_name="Sheet1",
doc_id_column="Document Id",
pdf_path=pdf_folder)
# output_pdf_page_text(pdf_folder, output_folder)
@ -1013,13 +1013,15 @@ if __name__ == "__main__":
# download_pdf(latest_top_100_provider_ar_data_file,
# 'latest_ar_document_most_mapping',
# pdf_folder)
output_data_folder = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/"
doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/doc_ar_data_for_final_list_emea_documents.xlsx"
output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/"
# statistics_document(pdf_folder=pdf_folder,
# doc_mapping_file_path=latest_top_100_provider_ar_data_file,
# sheet_name="latest_doc_ar_data",
# doc_mapping_file_path=doc_mapping_file_path,
# sheet_name="doc_ar_data_in_db",
# output_folder=output_data_folder,
# output_file="latest_doc_ar_mapping_statistics.xlsx")
# output_file="doc_ar_data_statistics.xlsx")
# get_document_extracted_share_diff_by_db()
# statistics_provider_mapping(
# provider_mapping_data_file=provider_mapping_data_file,