A few small updates

This commit is contained in:
Blade He 2024-10-28 15:15:55 -05:00
parent fa763f4f14
commit 9d453c9fae
3 changed files with 171 additions and 41 deletions

66
core/data_translate.py Normal file
View File

@ -0,0 +1,66 @@
import os
import json
import json_repair
import re
import fitz
import pandas as pd
from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
from utils.logger import logger
class Translate_PDF:
    """Translate the text of a PDF, page by page, and save it as a text file.

    The page texts come from ``PDFUtil.extract_text`` and each non-empty page
    is sent to ``chat`` with a translation prompt. The translated pages are
    concatenated (with a separator line per page) and written to
    ``<output_folder>/<pdf base name>_translated.txt``.
    """

    def __init__(self, pdf_file: str, output_folder: str, target_language: str = "English") -> None:
        """Validate the input file and prepare the output folder.

        Args:
            pdf_file: Path of the PDF to translate; it must exist.
            output_folder: Folder for the translated text file. It is created
                when missing. A falsy value (e.g. ``""``) means no output
                file is written by ``start_job``.
            target_language: Language name inserted into the prompt.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        self.pdf_file = pdf_file
        if not os.path.exists(self.pdf_file):
            raise FileNotFoundError(f"File {self.pdf_file} not found")
        # start_job() skips writing when output_folder is falsy, so do not
        # try to create a folder for an empty path (os.makedirs("") raises).
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)
        self.output_folder = output_folder
        self.target_language = target_language

    def start_job(self):
        """Translate every page and write the combined result to disk.

        Raises:
            Exception: Any failure is logged and re-raised wrapped in a plain
                ``Exception`` whose cause is the original error.
        """
        try:
            page_text_dict = self.get_pdf_page_text_dict()
            parts = []
            for page_num, page_text in page_text_dict.items():
                logger.info(f"Translate from page {page_num}")
                parts.append(f"----------------- Page {page_num} -----------------\n")
                if page_text.strip() == "":
                    # Keep the page separator but spend no chat call on a blank page.
                    parts.append("\n\n")
                    continue
                parts.append(self.translate_text(page_text) + "\n")
            total_text = "".join(parts)
            if self.output_folder:
                # splitext drops only the real extension (any case) instead of
                # replacing every ".pdf" substring in the file name.
                base_name = os.path.splitext(os.path.basename(self.pdf_file))[0]
                output_file = os.path.join(self.output_folder, f"{base_name}_translated.txt")
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(total_text)
        except Exception as e:
            logger.error(f"Error: {e}")
            raise Exception(e) from e

    def get_pdf_page_text_dict(self) -> dict:
        """Return the third item of ``PDFUtil.extract_text()`` for the PDF.

        ``start_job`` iterates the result with ``.items()`` as
        (page number, page text) pairs.
        """
        pdf_util = PDFUtil(self.pdf_file)
        # NOTE(review): the success flag and the full text are ignored here;
        # confirm what page_text_dict looks like when extraction fails.
        _success, _text, page_text_dict = pdf_util.extract_text()
        return page_text_dict

    def translate_text(self, text: str):
        """Ask ``chat`` to translate ``text`` into the target language.

        Args:
            text: The text to translate.

        Returns:
            The ``"translated_text"`` value of the JSON reply, or ``""`` when
            the reply cannot be parsed into a JSON object or lacks that key.
        """
        instructions = f"Context: \n{text}\n\nInstructions: Translate the context in {self.target_language}. \n"
        instructions += "Please output the translated text in the following JSON format: {\"translated_text\": \"translated text\"} \n\n"
        instructions += "Answer: \n"
        # The second return value (error indicator) is not inspected; a bad
        # reply simply fails to parse below and yields "".
        response, _with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        try:
            data = json.loads(response)
        except Exception:
            # Best effort: try to repair malformed JSON before giving up.
            try:
                data = json_repair.loads(response)
            except Exception:
                data = {}
        if not isinstance(data, dict):
            # A list or bare string is valid JSON but has no .get().
            return ""
        return data.get("translated_text", "")

126
main.py
View File

@ -624,7 +624,16 @@ def test_mapping_raw_name():
print(mapping_info) print(mapping_info)
def test_translate_pdf():
    """Manually run the PDF translation job on one sample document."""
    # Imported here so the translation module is only loaded when this
    # manual test is actually invoked.
    from core.data_translate import Translate_PDF

    source_pdf = r"/data/emea_ar/pdf/451063582.pdf"
    destination_dir = r"/data/translate/output/"
    job = Translate_PDF(pdf_file=source_pdf, output_folder=destination_dir)
    job.start_job()
if __name__ == "__main__": if __name__ == "__main__":
# test_translate_pdf()
pdf_folder = r"/data/emea_ar/pdf/" pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -821,45 +830,98 @@ if __name__ == "__main__":
# 543243585, # 543243585,
# 543243654 # 543243654
# """ # """
# check_db_mapping_doc_id_list = [
# "511052670",
# "520733219",
# "524306810",
# "526747539",
# "528783089",
# "532422720",
# "532438210",
# "534112077",
# "534538571",
# "534538682",
# "535798742",
# "536299372",
# "539566148",
# "539604165",
# "540056900",
# "541343431",
# "541669780",
# "541669996",
# "541670397",
# "541923319",
# "542335994",
# "543243585",
# "543243654",
# "543244170",
# "543519140",
# "543519615",
# "543628379",
# "543809340",
# "543944737"
# ]
# documents in Final list of EMEA documents.xlsx as typical documents
check_db_mapping_doc_id_list = [ check_db_mapping_doc_id_list = [
"511052670", "532500349",
"520733219", "535324239",
"524306810", "532442891",
"526747539", "543243650",
"528783089", "528588598",
"532422720", "532437639",
"532438210", "527525440",
"534112077", "534987291",
"534538571", "534112055",
"534538682", "533482585",
"535798742", "544208174",
"536299372", "534547266",
"539566148", "544713166",
"539604165", "526463547",
"540056900", "534535569",
"541343431", "534106067",
"541669780", "532486560",
"541669996", "532781760",
"541670397", "533727067",
"541923319", "527256381",
"542335994", "533392425",
"543243585", "532179676",
"543243654", "534300608",
"543244170", "539233950",
"543519140", "533727908",
"543519615", "532438414",
"543628379", "533681744",
"543809340", "537654645",
"543944737" "533594905",
"537926443",
"533499655",
"533862814",
"544918611",
"539087870",
"536343790"
] ]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["423418540"] # special_doc_id_list = ["425595958",
# "451063582",
# "451878128",
# "466580448",
# "481482392",
# "492029971",
# "508704368",
# "510300817",
# "512745032",
# "514213638",
# "527525440",
# "532422548",
# "532998065",
# "534535767",
# "536344026",
# "540307575"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = True
calculate_metrics = False calculate_metrics = False
extract_ways = ["text"] extract_ways = ["text"]

View File

@ -997,12 +997,12 @@ if __name__ == "__main__":
r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx" r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
) )
doc_provider_file_path = r"/data/emea_ar/basic_information/English/emea_sample_doc_from_Kshitij.xlsx" doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
pdf_folder = r"/data/emea_ar/pdf/" pdf_folder = r"/data/emea_ar/pdf/"
download_pdf( download_pdf(
doc_provider_file_path=doc_provider_file_path, doc_provider_file_path=doc_provider_file_path,
sheet_name="Sample EMEA Docs", sheet_name="Sheet1",
doc_id_column="Document ID", doc_id_column="Document Id",
pdf_path=pdf_folder) pdf_path=pdf_folder)
# output_pdf_page_text(pdf_folder, output_folder) # output_pdf_page_text(pdf_folder, output_folder)
@ -1014,12 +1014,14 @@ if __name__ == "__main__":
# 'latest_ar_document_most_mapping', # 'latest_ar_document_most_mapping',
# pdf_folder) # pdf_folder)
output_data_folder = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/"
doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/doc_ar_data_for_final_list_emea_documents.xlsx"
output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/"
# statistics_document(pdf_folder=pdf_folder, # statistics_document(pdf_folder=pdf_folder,
# doc_mapping_file_path=latest_top_100_provider_ar_data_file, # doc_mapping_file_path=doc_mapping_file_path,
# sheet_name="latest_doc_ar_data", # sheet_name="doc_ar_data_in_db",
# output_folder=output_data_folder, # output_folder=output_data_folder,
# output_file="latest_doc_ar_mapping_statistics.xlsx") # output_file="doc_ar_data_statistics.xlsx")
# get_document_extracted_share_diff_by_db() # get_document_extracted_share_diff_by_db()
# statistics_provider_mapping( # statistics_provider_mapping(
# provider_mapping_data_file=provider_mapping_data_file, # provider_mapping_data_file=provider_mapping_data_file,