support more configurable parts

This commit is contained in:
Blade He 2025-01-16 13:54:45 -06:00
parent db0827435b
commit 2eace81f51
5 changed files with 26 additions and 11 deletions

View File

@ -60,6 +60,7 @@ def us_ar_data_extract():
try:
emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
doc_source="emea_ar",
pdf_folder=pdf_folder,
output_extract_data_folder=output_extract_data_folder,
output_mapping_data_folder=output_mapping_data_folder,

View File

@ -1,3 +1,4 @@
{
"apply_pdf2html": true
"apply_pdf2html": true,
"apply_drilldown": false
}

View File

@ -1,3 +1,4 @@
{
"apply_pdf2html": false
"apply_pdf2html": false,
"apply_drilldown": true
}

16
main.py
View File

@ -100,6 +100,13 @@ class EMEA_AR_Parsing:
drilldown_folder = r"/data/emea_ar/output/drilldown/"
os.makedirs(drilldown_folder, exist_ok=True)
self.drilldown_folder = drilldown_folder
misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json")
if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as f:
misc_config = json.load(f)
self.apply_drilldown = misc_config.get("apply_drilldown", False)
else:
self.apply_drilldown = False
def download_pdf(self) -> str:
pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
@ -155,10 +162,11 @@ class EMEA_AR_Parsing:
# Drilldown data to relevant PDF document
annotation_list = []
# try:
# annotation_list = self.drilldown_pdf_document(data_from_gpt)
# except Exception as e:
# logger.error(f"Error: {e}")
if self.apply_drilldown:
try:
annotation_list = self.drilldown_pdf_document(data_from_gpt)
except Exception as e:
logger.error(f"Error: {e}")
return data_from_gpt, annotation_list
def drilldown_pdf_document(self, data_from_gpt: list) -> list:

View File

@ -1,6 +1,7 @@
import boto3
import time
import os
import platform
from utils.logger import logger
import dotenv
# loads .env file with your OPENAI_API_KEY
@ -30,15 +31,18 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf")
os_name = platform.system().lower()
if os.path.exists(pdf_file_path):
logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...")
return pdf_file_path
else:
if os_name == "windows":
ACCESS_KEY = os.getenv('ACCESS_KEY')
SECRET_KEY = os.getenv('SECRET_KEY')
session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.client('s3')
else:
s3 = boto3.client('s3')
# s3 = boto3.client('s3')
bucket_name = os.getenv('BUCKET_NAME')