support more configurable parts
This commit is contained in:
parent
db0827435b
commit
2eace81f51
|
|
@ -60,6 +60,7 @@ def us_ar_data_extract():
|
|||
|
||||
try:
|
||||
emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
|
||||
doc_source="emea_ar",
|
||||
pdf_folder=pdf_folder,
|
||||
output_extract_data_folder=output_extract_data_folder,
|
||||
output_mapping_data_folder=output_mapping_data_folder,
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
{
|
||||
"apply_pdf2html": true
|
||||
"apply_pdf2html": true,
|
||||
"apply_drilldown": false
|
||||
}
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
{
|
||||
"apply_pdf2html": false
|
||||
"apply_pdf2html": false,
|
||||
"apply_drilldown": true
|
||||
}
|
||||
16
main.py
16
main.py
|
|
@ -100,6 +100,13 @@ class EMEA_AR_Parsing:
|
|||
drilldown_folder = r"/data/emea_ar/output/drilldown/"
|
||||
os.makedirs(drilldown_folder, exist_ok=True)
|
||||
self.drilldown_folder = drilldown_folder
|
||||
misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json")
|
||||
if os.path.exists(misc_config_file):
|
||||
with open(misc_config_file, "r", encoding="utf-8") as f:
|
||||
misc_config = json.load(f)
|
||||
self.apply_drilldown = misc_config.get("apply_drilldown", False)
|
||||
else:
|
||||
self.apply_drilldown = False
|
||||
|
||||
def download_pdf(self) -> str:
|
||||
pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
|
||||
|
|
@ -155,10 +162,11 @@ class EMEA_AR_Parsing:
|
|||
|
||||
# Drilldown data to relevant PDF document
|
||||
annotation_list = []
|
||||
# try:
|
||||
# annotation_list = self.drilldown_pdf_document(data_from_gpt)
|
||||
# except Exception as e:
|
||||
# logger.error(f"Error: {e}")
|
||||
if self.apply_drilldown:
|
||||
try:
|
||||
annotation_list = self.drilldown_pdf_document(data_from_gpt)
|
||||
except Exception as e:
|
||||
logger.error(f"Error: {e}")
|
||||
return data_from_gpt, annotation_list
|
||||
|
||||
def drilldown_pdf_document(self, data_from_gpt: list) -> list:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import boto3
|
||||
import time
|
||||
import os
|
||||
import platform
|
||||
from utils.logger import logger
|
||||
import dotenv
|
||||
# loads .env file with your OPENAI_API_KEY
|
||||
|
|
@ -30,15 +31,18 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
|
|||
|
||||
pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf")
|
||||
|
||||
os_name = platform.system().lower()
|
||||
if os.path.exists(pdf_file_path):
|
||||
logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...")
|
||||
return pdf_file_path
|
||||
else:
|
||||
if os_name == "windows":
|
||||
ACCESS_KEY = os.getenv('ACCESS_KEY')
|
||||
SECRET_KEY = os.getenv('SECRET_KEY')
|
||||
|
||||
session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
|
||||
s3 = session.client('s3')
|
||||
else:
|
||||
s3 = boto3.client('s3')
|
||||
|
||||
# s3 = boto3.client('s3')
|
||||
bucket_name = os.getenv('BUCKET_NAME')
|
||||
|
|
|
|||
Loading…
Reference in New Issue