From 2eace81f51402dd272c3b85f6cb5e9f464664736 Mon Sep 17 00:00:00 2001 From: Blade He Date: Thu, 16 Jan 2025 13:54:45 -0600 Subject: [PATCH] support more configurable parts --- app_emea_ar.py | 1 + configuration/aus_prospectus/misc_config.json | 3 ++- configuration/emea_ar/misc_config.json | 3 ++- main.py | 16 ++++++++++++---- utils/pdf_download.py | 14 +++++++++----- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/app_emea_ar.py b/app_emea_ar.py index 8460a3a..133561c 100644 --- a/app_emea_ar.py +++ b/app_emea_ar.py @@ -60,6 +60,7 @@ def us_ar_data_extract(): try: emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id, + doc_source="emea_ar", pdf_folder=pdf_folder, output_extract_data_folder=output_extract_data_folder, output_mapping_data_folder=output_mapping_data_folder, diff --git a/configuration/aus_prospectus/misc_config.json b/configuration/aus_prospectus/misc_config.json index 1cd6c97..b576cc0 100644 --- a/configuration/aus_prospectus/misc_config.json +++ b/configuration/aus_prospectus/misc_config.json @@ -1,3 +1,4 @@ { - "apply_pdf2html": true + "apply_pdf2html": true, + "apply_drilldown": false } \ No newline at end of file diff --git a/configuration/emea_ar/misc_config.json b/configuration/emea_ar/misc_config.json index 17d7885..3948e5e 100644 --- a/configuration/emea_ar/misc_config.json +++ b/configuration/emea_ar/misc_config.json @@ -1,3 +1,4 @@ { - "apply_pdf2html": false + "apply_pdf2html": false, + "apply_drilldown": true } \ No newline at end of file diff --git a/main.py b/main.py index f861a92..e4d79bd 100644 --- a/main.py +++ b/main.py @@ -100,6 +100,13 @@ class EMEA_AR_Parsing: drilldown_folder = r"/data/emea_ar/output/drilldown/" os.makedirs(drilldown_folder, exist_ok=True) self.drilldown_folder = drilldown_folder + misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json") + if os.path.exists(misc_config_file): + with open(misc_config_file, "r", encoding="utf-8") as f: + misc_config = json.load(f) + self.apply_drilldown = misc_config.get("apply_drilldown", False) + else: + self.apply_drilldown = False def download_pdf(self) -> str: pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id) @@ -155,10 +162,11 @@ class EMEA_AR_Parsing: # Drilldown data to relevant PDF document annotation_list = [] - # try: - # annotation_list = self.drilldown_pdf_document(data_from_gpt) - # except Exception as e: - # logger.error(f"Error: {e}") + if self.apply_drilldown: + try: + annotation_list = self.drilldown_pdf_document(data_from_gpt) + except Exception as e: + logger.error(f"Error: {e}") return data_from_gpt, annotation_list def drilldown_pdf_document(self, data_from_gpt: list) -> list: diff --git a/utils/pdf_download.py b/utils/pdf_download.py index b47027a..b5281a5 100644 --- a/utils/pdf_download.py +++ b/utils/pdf_download.py @@ -1,6 +1,7 @@ import boto3 import time import os +import platform from utils.logger import logger import dotenv # loads .env file with your OPENAI_API_KEY @@ -30,15 +31,18 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str): pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf") + os_name = platform.system().lower() if os.path.exists(pdf_file_path): logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...") return pdf_file_path else: - ACCESS_KEY = os.getenv('ACCESS_KEY') - SECRET_KEY = os.getenv('SECRET_KEY') - - session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY) - s3 = session.client('s3') + if os_name == "windows": + ACCESS_KEY = os.getenv('ACCESS_KEY') + SECRET_KEY = os.getenv('SECRET_KEY') + session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY) + s3 = session.client('s3') + else: + s3 = boto3.client('s3') # s3 = boto3.client('s3') bucket_name = os.getenv('BUCKET_NAME')