dc-ml-emea-ar/utils/pdf_download.py

57 lines
2.1 KiB
Python
Raw Normal View History

2024-08-19 14:52:13 +00:00
import boto3
import time
import os
from utils.logger import logger
import dotenv
# Load the .env file so that ACCESS_KEY, SECRET_KEY and BUCKET_NAME are available via os.getenv
dotenv.load_dotenv()
def try_ntimes(func, params, success_msg='', error_msg='', error_res=False, ntimes=1, interval=1):
    """Call ``func(**params)``, retrying on any exception up to ``ntimes`` attempts.

    Args:
        func: Callable to invoke; it receives ``params`` as keyword arguments.
        params: Mapping of keyword arguments passed to ``func``.
        success_msg: Text printed after a successful call.
        error_msg: Text printed once the final attempt has failed.
        error_res: Value returned when every attempt has failed.
        ntimes: Maximum number of attempts. Values below 1 are treated as a
            single attempt.
        interval: Seconds to sleep between two consecutive attempts.

    Returns:
        The return value of ``func`` on success, otherwise ``error_res``.
    """
    count = 1
    while True:
        try:
            # Keep the guarded region to the call itself.
            res = func(**params)
        except Exception as e:
            # Use ``>=`` instead of ``==``: with ntimes <= 0 the counter starts
            # above the limit and an equality check would never stop retrying.
            if count >= ntimes:
                print(error_msg)
                return error_res
            print(f'Please Set AWS environment variables at first, error: {e}')
            print("Having tried {} times and trying one more time...".format(count))
            time.sleep(interval)
            count += 1
        else:
            print(success_msg)
            return res
def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
    """Fetch ``<doc_id>.pdf`` into ``pdf_directory``, downloading from S3 if absent.

    The S3 object key is ``doc_id`` itself. The bucket name is read from the
    ``BUCKET_NAME`` environment variable and the credentials from
    ``ACCESS_KEY`` / ``SECRET_KEY``.

    Args:
        pdf_directory: Local directory for the PDF; created if it does not exist.
        doc_id: Document identifier, used both as the S3 key and as the local
            file stem.

    Returns:
        The local path of the PDF when it already exists or was downloaded
        successfully; ``None`` when ``pdf_directory`` is missing/empty or when
        the download failed after all retries.
    """
    if not pdf_directory:
        logger.error("pdf_directory is not provided")
        return None

    os.makedirs(pdf_directory, exist_ok=True)
    pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf")

    # A previously downloaded file is reused rather than fetched again.
    if os.path.exists(pdf_file_path):
        logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...")
        return pdf_file_path

    access_key = os.getenv('ACCESS_KEY')
    secret_key = os.getenv('SECRET_KEY')
    session = boto3.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key)
    s3 = session.client('s3')
    bucket_name = os.getenv('BUCKET_NAME')

    params = {'Bucket': bucket_name, 'Key': doc_id, 'Filename': pdf_file_path}
    error_res = '__process_failed__'
    res = try_ntimes(func=s3.download_file, params=params,
                     success_msg=f'file downloaded from S3 successfully: {doc_id}',
                     error_msg=f'failed to download file {doc_id} from S3',
                     error_res=error_res,
                     ntimes=3, interval=5)

    # try_ntimes hands back the sentinel only when every attempt raised; in that
    # case no file was written, so do not return a path to a non-existent file.
    if res == error_res:
        logger.error(f"failed to download file {doc_id} from S3")
        return None
    return pdf_file_path