66 lines
2.5 KiB
Python
66 lines
2.5 KiB
Python
import boto3
|
|
import time
|
|
import os
|
|
import platform
|
|
from utils.logger import logger
|
|
import dotenv
|
|
import certifi
|
|
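# Load variables from the .env file into the environment; this module reads
# ACCESS_KEY, SECRET_KEY, AWS_SESSION_TOKEN and BUCKET_NAME from it.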
|
|
dotenv.load_dotenv()
|
|
|
|
def try_ntimes(func, params, success_msg='', error_msg='', error_res=False, ntimes=1, interval=1):
    """Call ``func(**params)``, retrying on failure for at most ``ntimes`` attempts.

    Args:
        func: Callable to invoke.
        params: Mapping of keyword arguments, passed as ``func(**params)``.
        success_msg: Text printed after a successful call.
        error_msg: Text printed once the final attempt has failed.
        error_res: Value returned once the final attempt has failed.
        ntimes: Maximum number of attempts. Values below 1 behave like 1
            (the call is attempted once and not retried).
        interval: Seconds to sleep between two attempts.

    Returns:
        The return value of ``func`` from the first successful attempt, or
        ``error_res`` if every attempt raised an exception.
    """
    count = 1
    while True:
        try:
            res = func(**params)
        except Exception as e:
            # `>=` instead of `==`: with ntimes <= 0 an equality check never
            # matches (count starts at 1), which would retry forever.
            if count >= ntimes:
                print(error_msg)
                return error_res
            print(f'Please Set AWS environment variables at first, error: {e}')
            print("Having tried {} times and trying one more time...".format(count))
            time.sleep(interval)
            count += 1
        else:
            # Success path kept outside the try so only the call is guarded.
            print(success_msg)
            return res
|
|
|
|
def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
    """Download the S3 object keyed by ``doc_id`` to ``<pdf_directory>/<doc_id>.pdf``.

    The bucket name is read from the ``BUCKET_NAME`` environment variable.
    If the target file already exists locally, nothing is downloaded.

    Args:
        pdf_directory: Local directory for the file; created if missing.
        doc_id: S3 object key, also used as the local file's base name.

    Returns:
        The local path of the PDF on success (or when it already existed),
        or ``None`` when ``pdf_directory`` is empty, ``BUCKET_NAME`` is not
        set, or every download attempt failed.
    """
    if not pdf_directory:
        logger.error("pdf_directory is not provided")
        return None
    os.makedirs(pdf_directory, exist_ok=True)

    pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf")
    if os.path.exists(pdf_file_path):
        logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...")
        return pdf_file_path

    bucket_name = os.getenv('BUCKET_NAME')
    if not bucket_name:
        # Without a bucket every attempt is bound to fail; bail out before
        # spending time on retries and sleeps.
        logger.error("BUCKET_NAME environment variable is not set")
        return None

    if platform.system().lower() == "windows":
        # On Windows the credentials come from explicit environment variables
        # and TLS is verified against certifi's CA bundle.
        s3 = boto3.client(
            "s3",
            region_name="us-east-1",
            verify=certifi.where(),
            aws_access_key_id=os.getenv('ACCESS_KEY'),
            aws_secret_access_key=os.getenv('SECRET_KEY'),
            aws_session_token=os.getenv('AWS_SESSION_TOKEN'),
        )
    else:
        # Elsewhere boto3 resolves credentials and region on its own.
        s3 = boto3.client('s3')

    params = {'Bucket': bucket_name, 'Key': doc_id, 'Filename': pdf_file_path}
    success_msg = f'file downloaded from S3 successfully: {doc_id}'
    error_msg = f'failed to download file {doc_id} from S3'
    # Sentinel returned by try_ntimes once all attempts have failed.
    error_res = '__process_failed__'

    res = try_ntimes(func=s3.download_file, params=params,
                     success_msg=success_msg,
                     error_msg=error_msg, error_res=error_res,
                     ntimes=3, interval=5)

    if res == error_res:
        # Do not hand back a path to a file that was never written.
        logger.error(error_msg)
        return None

    return pdf_file_path
|