support more configurable parts
This commit is contained in:
parent
db0827435b
commit
2eace81f51
|
|
@ -60,6 +60,7 @@ def us_ar_data_extract():
|
||||||
|
|
||||||
try:
|
try:
|
||||||
emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
|
emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
|
||||||
|
doc_source="emea_ar",
|
||||||
pdf_folder=pdf_folder,
|
pdf_folder=pdf_folder,
|
||||||
output_extract_data_folder=output_extract_data_folder,
|
output_extract_data_folder=output_extract_data_folder,
|
||||||
output_mapping_data_folder=output_mapping_data_folder,
|
output_mapping_data_folder=output_mapping_data_folder,
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
{
|
{
|
||||||
"apply_pdf2html": true
|
"apply_pdf2html": true,
|
||||||
|
"apply_drilldown": false
|
||||||
}
|
}
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
{
|
{
|
||||||
"apply_pdf2html": false
|
"apply_pdf2html": false,
|
||||||
|
"apply_drilldown": true
|
||||||
}
|
}
|
||||||
16
main.py
16
main.py
|
|
@ -100,6 +100,13 @@ class EMEA_AR_Parsing:
|
||||||
drilldown_folder = r"/data/emea_ar/output/drilldown/"
|
drilldown_folder = r"/data/emea_ar/output/drilldown/"
|
||||||
os.makedirs(drilldown_folder, exist_ok=True)
|
os.makedirs(drilldown_folder, exist_ok=True)
|
||||||
self.drilldown_folder = drilldown_folder
|
self.drilldown_folder = drilldown_folder
|
||||||
|
misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json")
|
||||||
|
if os.path.exists(misc_config_file):
|
||||||
|
with open(misc_config_file, "r", encoding="utf-8") as f:
|
||||||
|
misc_config = json.load(f)
|
||||||
|
self.apply_drilldown = misc_config.get("apply_drilldown", False)
|
||||||
|
else:
|
||||||
|
self.apply_drilldown = False
|
||||||
|
|
||||||
def download_pdf(self) -> str:
|
def download_pdf(self) -> str:
|
||||||
pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
|
pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
|
||||||
|
|
@ -155,10 +162,11 @@ class EMEA_AR_Parsing:
|
||||||
|
|
||||||
# Drilldown data to relevant PDF document
|
# Drilldown data to relevant PDF document
|
||||||
annotation_list = []
|
annotation_list = []
|
||||||
# try:
|
if self.apply_drilldown:
|
||||||
# annotation_list = self.drilldown_pdf_document(data_from_gpt)
|
try:
|
||||||
# except Exception as e:
|
annotation_list = self.drilldown_pdf_document(data_from_gpt)
|
||||||
# logger.error(f"Error: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Error: {e}")
|
||||||
return data_from_gpt, annotation_list
|
return data_from_gpt, annotation_list
|
||||||
|
|
||||||
def drilldown_pdf_document(self, data_from_gpt: list) -> list:
|
def drilldown_pdf_document(self, data_from_gpt: list) -> list:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import boto3
|
import boto3
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
|
import platform
|
||||||
from utils.logger import logger
|
from utils.logger import logger
|
||||||
import dotenv
|
import dotenv
|
||||||
# loads .env file with your OPENAI_API_KEY
|
# loads .env file with your OPENAI_API_KEY
|
||||||
|
|
@ -30,15 +31,18 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
|
||||||
|
|
||||||
pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf")
|
pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf")
|
||||||
|
|
||||||
|
os_name = platform.system().lower()
|
||||||
if os.path.exists(pdf_file_path):
|
if os.path.exists(pdf_file_path):
|
||||||
logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...")
|
logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...")
|
||||||
return pdf_file_path
|
return pdf_file_path
|
||||||
else:
|
else:
|
||||||
ACCESS_KEY = os.getenv('ACCESS_KEY')
|
if os_name == "windows":
|
||||||
SECRET_KEY = os.getenv('SECRET_KEY')
|
ACCESS_KEY = os.getenv('ACCESS_KEY')
|
||||||
|
SECRET_KEY = os.getenv('SECRET_KEY')
|
||||||
session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
|
session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
|
||||||
s3 = session.client('s3')
|
s3 = session.client('s3')
|
||||||
|
else:
|
||||||
|
s3 = boto3.client('s3')
|
||||||
|
|
||||||
# s3 = boto3.client('s3')
|
# s3 = boto3.client('s3')
|
||||||
bucket_name = os.getenv('BUCKET_NAME')
|
bucket_name = os.getenv('BUCKET_NAME')
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue