dc-ml-emea-ar/main.py

66 lines
2.4 KiB
Python
Raw Normal View History

import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
import time
from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
class EMEA_AR_Parsing:
    """Prepare one EMEA AR document and compute its data-point page info.

    All work happens in the constructor, in this order: the PDF folder is
    created if needed, the PDF is fetched through
    ``download_pdf_from_documents_warehouse``, the document/fund mapping is
    queried with ``query_document_fund_mapping``, and ``FilterPages`` is run.
    The results are exposed as instance attributes.

    Attributes:
        doc_id: Identifier of the document being processed.
        pdf_folder: Folder handed to the download helper.
        pdf_file: Value returned by the download helper
            (presumably the local PDF path -- confirm against the helper).
        document_mapping_info_df: Result of ``query_document_fund_mapping``.
        datapoint_page_info: Result of ``FilterPages(...).start_job()``.
    """

    def __init__(self, doc_id: str, pdf_folder: str = r"/data/emea_ar/pdf/") -> None:
        """Run the whole pipeline for ``doc_id``, using ``pdf_folder`` for the PDF."""
        self.doc_id, self.pdf_folder = doc_id, pdf_folder
        # The folder must exist before the download helper is invoked.
        os.makedirs(pdf_folder, exist_ok=True)
        self.pdf_file = self.download_pdf()
        self.document_mapping_info_df = query_document_fund_mapping(doc_id)
        # Relies on pdf_file and document_mapping_info_df being set above.
        self.datapoint_page_info = self.get_datapoint_page_info()

    def download_pdf(self) -> str:
        """Fetch this document's PDF via the documents-warehouse helper."""
        return download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)

    def get_datapoint_page_info(self) -> dict:
        """Run ``FilterPages`` for this document and return its job result."""
        page_filter = FilterPages(
            self.doc_id,
            self.pdf_file,
            self.document_mapping_info_df,
        )
        return page_filter.start_job()
def filter_pages(doc_id: str, pdf_folder: str) -> dict:
    """Compute the data-point page info for a single document.

    Builds an ``EMEA_AR_Parsing`` instance, whose constructor performs the
    download, mapping query and page filtering, and returns the result.

    Args:
        doc_id: Identifier of the document to process.
        pdf_folder: Folder passed to ``EMEA_AR_Parsing`` for the PDF file.

    Returns:
        The ``datapoint_page_info`` attribute of the parsing instance.
    """
    logger.info(f"Parsing EMEA AR for doc_id: {doc_id}")
    emea_ar_parsing = EMEA_AR_Parsing(doc_id, pdf_folder)
    return emea_ar_parsing.datapoint_page_info
def batch_filter_pdf_files(pdf_folder: str, output_folder: str) -> None:
    """Run page filtering for every PDF in a folder and save the results to Excel.

    Every ``*.pdf`` file directly inside ``pdf_folder`` is treated as one
    document whose id is the part of the file name before the first dot.
    ``filter_pages`` is called for each document and the collected results
    are written, one row per document, to a timestamped ``.xlsx`` file in
    ``output_folder``.

    Args:
        pdf_folder: Folder to scan for PDF files; a trailing path separator
            is optional.
        output_folder: Folder for the Excel report; created if it is missing.
    """
    # Join the pattern with os.path.join so the folder works with or without
    # a trailing separator; sort because glob returns files in arbitrary
    # order and the report rows should be reproducible between runs.
    pdf_files = sorted(glob(os.path.join(pdf_folder, "*.pdf")))

    result_list = []
    for pdf_file in tqdm(pdf_files):
        # The document id is the file name up to its first dot.
        doc_id = os.path.basename(pdf_file).split(".")[0]
        result_list.append(filter_pages(doc_id=doc_id, pdf_folder=pdf_folder))

    # A freshly built DataFrame already has a 0..n-1 index and the index is
    # not written to the file, so no reset_index is needed.
    result_df = pd.DataFrame(result_list)

    logger.info(f"Saving the result to {output_folder}")
    os.makedirs(output_folder, exist_ok=True)
    time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    output_file = os.path.join(
        output_folder,
        f"datapoint_page_info_{len(result_df)}_documents_{time_stamp}.xlsx",
    )
    with pd.ExcelWriter(output_file) as writer:
        result_df.to_excel(writer, index=False)
if __name__ == "__main__":
    # Script entry point: filter every PDF in the sample folder and write the
    # Excel report to the filter_pages output folder.
    batch_filter_pdf_files(
        pdf_folder=r"/data/emea_ar/small_pdf/",
        output_folder=r"/data/emea_ar/output/filter_pages/",
    )