import json
import os
import time
from glob import glob
from typing import Optional

import pandas as pd
from tqdm import tqdm

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.metrics import Metrics


class EMEA_AR_Parsing:
    """Run the page-filter step for a single document.

    Constructing an instance does all the work: it makes sure the PDF folder
    exists, obtains the PDF via ``download_pdf_from_documents_warehouse``,
    loads the document/fund mapping via ``query_document_fund_mapping`` and
    runs ``FilterPages.start_job``. The two values returned by that job are
    stored on ``datapoint_page_info`` and ``result_details``.
    """

    def __init__(self, doc_id: str, pdf_folder: str = r"/data/emea_ar/pdf/") -> None:
        """Prepare the document and immediately run the page filter.

        Args:
            doc_id: Identifier of the document to process.
            pdf_folder: Folder handed to the PDF download helper; it is
                created when missing.
        """
        self.doc_id = doc_id
        self.pdf_folder = pdf_folder
        os.makedirs(self.pdf_folder, exist_ok=True)
        self.pdf_file = self.download_pdf()
        self.document_mapping_info_df = query_document_fund_mapping(doc_id)
        self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()

    def download_pdf(self) -> str:
        """Return the result of the warehouse download helper for this document."""
        pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
        return pdf_file

    def get_datapoint_page_info(self) -> tuple:
        """Run ``FilterPages.start_job`` for this document.

        Returns:
            The ``(datapoint_page_info, result_details)`` pair produced by
            ``FilterPages.start_job``.
        """
        filter_pages = FilterPages(
            self.doc_id, self.pdf_file, self.document_mapping_info_df
        )
        datapoint_page_info, result_details = filter_pages.start_job()
        return datapoint_page_info, result_details


def filter_pages(doc_id: str, pdf_folder: str) -> tuple:
    """Run the page filter for one document.

    Args:
        doc_id: Identifier of the document to process.
        pdf_folder: Folder passed through to ``EMEA_AR_Parsing``.

    Returns:
        The ``(datapoint_page_info, result_details)`` pair stored on the
        ``EMEA_AR_Parsing`` instance.
    """
    logger.info(f"Parsing EMEA AR for doc_id: {doc_id}")
    emea_ar_parsing = EMEA_AR_Parsing(doc_id, pdf_folder)
    return emea_ar_parsing.datapoint_page_info, emea_ar_parsing.result_details


def batch_filter_pdf_files(
    pdf_folder: str,
    doc_data_excel_file: Optional[str] = None,
    output_folder: str = r"/data/emea_ar/output/filter_pages/",
    special_doc_id_list: Optional[list] = None,
    metrics_output_folder: str = r"/data/emea_ar/output/metrics/",
) -> Optional[tuple]:
    """Run the page filter over the selected PDFs in a folder and save the results.

    The document selection comes from ``special_doc_id_list`` when it is
    non-empty; otherwise from the rows of ``doc_data_excel_file`` whose
    ``Checked`` column equals 1 (their ``doc_id`` values, converted to ``str``).
    Results are written to a timestamped workbook in ``output_folder`` with
    the sheets ``dp_page_info`` and ``dp_page_info_details``.

    Args:
        pdf_folder: Folder scanned for ``*.pdf`` files; a file's document id
            is the part of its base name before the first ``.``.
        doc_data_excel_file: Optional workbook used both to select documents
            and as the ground-truth file for metrics.
        output_folder: Folder for the result workbook; created when missing.
        special_doc_id_list: Optional explicit list of document ids. When it
            is non-empty the metrics step is skipped.
        metrics_output_folder: Folder handed to ``get_metrics``.

    Returns:
        ``(missing_error_list, metrics_list, metrics_file)`` from
        ``get_metrics`` when no explicit id list was supplied, else ``None``.
    """
    # os.path.join keeps the pattern correct whether or not pdf_folder ends
    # with a path separator.
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))

    doc_list = list(special_doc_id_list) if special_doc_id_list else []
    if (
        not doc_list
        and doc_data_excel_file
        and os.path.exists(doc_data_excel_file)
    ):
        doc_data_df = pd.read_excel(doc_data_excel_file)
        doc_data_df = doc_data_df[doc_data_df["Checked"] == 1]
        doc_list = [str(doc_id) for doc_id in doc_data_df["doc_id"].tolist()]

    # Built once so the per-file membership test is O(1).
    selected_doc_ids = set(doc_list)

    result_list = []
    result_details = []
    for pdf_file in tqdm(pdf_files):
        pdf_base_name = os.path.basename(pdf_file)
        doc_id = pdf_base_name.split(".")[0]
        # NOTE(review): an empty selection skips every PDF, exactly as the
        # previous list-based check did -- confirm whether "no selection"
        # should instead mean "process all files".
        if doc_id not in selected_doc_ids:
            continue
        doc_datapoint_page_info, doc_result_details = filter_pages(
            doc_id=doc_id, pdf_folder=pdf_folder
        )
        result_list.append(doc_datapoint_page_info)
        result_details.extend(doc_result_details)

    result_df = pd.DataFrame(result_list)
    result_df.reset_index(drop=True, inplace=True)
    result_details_df = pd.DataFrame(result_details)
    result_details_df.reset_index(drop=True, inplace=True)

    logger.info(f"Saving the result to {output_folder}")
    os.makedirs(output_folder, exist_ok=True)
    time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    output_file = os.path.join(
        output_folder,
        f"datapoint_page_info_{len(result_df)}_documents_{time_stamp}.xlsx",
    )
    with pd.ExcelWriter(output_file) as writer:
        result_df.to_excel(writer, index=False, sheet_name="dp_page_info")
        result_details_df.to_excel(
            writer, index=False, sheet_name="dp_page_info_details"
        )

    # Truthiness covers both None (the default) and an empty list; the old
    # len() call raised TypeError when the argument was omitted.
    if special_doc_id_list:
        return None

    logger.info(f"Calculating metrics for {output_file}")
    return get_metrics(
        data_type="page_filter",
        prediction_file=output_file,
        prediction_sheet_name="dp_page_info",
        ground_truth_file=doc_data_excel_file,
        output_folder=metrics_output_folder,
    )


def get_metrics(
    data_type: str,
    prediction_file: str,
    prediction_sheet_name: str,
    ground_truth_file: str,
    output_folder: Optional[str] = None,
) -> tuple:
    """Build a ``Metrics`` object from the arguments and return its results.

    Args:
        data_type: Forwarded to ``Metrics`` (this file passes "page_filter").
        prediction_file: Workbook containing the predictions.
        prediction_sheet_name: Sheet of ``prediction_file`` to evaluate.
        ground_truth_file: Workbook containing the ground truth.
        output_folder: Forwarded to ``Metrics`` as its output folder.

    Returns:
        ``(missing_error_list, metrics_list, metrics_file)`` as returned by
        ``Metrics.get_metrics``.
    """
    metrics = Metrics(
        data_type=data_type,
        prediction_file=prediction_file,
        prediction_sheet_name=prediction_sheet_name,
        ground_truth_file=ground_truth_file,
        output_folder=output_folder,
    )
    missing_error_list, metrics_list, metrics_file = metrics.get_metrics()
    return missing_error_list, metrics_list, metrics_file


if __name__ == "__main__":
    pdf_folder = r"/data/emea_ar/small_pdf/"
    page_filter_ground_truth_file = (
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
    )
    prediction_output_folder = r"/data/emea_ar/output/filter_pages/"
    metrics_output_folder = r"/data/emea_ar/output/metrics/"
    special_doc_id_list = []
    batch_filter_pdf_files(
        pdf_folder,
        page_filter_ground_truth_file,
        prediction_output_folder,
        special_doc_id_list,
    )

    # To recompute metrics for an existing prediction workbook instead:
    # data_type = "page_filter"
    # prediction_file = r"/data/emea_ar/output/filter_pages/datapoint_page_info_73_documents_20240903145002.xlsx"
    # missing_error_list, metrics_list, metrics_file = get_metrics(
    #     data_type=data_type,
    #     prediction_file=prediction_file,
    #     prediction_sheet_name="dp_page_info",
    #     ground_truth_file=page_filter_ground_truth_file,
    #     output_folder=metrics_output_folder,
    # )