2024-08-23 21:38:11 +00:00
|
|
|
import os
|
|
|
|
|
import json
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from glob import glob
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
import time
|
|
|
|
|
from utils.logger import logger
|
|
|
|
|
from utils.pdf_download import download_pdf_from_documents_warehouse
|
|
|
|
|
from utils.sql_query_util import query_document_fund_mapping
|
|
|
|
|
from core.page_filter import FilterPages
|
2024-09-03 22:07:53 +00:00
|
|
|
from core.metrics import Metrics
|
2024-08-23 21:38:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class EMEA_AR_Parsing:
    """Fetch one document's PDF and run page filtering on it.

    All work happens eagerly in ``__init__``: the PDF folder is created if
    needed, the PDF is downloaded, the document/fund mapping is queried, and
    the page filter is run. The results are then available as attributes.

    Attributes:
        doc_id: Identifier of the document being processed.
        pdf_folder: Folder the PDF is downloaded into.
        pdf_file: Value returned by ``download_pdf_from_documents_warehouse``
            (presumably the local PDF path -- confirm against that helper).
        document_mapping_info_df: Result of ``query_document_fund_mapping``.
        datapoint_page_info: First element returned by
            ``FilterPages.start_job()``.
        result_details: Second element returned by
            ``FilterPages.start_job()``.
    """

    def __init__(self, doc_id: str, pdf_folder: str = r"/data/emea_ar/pdf/") -> None:
        """Download the document and compute its datapoint page info.

        Args:
            doc_id: Identifier of the document to process.
            pdf_folder: Folder to download the PDF into; created if missing.
        """
        self.doc_id = doc_id
        self.pdf_folder = pdf_folder
        os.makedirs(self.pdf_folder, exist_ok=True)
        # Order matters: the page filter needs both the PDF and the mapping.
        self.pdf_file = self.download_pdf()
        self.document_mapping_info_df = query_document_fund_mapping(doc_id)
        self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()

    def download_pdf(self) -> str:
        """Download this document's PDF into ``self.pdf_folder``.

        Returns:
            Whatever ``download_pdf_from_documents_warehouse`` returns.
        """
        return download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)

    def get_datapoint_page_info(self) -> tuple:
        """Run ``FilterPages`` for this document.

        Returns:
            The ``(datapoint_page_info, result_details)`` pair produced by
            ``FilterPages.start_job()``.
        """
        # Named ``page_filter`` so it does not shadow the module-level
        # ``filter_pages`` function.
        page_filter = FilterPages(
            self.doc_id, self.pdf_file, self.document_mapping_info_df
        )
        datapoint_page_info, result_details = page_filter.start_job()
        return datapoint_page_info, result_details
|
2024-08-23 21:38:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def filter_pages(doc_id: str, pdf_folder: str) -> tuple:
    """Run page filtering for a single document.

    Args:
        doc_id: Identifier of the document to process.
        pdf_folder: Folder the document's PDF is downloaded into.

    Returns:
        The ``(datapoint_page_info, result_details)`` pair held by the
        ``EMEA_AR_Parsing`` instance built for ``doc_id``.
    """
    logger.info(f"Parsing EMEA AR for doc_id: {doc_id}")
    # Constructing the parser performs the download and the filtering.
    emea_ar_parsing = EMEA_AR_Parsing(doc_id, pdf_folder)
    return emea_ar_parsing.datapoint_page_info, emea_ar_parsing.result_details
|
2024-08-23 21:38:11 +00:00
|
|
|
|
|
|
|
|
|
2024-09-03 22:07:53 +00:00
|
|
|
def batch_filter_pdf_files(
    pdf_folder: str,
    doc_data_excel_file: str = None,
    output_folder: str = r"/data/emea_ar/output/filter_pages/",
    special_doc_id_list: list = None,
) -> "tuple | None":
    """Run page filtering over the PDFs in a folder and save the results.

    The documents to process are selected as follows:

    * if ``special_doc_id_list`` is non-empty, only those doc ids are used;
    * otherwise, if ``doc_data_excel_file`` exists, the ``doc_id`` values of
      its rows whose ``Checked`` column equals 1 are used (as strings).

    Each selected PDF found in ``pdf_folder`` is passed to ``filter_pages``.
    The collected results are written to a timestamped workbook in
    ``output_folder`` with the sheets ``dp_page_info`` and
    ``dp_page_info_details``.

    Args:
        pdf_folder: Folder that is scanned for ``*.pdf`` files. A doc id is
            the file name up to its first ``.``.
        doc_data_excel_file: Excel file with ``doc_id`` and ``Checked``
            columns. It is also passed to ``get_metrics`` as the ground
            truth file.
        output_folder: Folder the result workbook is written to; created if
            missing.
        special_doc_id_list: Explicit doc ids to process. ``None`` and an
            empty list both mean "no explicit list".

    Returns:
        ``(missing_error_list, metrics_list, metrics_file)`` from
        ``get_metrics`` when no explicit doc id list was given, else ``None``.
    """
    # os.path.join also works when pdf_folder has no trailing separator;
    # plain string concatenation would then build a pattern matching nothing.
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))

    doc_list = []
    if special_doc_id_list:
        doc_list = special_doc_id_list
    if (
        not doc_list
        and doc_data_excel_file
        and os.path.exists(doc_data_excel_file)
    ):
        doc_data_df = pd.read_excel(doc_data_excel_file)
        doc_data_df = doc_data_df[doc_data_df["Checked"] == 1]
        doc_list = [str(doc_id) for doc_id in doc_data_df["doc_id"].tolist()]

    # NOTE(review): when neither source yields any doc id, every PDF is
    # skipped and an empty workbook is written. Confirm whether "no filter"
    # was meant to process all PDFs instead.
    selected_doc_ids = set(doc_list)

    result_list = []
    result_details = []
    for pdf_file in tqdm(pdf_files):
        pdf_base_name = os.path.basename(pdf_file)
        doc_id = pdf_base_name.split(".")[0]
        if doc_id not in selected_doc_ids:
            continue
        doc_datapoint_page_info, doc_result_details = filter_pages(
            doc_id=doc_id, pdf_folder=pdf_folder
        )
        result_list.append(doc_datapoint_page_info)
        result_details.extend(doc_result_details)

    result_df = pd.DataFrame(result_list)
    result_df.reset_index(drop=True, inplace=True)

    result_details_df = pd.DataFrame(result_details)
    result_details_df.reset_index(drop=True, inplace=True)

    logger.info(f"Saving the result to {output_folder}")
    os.makedirs(output_folder, exist_ok=True)
    time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    output_file = os.path.join(
        output_folder,
        f"datapoint_page_info_{len(result_df)}_documents_{time_stamp}.xlsx",
    )
    with pd.ExcelWriter(output_file) as writer:
        result_df.to_excel(writer, index=False, sheet_name="dp_page_info")
        result_details_df.to_excel(
            writer, index=False, sheet_name="dp_page_info_details"
        )

    # Metrics are only computed for runs without an explicit doc id list.
    # The truthiness test also covers the default ``None``, on which
    # ``len()`` would raise TypeError.
    if special_doc_id_list:
        return None

    logger.info(f"Calculating metrics for {output_file}")
    metrics_output_folder = r"/data/emea_ar/output/metrics/"
    missing_error_list, metrics_list, metrics_file = get_metrics(
        data_type="page_filter",
        prediction_file=output_file,
        prediction_sheet_name="dp_page_info",
        ground_truth_file=doc_data_excel_file,
        output_folder=metrics_output_folder,
    )
    return missing_error_list, metrics_list, metrics_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_metrics(
    data_type: str,
    prediction_file: str,
    prediction_sheet_name: str,
    ground_truth_file: str,
    output_folder: str = None,
) -> tuple:
    """Compute metrics for a prediction workbook against a ground truth file.

    A thin wrapper that builds a ``Metrics`` object from the arguments and
    returns the result of its ``get_metrics()`` call.

    Args:
        data_type: Passed to ``Metrics`` as ``data_type`` (this module uses
            ``"page_filter"``).
        prediction_file: Path of the workbook holding the predictions.
        prediction_sheet_name: Sheet of ``prediction_file`` to evaluate.
        ground_truth_file: Path of the ground truth file.
        output_folder: Passed to ``Metrics`` as ``output_folder``.

    Returns:
        ``(missing_error_list, metrics_list, metrics_file)`` as returned by
        ``Metrics.get_metrics()``.
    """
    metrics = Metrics(
        data_type=data_type,
        prediction_file=prediction_file,
        prediction_sheet_name=prediction_sheet_name,
        ground_truth_file=ground_truth_file,
        output_folder=output_folder,
    )
    missing_error_list, metrics_list, metrics_file = metrics.get_metrics()
    return missing_error_list, metrics_list, metrics_file
|
2024-08-23 21:38:11 +00:00
|
|
|
|
|
|
|
|
|
2024-08-19 20:49:45 +00:00
|
|
|
if __name__ == "__main__":
    # Locations used by the batch run below.
    pdf_folder = r"/data/emea_ar/small_pdf/"
    page_filter_ground_truth_file = r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
    prediction_output_folder = r"/data/emea_ar/output/filter_pages/"
    metrics_output_folder = r"/data/emea_ar/output/metrics/"

    # An empty list means the documents are taken from the ground truth file.
    special_doc_id_list = []

    batch_filter_pdf_files(
        pdf_folder=pdf_folder,
        doc_data_excel_file=page_filter_ground_truth_file,
        output_folder=prediction_output_folder,
        special_doc_id_list=special_doc_id_list,
    )

    # Stand-alone metrics run against an existing prediction workbook.
    # NOTE(review): this call passes no prediction_sheet_name, so its
    # positional arguments no longer line up with get_metrics; update it
    # before re-enabling.
    # data_type = "page_filter"
    # prediction_file = r"/data/emea_ar/output/filter_pages/datapoint_page_info_73_documents_20240903145002.xlsx"
    # missing_error_list, metrics_list, metrics_file = get_metrics(
    #     data_type, prediction_file, page_filter_ground_truth_file, metrics_output_folder
    # )
|