from tqdm import tqdm
from glob import glob
import ast
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import recall_score
from utils.logger import logger
from utils.pdf_util import PDFUtil


def drilldown_documents():
    pdf_folder = r"/data/emea_ar/pdf/"
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    extract_data_folder = r"/data/emea_ar/output/extract_data/docs/by_text/json/"
    extract_files = glob(extract_data_folder + "*.json")
    for json_file in tqdm(extract_files):
        # Derive the doc_id from the JSON file name before the try block so it is
        # always defined when the error handler logs it.
        json_base_name = os.path.basename(json_file)
        doc_id = json_base_name.split(".")[0]
        try:
            logger.info(f"Processing {doc_id}")
            pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
            if not os.path.exists(pdf_file):
                logger.error(f"PDF file not found for {doc_id}")
                continue
            with open(json_file, "r", encoding="utf-8") as f:
                data_from_gpt = json.load(f)
            drilldown_pdf_document(doc_id=doc_id,
                                   pdf_file=pdf_file,
                                   drilldown_folder=drilldown_folder,
                                   data_from_gpt=data_from_gpt)
        except Exception as e:
            print_exc()
            logger.error(f"Error in processing {doc_id}: {e}")


def drilldown_pdf_document(doc_id: str, pdf_file: str, drilldown_folder: str, data_from_gpt: list) -> list:
    logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
    pdf_util = PDFUtil(pdf_file)
    drilldown_data_list = []
    for data in data_from_gpt:
        doc_id = str(data.get("doc_id", ""))
        # if doc_id != "506326520":
        #     continue
        page_index = data.get("page_index", -1)
        if page_index == -1:
            continue
        extract_data_list = data.get("extract_data", {}).get("data", [])
        dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
        if len(dp_reported_name_dict) == 0:
            continue
        highlighted_value_list = []
        for extract_data in extract_data_list:
            for data_point, value in extract_data.items():
                if value in highlighted_value_list:
                    continue
                if data_point in ["ter", "ogc", "performance_fee"]:
                    continue
                drilldown_data = {
                    "doc_id": doc_id,
                    "page_index": page_index,
                    "data_point": data_point,
                    "parent_text_block": None,
                    "value": value,
                    "annotation_attribute": {},
                }
                drilldown_data_list.append(drilldown_data)
                highlighted_value_list.append(value)
        for data_point, reported_name in dp_reported_name_dict.items():
            if reported_name in highlighted_value_list:
                continue
            data_point = f"{data_point}_reported_name"
            drilldown_data = {
                "doc_id": doc_id,
                "page_index": page_index,
                "data_point": data_point,
                "parent_text_block": None,
                "value": reported_name,
                "annotation_attribute": {},
            }
            drilldown_data_list.append(drilldown_data)
            highlighted_value_list.append(reported_name)

    # batch_drilldown is read as a dict below, so start from an empty dict
    # (not a list) to keep the .get() call valid when nothing was drilled down.
    drilldown_result = {}
    if len(drilldown_data_list) > 0:
        drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                    output_pdf_folder=drilldown_folder)
    annotation_list = []
    if len(drilldown_result) > 0:
        logger.info(f"Drilldown PDF document for doc_id: {doc_id} finished successfully")
        annotation_list = drilldown_result.get("annotation_list", [])
        for annotation in annotation_list:
            annotation["doc_id"] = doc_id
        if drilldown_folder:
            drilldown_data_folder = os.path.join(drilldown_folder, "data/")
            os.makedirs(drilldown_data_folder, exist_ok=True)
            drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")
            drilldown_source_df = pd.DataFrame(drilldown_data_list)
            annotation_list_df = pd.DataFrame(annotation_list)
            # Set the annotation column order as doc_id, pdf_file, page_index,
            # data_point, value, matching_val_area, normalized_bbox.
            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index", "data_point",
                                                     "value", "matching_val_area", "normalized_bbox"]]
            logger.info(f"Writing drilldown data to {drilldown_file}")
            with pd.ExcelWriter(drilldown_file) as writer:
                drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
                annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")
    return annotation_list
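
# A minimal sketch of one entry in `data_from_gpt`, inferred from how
# drilldown_pdf_document() reads the extraction JSON. The field names are the
# only part taken from the code above; the values (and the non-fee data point
# "share_class_name") are purely illustrative, and the real files written by the
# extraction step may carry additional fields.
_EXAMPLE_EXTRACTION_ENTRY = {
    "doc_id": "123456789",   # document identifier (illustrative)
    "page_index": 12,        # page the values were extracted from; -1 entries are skipped
    "extract_data": {
        # One dict per extracted row, mapping data_point -> value.
        "data": [
            {"share_class_name": "Example Fund A Acc", "ter": "0.85%"},
        ],
        # Mapping of data_point -> the name under which it is reported in the PDF.
        "dp_reported_name": {
            "ter": "Total expense ratio",
        },
    },
}
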
def calculate_metrics():
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    drilldown_data_folder = os.path.join(drilldown_folder, "data/")
    drilldown_files = glob(drilldown_data_folder + "*.xlsx")
    y_true_list = []
    y_pred_list = []
    series_list = []
    for drilldown_file in drilldown_files:
        drilldown_file_base_name = os.path.basename(drilldown_file)
        # Skip Excel lock/temp files such as "~$xxx.xlsx".
        if drilldown_file_base_name.startswith("~"):
            continue
        drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
        for _, row in drilldown_data.iterrows():
            matching_val_area = row["matching_val_area"]
            # matching_val_area is stored as a stringified list; parse it safely
            # with ast.literal_eval instead of eval.
            if isinstance(matching_val_area, str):
                matching_val_area = ast.literal_eval(matching_val_area)
            y_true_list.append(1)
            if len(matching_val_area) > 0:
                y_pred_list.append(1)
            else:
                y_pred_list.append(0)
                # Keep rows without any matching area for the "no_annotation" sheet.
                series_list.append(row)
    recall = recall_score(y_true_list, y_pred_list)
    logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
    no_annotation_df = pd.DataFrame(series_list)
    no_annotation_df.reset_index(drop=True, inplace=True)
    metrics_folder = os.path.join(drilldown_folder, "metrics/")
    os.makedirs(metrics_folder, exist_ok=True)
    metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
    metrics_result = {
        "recall": recall,
        "support": len(y_true_list),
    }
    metrics_df = pd.DataFrame([metrics_result])
    with pd.ExcelWriter(metrics_file) as writer:
        metrics_df.to_excel(writer, index=False, sheet_name="metrics")
        no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")


if __name__ == "__main__":
    drilldown_documents()
    # calculate_metrics()