"""Stand-alone drilldown practice script.

Reads per-document extraction JSON produced by the GPT step, asks PDFUtil to
locate/highlight each extracted value inside the source PDF, writes the
results to per-document Excel workbooks, then computes a recall metric over
the drilldown annotations.
"""
from tqdm import tqdm
from glob import glob
import ast
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import recall_score

from utils.logger import logger
from utils.pdf_util import PDFUtil

# Data points deliberately excluded from drilldown.
_SKIPPED_DATA_POINTS = ("ter", "ogc", "performance_fee")


def _build_drilldown_record(doc_id: str, page_index: int, data_point: str, value) -> dict:
    """Build one highlight request for PDFUtil.batch_drilldown."""
    return {
        "doc_id": doc_id,
        "page_index": page_index,
        "data_point": data_point,
        "parent_text_block": None,
        "value": value,
        "annotation_attribute": {},
    }


def drilldown_documents():
    """Drill down every extraction JSON under the extract_data folder.

    For each ``<doc_id>.json``, locate the matching PDF and highlight the
    extracted values in it via :func:`drilldown_pdf_document`.  Documents
    whose PDF is missing are skipped; per-document failures are logged and
    do not stop the batch.
    """
    pdf_folder = r"/data/emea_ar/pdf/"
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
    extract_files = glob(extract_data_folder + '*.json')

    for json_file in tqdm(extract_files):
        doc_id = None  # bound before the try so the except handler can always log it
        try:
            json_base_name = os.path.basename(json_file)
            doc_id = json_base_name.split('.')[0]
            logger.info(f"Processing {doc_id}")
            pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
            if not os.path.exists(pdf_file):
                logger.error(f"PDF file not found for {doc_id}")
                continue
            with open(json_file, "r", encoding="utf-8") as f:
                data_from_gpt = json.load(f)
            drilldown_pdf_document(doc_id=doc_id,
                                   pdf_file=pdf_file,
                                   drilldown_folder=drilldown_folder,
                                   data_from_gpt=data_from_gpt)
        except Exception as e:
            print_exc()
            logger.error(f"Error in processing {doc_id}: {e}")


def drilldown_pdf_document(doc_id: str,
                           pdf_file: str,
                           drilldown_folder: str,
                           data_from_gpt: list) -> list:
    """Highlight all extracted values of one document inside its PDF.

    Builds one drilldown record per unique value per page (skipping the data
    points in ``_SKIPPED_DATA_POINTS``), runs PDFUtil.batch_drilldown, and
    writes the source records plus the resulting annotations to
    ``<drilldown_folder>/data/<doc_id>_drilldown.xlsx``.

    Returns the batch_drilldown result (empty list when there was nothing
    to drill down).
    """
    logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
    pdf_util = PDFUtil(pdf_file)
    drilldown_data_list = []
    for data in data_from_gpt:
        # Each per-page record carries its own doc_id; it overrides the argument.
        doc_id = str(data.get("doc_id", ""))
        page_index = data.get("page_index", -1)
        if page_index == -1:
            continue
        extract_data_list = data.get("extract_data", {}).get("data", [])
        dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
        if not dp_reported_name_dict:
            continue
        # Values already queued for this page; a list (not a set) because
        # values are not guaranteed hashable.
        highlighted_value_list = []
        for extract_data in extract_data_list:
            for data_point, value in extract_data.items():
                if value in highlighted_value_list:
                    continue
                if data_point in _SKIPPED_DATA_POINTS:
                    continue
                drilldown_data_list.append(
                    _build_drilldown_record(doc_id, page_index, data_point, value))
                highlighted_value_list.append(value)

        for data_point, reported_name in dp_reported_name_dict.items():
            if reported_name in highlighted_value_list:
                continue
            drilldown_data_list.append(
                _build_drilldown_record(doc_id, page_index,
                                        f"{data_point}_reported_name", reported_name))
            highlighted_value_list.append(reported_name)

    drilldown_result = []
    if drilldown_data_list:
        drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                    output_pdf_folder=drilldown_folder)
    if len(drilldown_result) > 0:
        logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
        # NOTE(review): batch_drilldown's result is treated here as a dict with
        # an "annotation_list" key, while the sibling code in main.py iterates
        # the same result as a list of dicts — confirm the actual return type
        # of PDFUtil.batch_drilldown; one of the two call sites is wrong.
        annotation_list = drilldown_result.get("annotation_list", [])
        for annotation in annotation_list:
            annotation["doc_id"] = doc_id
        if drilldown_folder is not None and len(drilldown_folder) > 0:
            drilldown_data_folder = os.path.join(drilldown_folder, "data/")
            os.makedirs(drilldown_data_folder, exist_ok=True)
            drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")

            drilldown_source_df = pd.DataFrame(drilldown_data_list)
            annotation_list_df = pd.DataFrame(annotation_list)
            # Fixed column order: doc_id, pdf_file, page_index, data_point,
            # value, matching_val_area, normalized_bbox.
            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
                                                     "data_point", "value",
                                                     "matching_val_area", "normalized_bbox"]]
            logger.info(f"Writing drilldown data to {drilldown_file}")
            with pd.ExcelWriter(drilldown_file) as writer:
                drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
                annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")
    return drilldown_result


def calculate_metrics():
    """Compute recall of the drilldown step over all per-document workbooks.

    Every extracted value counts as a positive; it is predicted positive when
    the drilldown found at least one matching area.  Writes a metrics workbook
    containing the recall plus the rows that received no annotation.
    """
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    drilldown_data_folder = os.path.join(drilldown_folder, "data/")
    drilldown_files = glob(drilldown_data_folder + '*.xlsx')
    y_true_list = []
    y_pred_list = []
    no_annotation_rows = []
    for drilldown_file in drilldown_files:
        # Skip Excel lock/temp files such as "~$foo.xlsx".
        if os.path.basename(drilldown_file).startswith("~"):
            continue
        drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
        for _, row in drilldown_data.iterrows():
            matching_val_area = row["matching_val_area"]
            if isinstance(matching_val_area, str):
                # Stored as the repr of a list of bboxes; literal_eval parses
                # it without executing arbitrary spreadsheet content (unlike eval).
                matching_val_area = ast.literal_eval(matching_val_area)
            elif not isinstance(matching_val_area, (list, tuple)):
                # Empty Excel cells are read back as float NaN — treat as no match.
                matching_val_area = []
            y_true_list.append(1)
            if len(matching_val_area) > 0:
                y_pred_list.append(1)
            else:
                y_pred_list.append(0)
                # Keep only rows lacking an annotation for the report sheet.
                no_annotation_rows.append(row)
    if not y_true_list:
        logger.error("No drilldown data found; metrics not computed")
        return
    recall = recall_score(y_true_list, y_pred_list)
    logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
    no_annotation_df = pd.DataFrame(no_annotation_rows)
    no_annotation_df.reset_index(drop=True, inplace=True)
    metrics_folder = os.path.join(drilldown_folder, "metrics/")
    os.makedirs(metrics_folder, exist_ok=True)
    metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
    metrics_result = {
        "recall": recall,
        "support": len(y_true_list)
    }
    metrics_df = pd.DataFrame([metrics_result])
    with pd.ExcelWriter(metrics_file) as writer:
        metrics_df.to_excel(writer, index=False, sheet_name="metrics")
        no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")


if __name__ == "__main__":
    drilldown_documents()
    calculate_metrics()
b/main.py index a1cfd3d..fa8be69 100644 --- a/main.py +++ b/main.py @@ -139,6 +139,7 @@ class EMEA_AR_Parsing: pdf_util = PDFUtil(self.pdf_file) drilldown_data_list = [] for data in data_from_gpt: + doc_id = str(data.get("doc_id", "")) page_index = data.get("page_index", -1) if page_index == -1: continue @@ -152,6 +153,7 @@ class EMEA_AR_Parsing: if data_point in ["ter", "ogc", "performance_fee"]: continue drilldown_data = { + "doc_id": doc_id, "page_index": page_index, "data_point": data_point, "parent_text_block": None, @@ -166,6 +168,7 @@ class EMEA_AR_Parsing: continue data_point = f"{data_point}_reported_name" drilldown_data = { + "doc_id": doc_id, "page_index": page_index, "data_point": data_point, "parent_text_block": None, @@ -177,6 +180,25 @@ class EMEA_AR_Parsing: drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list, output_pdf_folder=self.drilldown_folder) + if len(drilldown_result) > 0: + logger.info(f"Drilldown PDF document for doc_id: {self.doc_id} successfully") + for drilldown_data in drilldown_result: + drilldown_data["doc_id"] = self.doc_id + if self.drilldown_folder is not None and len(self.drilldown_folder) > 0: + drilldown_data_folder = os.path.join(self.drilldown_folder, "data/") + os.makedirs(drilldown_data_folder, exist_ok=True) + drilldown_file = os.path.join(drilldown_data_folder, f"{self.doc_id}_drilldown.xlsx") + + drilldown_source_df = pd.DataFrame(drilldown_data_list) + drilldown_result_df = pd.DataFrame(drilldown_result) + # set drilldown_result_df column order as doc_id, pdf_file, page_index, + # data_point, value, matching_val_area, normalized_bbox + drilldown_result_df = drilldown_result_df[["doc_id", "pdf_file", "page_index", + "data_point", "value", "matching_val_area", "normalized_bbox"]] + with pd.ExcelWriter(drilldown_file) as writer: + drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data") + drilldown_result_df.to_excel(writer, index=False, sheet_name="drilldown_data") + def 
mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: if not re_run: diff --git a/utils/pdf_util.py b/utils/pdf_util.py index bdadff2..698c133 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -430,6 +430,8 @@ class PDFUtil: # order bbox_list by y0, x0, y1, x1 bbox_list = sorted(bbox_list, key=lambda x: (x[1], x[0], x[3], x[2])) annotation_data["matching_val_area"] = bbox_list + if len(bbox_list) > 0: + annotation_data["normalized_bbox"] = self.get_bbox_normalized(page, bbox_list) return annotation_data def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True): @@ -543,7 +545,7 @@ class PDFUtil: end_index = start_index + len(pure_text_block) if end_index < len(text): next_char = text[end_index].strip() - if next_char not in ["", " ", "%", ")"]: + if next_char not in ["", " ", "%", ")", "0"]: continue new_matching_val_area.append(area) matching_val_area = new_matching_val_area