1. Optimize the drilldown algorithm
2. Support calculating drilldown recall metrics
This commit is contained in:
parent 78fb283130
commit fb356fce76
@@ -0,0 +1,160 @@
from ast import literal_eval
from glob import glob
from traceback import print_exc
import json
import os

import pandas as pd
from sklearn.metrics import recall_score
from tqdm import tqdm

from utils.logger import logger
from utils.pdf_util import PDFUtil


def drilldown_documents():
    pdf_folder = r"/data/emea_ar/pdf/"
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    extract_data_folder = r"/data/emea_ar/output/extract_data/docs/by_text/json/"
    extract_files = glob(extract_data_folder + "*.json")

    for json_file in tqdm(extract_files):
        doc_id = ""  # defined up front so the except block can log it safely
        try:
            json_base_name = os.path.basename(json_file)
            doc_id = json_base_name.split(".")[0]
            logger.info(f"Processing {doc_id}")
            pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
            if not os.path.exists(pdf_file):
                logger.error(f"PDF file not found for {doc_id}")
                continue
            with open(json_file, "r", encoding="utf-8") as f:
                data_from_gpt = json.load(f)
            drilldown_pdf_document(doc_id=doc_id,
                                   pdf_file=pdf_file,
                                   drilldown_folder=drilldown_folder,
                                   data_from_gpt=data_from_gpt)
        except Exception as e:
            print_exc()
            logger.error(f"Error in processing {doc_id}: {e}")


def drilldown_pdf_document(doc_id: str,
                           pdf_file: str,
                           drilldown_folder: str,
                           data_from_gpt: list) -> None:
    logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
    pdf_util = PDFUtil(pdf_file)
    drilldown_data_list = []
    for data in data_from_gpt:
        doc_id = str(data.get("doc_id", ""))
        page_index = data.get("page_index", -1)
        if page_index == -1:
            continue
        extract_data_list = data.get("extract_data", {}).get("data", [])
        dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
        if len(dp_reported_name_dict) == 0:
            continue

        # Queue each distinct extracted value for highlighting, skipping the
        # ratio data points that are not drilled down here.
        highlighted_value_list = []
        for extract_data in extract_data_list:
            for data_point, value in extract_data.items():
                if value in highlighted_value_list:
                    continue
                if data_point in ["ter", "ogc", "performance_fee"]:
                    continue
                drilldown_data = {
                    "doc_id": doc_id,
                    "page_index": page_index,
                    "data_point": data_point,
                    "parent_text_block": None,
                    "value": value,
                    "annotation_attribute": {}
                }
                drilldown_data_list.append(drilldown_data)
                highlighted_value_list.append(value)

        # Reported names get their own "<data_point>_reported_name" entries.
        for data_point, reported_name in dp_reported_name_dict.items():
            if reported_name in highlighted_value_list:
                continue
            data_point = f"{data_point}_reported_name"
            drilldown_data = {
                "doc_id": doc_id,
                "page_index": page_index,
                "data_point": data_point,
                "parent_text_block": None,
                "value": reported_name,
                "annotation_attribute": {}
            }
            drilldown_data_list.append(drilldown_data)
            highlighted_value_list.append(reported_name)

    drilldown_result = {}  # batch_drilldown returns a dict keyed by "annotation_list"
    if len(drilldown_data_list) > 0:
        drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                    output_pdf_folder=drilldown_folder)
    if len(drilldown_result) > 0:
        logger.info(f"Drilldown PDF document for doc_id: {doc_id} completed successfully")
        annotation_list = drilldown_result.get("annotation_list", [])
        for annotation in annotation_list:
            annotation["doc_id"] = doc_id
        if drilldown_folder is not None and len(drilldown_folder) > 0:
            drilldown_data_folder = os.path.join(drilldown_folder, "data/")
            os.makedirs(drilldown_data_folder, exist_ok=True)
            drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")

            drilldown_source_df = pd.DataFrame(drilldown_data_list)
            annotation_list_df = pd.DataFrame(annotation_list)
            # Keep a fixed column order: doc_id, pdf_file, page_index,
            # data_point, value, matching_val_area, normalized_bbox.
            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
                                                     "data_point", "value",
                                                     "matching_val_area", "normalized_bbox"]]
            logger.info(f"Writing drilldown data to {drilldown_file}")
            with pd.ExcelWriter(drilldown_file) as writer:
                drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
                annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")


def calculate_metrics():
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    drilldown_data_folder = os.path.join(drilldown_folder, "data/")
    drilldown_files = glob(drilldown_data_folder + "*.xlsx")
    y_true_list = []
    y_pred_list = []
    series_list = []
    for drilldown_file in drilldown_files:
        drilldown_file_base_name = os.path.basename(drilldown_file)
        # Skip Excel lock files such as "~$....xlsx".
        if drilldown_file_base_name.startswith("~"):
            continue
        drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
        for _, row in drilldown_data.iterrows():
            matching_val_area = row["matching_val_area"]
            # The cell holds the list's string repr; literal_eval turns it back
            # into a list safely (eval would execute arbitrary expressions).
            if isinstance(matching_val_area, str):
                matching_val_area = literal_eval(matching_val_area)
            # Every expected value is a positive; a non-empty match area is a hit.
            y_true_list.append(1)
            if len(matching_val_area) > 0:
                y_pred_list.append(1)
            else:
                y_pred_list.append(0)
                series_list.append(row)  # keep misses for the no_annotation sheet
    recall = recall_score(y_true_list, y_pred_list)
    logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
    no_annotation_df = pd.DataFrame(series_list)
    no_annotation_df.reset_index(drop=True, inplace=True)
    metrics_folder = os.path.join(drilldown_folder, "metrics/")
    os.makedirs(metrics_folder, exist_ok=True)
    metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
    metrics_result = {
        "recall": recall,
        "support": len(y_true_list)
    }
    metrics_df = pd.DataFrame([metrics_result])
    with pd.ExcelWriter(metrics_file) as writer:
        metrics_df.to_excel(writer, index=False, sheet_name="metrics")
        no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")


if __name__ == "__main__":
    drilldown_documents()
    calculate_metrics()
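For context on how calculate_metrics scores a run: every extracted value is counted as a ground-truth positive (y_true is all ones), and a prediction is a hit whenever the drilldown produced at least one matching area, so the reported recall reduces to the hit rate. A minimal sketch with made-up labels:

from sklearn.metrics import recall_score

# Made-up labels for five extracted values: three were located on the page.
y_true = [1, 1, 1, 1, 1]  # every expected value is a positive
y_pred = [1, 1, 0, 1, 0]  # 1 = drilldown found a non-empty matching area

# With an all-ones y_true, recall = hits / total = 3 / 5.
print(recall_score(y_true, y_pred))  # 0.6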
main.py (+22 lines)
@@ -139,6 +139,7 @@ class EMEA_AR_Parsing:
         pdf_util = PDFUtil(self.pdf_file)
         drilldown_data_list = []
         for data in data_from_gpt:
+            doc_id = str(data.get("doc_id", ""))
             page_index = data.get("page_index", -1)
             if page_index == -1:
                 continue
@@ -152,6 +153,7 @@ class EMEA_AR_Parsing:
                     if data_point in ["ter", "ogc", "performance_fee"]:
                         continue
                     drilldown_data = {
+                        "doc_id": doc_id,
                         "page_index": page_index,
                         "data_point": data_point,
                         "parent_text_block": None,
@@ -166,6 +168,7 @@ class EMEA_AR_Parsing:
                     continue
                 data_point = f"{data_point}_reported_name"
                 drilldown_data = {
+                    "doc_id": doc_id,
                     "page_index": page_index,
                     "data_point": data_point,
                     "parent_text_block": None,
@@ -177,6 +180,25 @@ class EMEA_AR_Parsing:
         drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                     output_pdf_folder=self.drilldown_folder)
+        if len(drilldown_result) > 0:
+            logger.info(f"Drilldown PDF document for doc_id: {self.doc_id} completed successfully")
+            for drilldown_data in drilldown_result:
+                drilldown_data["doc_id"] = self.doc_id
+            if self.drilldown_folder is not None and len(self.drilldown_folder) > 0:
+                drilldown_data_folder = os.path.join(self.drilldown_folder, "data/")
+                os.makedirs(drilldown_data_folder, exist_ok=True)
+                drilldown_file = os.path.join(drilldown_data_folder, f"{self.doc_id}_drilldown.xlsx")
+
+                drilldown_source_df = pd.DataFrame(drilldown_data_list)
+                drilldown_result_df = pd.DataFrame(drilldown_result)
+                # Keep a fixed column order: doc_id, pdf_file, page_index,
+                # data_point, value, matching_val_area, normalized_bbox.
+                drilldown_result_df = drilldown_result_df[["doc_id", "pdf_file", "page_index",
+                                                           "data_point", "value", "matching_val_area", "normalized_bbox"]]
+                with pd.ExcelWriter(drilldown_file) as writer:
+                    drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
+                    drilldown_result_df.to_excel(writer, index=False, sheet_name="drilldown_data")

     def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
         if not re_run:
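Each processed document thus ends up as one workbook with a source_data and a drilldown_data sheet, which makes spot checks straightforward; a small sketch (the doc id in the path is made up):

import pandas as pd

# Hypothetical path; real workbooks land under <drilldown_folder>/data/.
workbook = "/data/emea_ar/output/drilldown/data/123456789_drilldown.xlsx"
source_df = pd.read_excel(workbook, sheet_name="source_data")
result_df = pd.read_excel(workbook, sheet_name="drilldown_data")
# Rows whose matching_val_area stayed empty are the recall misses.
print(len(source_df), len(result_df))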
utils/pdf_util.py
@@ -430,6 +430,8 @@ class PDFUtil:
         # order bbox_list by y0, x0, y1, x1
         bbox_list = sorted(bbox_list, key=lambda x: (x[1], x[0], x[3], x[2]))
         annotation_data["matching_val_area"] = bbox_list
+        if len(bbox_list) > 0:
+            annotation_data["normalized_bbox"] = self.get_bbox_normalized(page, bbox_list)
         return annotation_data

     def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
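The hunk above stores page-relative coordinates alongside the absolute match areas. get_bbox_normalized itself is not part of this diff; a plausible sketch, assuming (x0, y0, x1, y1) boxes and a PyMuPDF-style page whose size is exposed via page.rect:

# Sketch only: the real get_bbox_normalized is not shown in this commit.
def get_bbox_normalized(page, bbox_list):
    # Divide absolute coordinates by the page size to get values in [0, 1].
    width, height = page.rect.width, page.rect.height
    return [(x0 / width, y0 / height, x1 / width, y1 / height)
            for (x0, y0, x1, y1) in bbox_list]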
@@ -543,7 +545,7 @@ class PDFUtil:
             end_index = start_index + len(pure_text_block)
             if end_index < len(text):
                 next_char = text[end_index].strip()
-                if next_char not in ["", " ", "%", ")"]:
+                if next_char not in ["", " ", "%", ")", "0"]:
                     continue
             new_matching_val_area.append(area)
             matching_val_area = new_matching_val_area
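This one-line change relaxes the word-boundary check: a candidate area is now also kept when the character right after the match is "0", presumably to admit values whose printed form carries an extra trailing zero. A toy illustration of the check:

text = "Ongoing charges 1.50%"
value = "1.5"
end_index = text.find(value) + len(value)
next_char = text[end_index].strip()
# Before: "0" was rejected, so "1.5" inside "1.50" was discarded.
# After:  "0" is allowed, so the match is kept.
print(next_char, next_char in ["", " ", "%", ")", "0"])  # 0 True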