453 lines
19 KiB
Python
453 lines
19 KiB
Python
import os
|
|
import pandas as pd
|
|
import time
|
|
import json
|
|
from sklearn.metrics import precision_score, recall_score, f1_score
|
|
from utils.biz_utils import get_unique_words_text, get_beginning_common_words
|
|
from utils.logger import logger
|
|
|
|
|
|
class Metrics:
    """Score a prediction workbook against a manually checked ground-truth workbook.

    Two evaluation modes are selected by ``data_type``:

    * ``"page_filter"`` - every data-point cell holds a list (or a JSON string
      encoding a list) and the prediction list is compared item by item with
      the ground-truth list of the same document.
    * any other value - rows are extracted data points; a prediction is matched
      to a ground-truth row by its simplified name and then compared by value.

    Precision, recall and F1 are computed per data point and written, together
    with the list of missing/incorrect items, to a timestamped ``.xlsx`` file.
    """

    # Data points evaluated, in the order they appear in the "Metrics" sheet.
    DATA_POINTS = ("tor", "ter", "ogc", "performance_fee")

    # Prefix used for each data point in the per-data-point log line.
    _LOG_LABELS = {
        "tor": "TOR",
        "ter": "TER",
        "ogc": "OGC",
        "performance_fee": "Performance Fee",
    }

    def __init__(
        self,
        data_type: str,
        prediction_file: str,
        prediction_sheet_name: str = "Sheet1",
        ground_truth_file: str = None,
        ground_truth_sheet_name: str = "Sheet1",
        output_folder: str = None,
    ) -> None:
        """Store the input locations and prepare the output file path.

        Args:
            data_type: ``"page_filter"`` selects page-filter scoring; any other
                value selects data-extraction scoring. Also embedded in the
                output file name.
            prediction_file: Path of the Excel file holding the predictions.
            prediction_sheet_name: Sheet to read from ``prediction_file``.
            ground_truth_file: Path of the Excel file holding the ground truth.
            ground_truth_sheet_name: Sheet to read from ``ground_truth_file``.
            output_folder: Folder for the result workbook. ``None`` or an empty
                string falls back to ``/data/emea_ar/output/metrics/``.

        Side effects:
            Creates ``output_folder`` if it does not exist.
        """
        self.data_type = data_type
        self.prediction_file = prediction_file
        self.prediction_sheet_name = prediction_sheet_name
        self.ground_truth_file = ground_truth_file
        self.ground_truth_sheet_name = ground_truth_sheet_name

        if not output_folder:
            output_folder = r"/data/emea_ar/output/metrics/"
        os.makedirs(output_folder, exist_ok=True)

        time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
        self.output_file = os.path.join(
            output_folder,
            f"metrics_{data_type}_{time_stamp}.xlsx",
        )

    def get_metrics(self):
        """Compute the metrics and write them to ``self.output_file``.

        Returns:
            ``(missing_error_list, metrics_list, output_file)`` on success.
            NOTE: when either input path is missing or does not exist, an
            empty list is returned instead of a 3-tuple (kept for backward
            compatibility), so callers must check before unpacking.
        """
        if not self.prediction_file or not os.path.exists(self.prediction_file):
            logger.error(f"Invalid prediction file: {self.prediction_file}")
            return []
        if not self.ground_truth_file or not os.path.exists(self.ground_truth_file):
            logger.error(f"Invalid ground truth file: {self.ground_truth_file}")
            return []

        missing_error_list, metrics_list = self.calculate_metrics()

        missing_error_df = pd.DataFrame(missing_error_list)
        metrics_df = pd.DataFrame(metrics_list)

        with pd.ExcelWriter(self.output_file) as writer:
            missing_error_df.to_excel(writer, sheet_name="Missing_Error", index=False)
            metrics_df.to_excel(writer, sheet_name="Metrics", index=False)
        return missing_error_list, metrics_list, self.output_file

    def calculate_metrics(self):
        """Read both workbooks and score every data point.

        Only ground-truth rows whose ``Checked`` column equals 1 are used.

        Returns:
            ``(missing_error_list, metrics_list)``. ``metrics_list`` holds one
            dict per data point (keys ``Data_Point``, ``Precision``, ``Recall``,
            ``F1``, ``Support``) followed by an ``"Average"`` row whose scores
            are unweighted means and whose support is the total.
        """
        prediction_df = pd.read_excel(
            self.prediction_file, sheet_name=self.prediction_sheet_name
        )
        ground_truth_df = pd.read_excel(
            self.ground_truth_file, sheet_name=self.ground_truth_sheet_name
        )
        ground_truth_df = ground_truth_df[ground_truth_df["Checked"] == 1]

        # Binary label vectors accumulated over all documents, per data point.
        true_by_point = {data_point: [] for data_point in self.DATA_POINTS}
        pred_by_point = {data_point: [] for data_point in self.DATA_POINTS}
        missing_error_list = []

        if self.data_type == "page_filter":
            for _, row in ground_truth_df.iterrows():
                doc_id = row["doc_id"]
                doc_predictions = prediction_df[prediction_df["doc_id"] == doc_id]
                if doc_predictions.empty:
                    # Mirrors the extraction branch, which only scores documents
                    # present in both files; previously this raised IndexError.
                    logger.error(
                        f"No prediction found for doc_id: {doc_id}, skipped"
                    )
                    continue
                # Only the first prediction row of a document is evaluated.
                prediction_data = doc_predictions.iloc[0]
                for data_point in self.DATA_POINTS:
                    true_data, pred_data, missing_error_data = (
                        self.get_page_filter_true_pred_data(
                            doc_id, row, prediction_data, data_point
                        )
                    )
                    true_by_point[data_point].extend(true_data)
                    pred_by_point[data_point].extend(pred_data)
                    missing_error_list.append(missing_error_data)
        else:
            prediction_doc_ids = set(prediction_df["doc_id"].unique().tolist())
            ground_truth_doc_ids = set(ground_truth_df["doc_id"].unique().tolist())
            # Only documents present in both files are scored, in doc_id order.
            for doc_id in sorted(prediction_doc_ids & ground_truth_doc_ids):
                prediction_data = prediction_df[prediction_df["doc_id"] == doc_id]
                ground_truth_data = ground_truth_df[ground_truth_df["doc_id"] == doc_id]
                for data_point in self.DATA_POINTS:
                    true_data, pred_data, missing_error_data = (
                        self.get_data_extraction_true_pred_data(
                            doc_id, ground_truth_data, prediction_data, data_point
                        )
                    )
                    true_by_point[data_point].extend(true_data)
                    pred_by_point[data_point].extend(pred_data)
                    missing_error_list.extend(missing_error_data)

        metrics_list = []
        for data_point in self.DATA_POINTS:
            precision, recall, f1 = self.get_specific_metrics(
                true_by_point[data_point], pred_by_point[data_point]
            )
            support = self.get_support_number(true_by_point[data_point])
            metrics_list.append(
                {
                    "Data_Point": data_point,
                    "Precision": precision,
                    "Recall": recall,
                    "F1": f1,
                    "Support": support,
                }
            )
            logger.info(
                f"{self._LOG_LABELS[data_point]} Precision: {precision}, "
                f"Recall: {recall}, F1: {f1}, Support: {support}"
            )

        # Build the average row before appending it so it does not count itself.
        point_count = len(metrics_list)
        average_row = {
            "Data_Point": "Average",
            "Precision": sum(m["Precision"] for m in metrics_list) / point_count,
            "Recall": sum(m["Recall"] for m in metrics_list) / point_count,
            "F1": sum(m["F1"] for m in metrics_list) / point_count,
            "Support": sum(m["Support"] for m in metrics_list),
        }
        metrics_list.append(average_row)
        return missing_error_list, metrics_list

    def get_support_number(self, true_data: list):
        """Return how many entries of the 0/1 list ``true_data`` are 1."""
        return sum(true_data)

    @staticmethod
    def _cell_to_list(value):
        """Normalize a spreadsheet cell to a list.

        Strings are decoded as JSON; blank strings, ``None``, JSON ``null`` and
        NaN (how pandas reports an empty Excel cell) become an empty list. Any
        other value is returned unchanged.
        """
        if isinstance(value, str):
            value = json.loads(value) if value.strip() else []
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return []
        return value

    def get_page_filter_true_pred_data(
        self,
        doc_id,
        ground_truth_data: pd.Series,
        prediction_data: pd.Series,
        data_point: str,
    ):
        """Compare the predicted and ground-truth lists of one data point.

        Args:
            doc_id: Document identifier, copied into the returned record.
            ground_truth_data: Ground-truth row; ``ground_truth_data[data_point]``
                is a list or a JSON string encoding one.
            prediction_data: Prediction row, same layout.
            data_point: Column name to compare.

        Returns:
            ``(true_data, pred_data, missing_error_data)``. The first two are
            parallel 0/1 lists: ``(1, 1)`` for a predicted item found in the
            ground truth, ``(0, 1)`` for a predicted item that is not, and
            ``(1, 0)`` for a ground-truth item that was not predicted. When
            both lists are empty a single ``(1, 1)`` pair is returned, i.e.
            "correctly predicted nothing" is scored as a hit.
            ``missing_error_data`` is a dict with keys ``doc_id``,
            ``data_point``, ``missing`` and ``error``.
        """
        ground_truth_list = self._cell_to_list(ground_truth_data[data_point])
        prediction_list = self._cell_to_list(prediction_data[data_point])

        true_data = []
        pred_data = []
        missing_data = []
        error_data = []

        if len(ground_truth_list) == 0 and len(prediction_list) == 0:
            true_data.append(1)
            pred_data.append(1)
        else:
            for prediction in prediction_list:
                pred_data.append(1)
                if prediction in ground_truth_list:
                    true_data.append(1)
                else:
                    true_data.append(0)
                    error_data.append(prediction)

            for ground_truth in ground_truth_list:
                if ground_truth not in prediction_list:
                    true_data.append(1)
                    pred_data.append(0)
                    missing_data.append(ground_truth)

        # Built once, after both passes, so "error" is reported even when the
        # ground-truth list is empty.
        missing_error_data = {
            "doc_id": doc_id,
            "data_point": data_point,
            "missing": missing_data,
            "error": error_data,
        }
        return true_data, pred_data, missing_error_data

    @staticmethod
    def _fuzzy_name_matches(simple_name, raw_name, candidate_simple_names):
        """Return the candidates that loosely match a name.

        A candidate matches when it contains ``simple_name`` or is contained in
        it, and it ends with the last whitespace-separated word of
        ``raw_name``. A blank ``raw_name`` has no last word and matches nothing.
        """
        words = raw_name.split()
        if not words:
            return []
        last_word = words[-1]
        return [
            candidate
            for candidate in candidate_simple_names
            if (candidate in simple_name or simple_name in candidate)
            and candidate.endswith(last_word)
        ]

    @staticmethod
    def _pick_ground_truth_row(candidates: pd.DataFrame, page_index):
        """Pick one row from ``candidates``: same page first, else the first row.

        Returns ``None`` when ``candidates`` is empty.
        """
        if len(candidates) == 0:
            return None
        same_page = candidates[candidates["page_index"] == page_index]
        if len(same_page) > 0:
            return same_page.iloc[0]
        return candidates.iloc[0]

    @staticmethod
    def _error_record(
        doc_id,
        data_point,
        page_index,
        pred_raw_name,
        investment_type,
        error_type,
        error_value,
        correct_value,
    ):
        """Build one row of the "Missing_Error" sheet (key order is the column order)."""
        return {
            "doc_id": doc_id,
            "data_point": data_point,
            "page_index": page_index,
            "pred_raw_name": pred_raw_name,
            "investment_type": investment_type,
            "error_type": error_type,
            "error_value": error_value,
            "correct_value": correct_value,
        }

    def get_data_extraction_true_pred_data(
        self,
        doc_id,
        ground_truth_data: pd.DataFrame,
        prediction_data: pd.DataFrame,
        data_point: str,
    ):
        """Score the extracted values of one data point for one document.

        Both frames must provide the columns ``datapoint``, ``page_index``,
        ``raw_name``, ``value`` and ``investment_type``.

        A prediction is paired with a ground-truth row when their
        ``simple_name_unique_words`` are equal or, failing that, when their
        simplified names loosely match (see ``_fuzzy_name_matches``).

        Returns:
            ``(true_data, pred_data, missing_error_data)``. The 0/1 lists get
            ``(1, 1)`` for a paired prediction with an equal value, ``(0, 1)``
            for a paired prediction with a different value or an unpaired
            prediction, and ``(1, 0)`` for a ground-truth row with no paired
            prediction. When neither side has rows for the data point a single
            ``(1, 1)`` pair is returned. ``missing_error_data`` is a list of
            error records, one per non-matching item.
        """
        dp_prediction = self.modify_data(
            prediction_data[prediction_data["datapoint"] == data_point]
        )
        pred_simple_raw_names = dp_prediction["simple_raw_name"].unique().tolist()
        pred_simple_name_unique_words_list = (
            dp_prediction["simple_name_unique_words"].unique().tolist()
        )

        dp_ground_truth = self.modify_data(
            ground_truth_data[ground_truth_data["datapoint"] == data_point]
        )
        gt_simple_raw_names = dp_ground_truth["simple_raw_name"].unique().tolist()
        gt_simple_name_unique_words_list = (
            dp_ground_truth["simple_name_unique_words"].unique().tolist()
        )

        true_data = []
        pred_data = []
        missing_error_data = []

        if len(dp_ground_truth) == 0 and len(dp_prediction) == 0:
            true_data.append(1)
            pred_data.append(1)
            return true_data, pred_data, missing_error_data

        # Pass 1: every prediction is either correct or an error.
        for _, prediction in dp_prediction.iterrows():
            pred_page_index = prediction["page_index"]
            pred_raw_name = prediction["raw_name"]
            pred_simple_raw_name = prediction["simple_raw_name"]
            pred_simple_name_unique_words = prediction["simple_name_unique_words"]
            pred_data_point_value = prediction["value"]
            pred_investment_type = prediction["investment_type"]

            has_exact_match = (
                pred_simple_name_unique_words in gt_simple_name_unique_words_list
            )
            find_raw_name_in_gt = self._fuzzy_name_matches(
                pred_simple_raw_name, pred_raw_name, gt_simple_raw_names
            )

            pred_data.append(1)
            if not has_exact_match and len(find_raw_name_in_gt) == 0:
                true_data.append(0)
                missing_error_data.append(
                    self._error_record(
                        doc_id,
                        data_point,
                        pred_page_index,
                        pred_raw_name,
                        pred_investment_type,
                        "raw name incorrect",
                        pred_raw_name,
                        "",
                    )
                )
                continue

            # The exact unique-words match takes priority over the fuzzy one.
            if has_exact_match:
                gt_data_df = dp_ground_truth[
                    dp_ground_truth["simple_name_unique_words"]
                    == pred_simple_name_unique_words
                ]
            else:
                gt_data_df = dp_ground_truth[
                    dp_ground_truth["simple_raw_name"] == find_raw_name_in_gt[0]
                ]
            gt_data = self._pick_ground_truth_row(gt_data_df, pred_page_index)
            gt_data_point_value = None if gt_data is None else gt_data["value"]

            if (
                gt_data_point_value is not None
                and pred_data_point_value == gt_data_point_value
            ):
                true_data.append(1)
            else:
                true_data.append(0)
                missing_error_data.append(
                    self._error_record(
                        doc_id,
                        data_point,
                        pred_page_index,
                        pred_raw_name,
                        pred_investment_type,
                        "data value incorrect",
                        pred_data_point_value,
                        gt_data_point_value,
                    )
                )

        # Pass 2: ground-truth rows that no prediction name could be paired with.
        for _, ground_truth in dp_ground_truth.iterrows():
            gt_page_index = ground_truth["page_index"]
            gt_raw_name = ground_truth["raw_name"]
            gt_simple_raw_name = ground_truth["simple_raw_name"]
            gt_simple_name_unique_words = ground_truth["simple_name_unique_words"]
            gt_investment_type = ground_truth["investment_type"]

            find_raw_name_in_pred = self._fuzzy_name_matches(
                gt_simple_raw_name, gt_raw_name, pred_simple_raw_names
            )
            if (
                gt_simple_name_unique_words not in pred_simple_name_unique_words_list
                and len(find_raw_name_in_pred) == 0
            ):
                true_data.append(1)
                pred_data.append(0)
                missing_error_data.append(
                    self._error_record(
                        doc_id,
                        data_point,
                        gt_page_index,
                        "",
                        gt_investment_type,
                        "raw name missing",
                        "",
                        gt_raw_name,
                    )
                )

        return true_data, pred_data, missing_error_data

    def modify_data(self, data: pd.DataFrame):
        """Return a copy of ``data`` with two derived name columns added.

        For each page, the words shared at the beginning of all raw names (as
        reported by ``get_beginning_common_words``) are removed from every raw
        name to give ``simple_raw_name``; ``simple_name_unique_words`` is
        ``get_unique_words_text(simple_raw_name)``.

        The input frame is left untouched: callers pass boolean-filtered
        slices, and writing new columns to those in place triggers pandas'
        SettingWithCopyWarning.

        NOTE(review): ``str.replace`` removes every occurrence of the common
        words, not only the leading one - confirm that this is intended.
        """
        data = data.copy()
        data["simple_raw_name"] = ""
        data["simple_name_unique_words"] = ""

        for page_index in data["page_index"].unique().tolist():
            on_page = data["page_index"] == page_index
            raw_name_list = data.loc[on_page, "raw_name"].unique().tolist()
            beginning_common_words = get_beginning_common_words(raw_name_list)
            for raw_name in raw_name_list:
                if beginning_common_words is not None and len(beginning_common_words) > 0:
                    simple_raw_name = raw_name.replace(beginning_common_words, "").strip()
                else:
                    simple_raw_name = raw_name
                # Rows on this page that carry this raw name.
                rows = on_page & (data["raw_name"] == raw_name)
                data.loc[rows, "simple_raw_name"] = simple_raw_name
                data.loc[rows, "simple_name_unique_words"] = get_unique_words_text(
                    simple_raw_name
                )
        return data

    def get_specific_metrics(self, true_data: list, pred_data: list):
        """Return ``(precision, recall, f1)`` for two parallel 0/1 label lists.

        Empty input (nothing was evaluated) yields ``(0.0, 0.0, 0.0)`` instead
        of being handed to scikit-learn.
        """
        if not true_data or not pred_data:
            return 0.0, 0.0, 0.0
        precision = precision_score(true_data, pred_data)
        recall = recall_score(true_data, pred_data)
        f1 = f1_score(true_data, pred_data)
        return precision, recall, f1

    def get_datapoint_metrics(self):
        """Placeholder; not implemented yet."""
        pass
|