1. optimize the investment mapping algorithm
2. implement investment mapping metrics
parent 0f14bf4a7a
commit dd6701f18c
@@ -1,7 +1,7 @@
 import os
 import json
 import pandas as pd
-from utils.biz_utils import get_most_similar_name
+from utils.biz_utils import get_most_similar_name, remove_common_word
 from utils.sql_query_util import (
     query_document_fund_mapping,
     query_investment_by_provider,
@@ -270,8 +270,12 @@ class DataMapping:
         else:
             if parent_id is not None and len(parent_id) > 0:
                 # filter self.doc_fund_class_mapping by parent_id as FundId
-                doc_compare_mapping = None
-                doc_compare_name_list = None
+                doc_compare_mapping = self.doc_fund_class_mapping[
+                    self.doc_fund_class_mapping["FundId"] == parent_id
+                ]
+                doc_compare_name_list = (
+                    doc_compare_mapping["ShareClassName"].unique().tolist()
+                )

                 provider_compare_mapping = self.provider_fund_class_mapping[
                     self.provider_fund_class_mapping["FundId"] == parent_id
@@ -290,7 +294,9 @@ class DataMapping:

             data_info = {"name": raw_name}
             if len(provider_compare_name_list) > 0:
+                pre_common_word_list = []
                 if doc_compare_name_list is not None and len(doc_compare_name_list) > 0:
+                    _, pre_common_word_list = remove_common_word(doc_compare_name_list)
                     max_similarity_name, max_similarity = get_most_similar_name(
                         raw_name, doc_compare_name_list)
                     if max_similarity is not None and max_similarity >= 0.9:
@@ -301,8 +307,10 @@ class DataMapping:
                         data_info["similarity"] = max_similarity

                 if data_info.get("id", None) is None or len(data_info.get("id", "")) == 0:
+                    # pass pre_common_word_list: the document mapping for the same fund may
+                    # differ from the provider mapping, so sharing the common word list improves the similarity.
                     max_similarity_name, max_similarity = get_most_similar_name(
-                        raw_name, provider_compare_name_list
+                        raw_name, provider_compare_name_list, pre_common_word_list=pre_common_word_list
                     )
                     if max_similarity is not None and max_similarity >= 0.5:
                         data_info["id"] = provider_compare_mapping[
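Note on the change above: the placeholder `None` assignments are replaced with a real lookup of the document-side share-class names for the parent fund, and the common words of those names are threaded into the provider-side match. Per the docstring of `get_most_similar_name`, the underlying comparison is Jaccard similarity over name tokens. A minimal, self-contained sketch of that idea, assuming plain whitespace tokenisation (the project's implementation also strips special characters and generic fund/share words); the helper names here are illustrative, not the repository's utilities:

# Illustrative Jaccard-similarity matcher, not the repository's actual code.
def jaccard(a: str, b: str) -> float:
    set_a, set_b = set(a.lower().split()), set(b.lower().split())
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)


def most_similar(raw_name: str, candidates: list) -> tuple:
    best_name, best_score = None, 0.0
    for candidate in candidates:
        score = jaccard(raw_name, candidate)
        if score > best_score:
            best_name, best_score = candidate, score
    return best_name, best_score


# Example: the raw name shares most tokens with the first candidate.
name, score = most_similar(
    "Global Equity Fund Class A USD Acc",
    ["Global Equity Class A USD Acc", "Global Equity Class B EUR Inc"],
)

The diff then simply compares the returned score against the two cut-offs it uses: 0.9 for document-side matches and 0.5 for provider-side matches.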
core/metrics.py | 333
@@ -95,6 +95,9 @@ class Metrics:
         performance_fee_true = []
         performance_fee_pred = []

+        investment_mapping_true = []
+        investment_mapping_pred = []
+
         missing_error_list = []
         data_point_list = ["tor", "ter", "ogc", "performance_fee"]

@@ -157,86 +160,123 @@ class Metrics:
             performance_fee_pred.extend(pred_data)
             missing_error_list.extend(missing_error_data)
         elif self.data_type == "investment_mapping":
-            pass
+            prediction_doc_id_list = prediction_df["doc_id"].unique().tolist()
+            ground_truth_doc_id_list = ground_truth_df["doc_id"].unique().tolist()
+            # get intersection of doc_id_list
+            doc_id_list = list(
+                set(prediction_doc_id_list) & set(ground_truth_doc_id_list)
+            )
+            # order by doc_id
+            doc_id_list.sort()
+
+            for doc_id in doc_id_list:
+                prediction_data = prediction_df[prediction_df["doc_id"] == doc_id]
+                ground_truth_data = ground_truth_df[ground_truth_df["doc_id"] == doc_id]
+                for data_point in data_point_list:
+                    true_data, pred_data, missing_error_data = self.get_investment_mapping_true_pred_data(
+                        doc_id, ground_truth_data, prediction_data, data_point
+                    )
+                    investment_mapping_true.extend(true_data)
+                    investment_mapping_pred.extend(pred_data)
+                    missing_error_list.extend(missing_error_data)

         metrics_list = []
-        for data_point in data_point_list:
-            if data_point == "tor":
-                precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
-                tor_support = self.get_support_number(tor_true)
-                metrics_list.append(
-                    {
-                        "Data_Point": data_point,
-                        "Precision": precision,
-                        "Recall": recall,
-                        "F1": f1,
-                        "Support": tor_support,
-                    }
-                )
-                logger.info(
-                    f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {tor_support}"
-                )
-            elif data_point == "ter":
-                precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
-                ter_support = self.get_support_number(ter_true)
-                metrics_list.append(
-                    {
-                        "Data_Point": data_point,
-                        "Precision": precision,
-                        "Recall": recall,
-                        "F1": f1,
-                        "Support": ter_support,
-                    }
-                )
-                logger.info(
-                    f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ter_support}"
-                )
-            elif data_point == "ogc":
-                precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
-                ogc_support = self.get_support_number(ogc_true)
-                metrics_list.append(
-                    {
-                        "Data_Point": data_point,
-                        "Precision": precision,
-                        "Recall": recall,
-                        "F1": f1,
-                        "Support": ogc_support,
-                    }
-                )
-                logger.info(
-                    f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ogc_support}"
-                )
-            elif data_point == "performance_fee":
-                precision, recall, f1 = self.get_specific_metrics(
-                    performance_fee_true, performance_fee_pred
-                )
-                performance_fee_support = self.get_support_number(performance_fee_true)
-                metrics_list.append(
-                    {
-                        "Data_Point": data_point,
-                        "Precision": precision,
-                        "Recall": recall,
-                        "F1": f1,
-                        "Support": performance_fee_support,
-                    }
-                )
-                logger.info(
-                    f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {performance_fee_support}"
-                )
+        if self.data_type == "investment_mapping":
+            if len(investment_mapping_true) == 0 and len(investment_mapping_pred) == 0:
+                investment_mapping_true.append(1)
+                investment_mapping_pred.append(1)
+            precision, recall, f1 = self.get_specific_metrics(investment_mapping_true, investment_mapping_pred)
+            investment_mapping_support = self.get_support_number(investment_mapping_true)
+            metrics_list.append(
+                {
+                    "Data_Point": "Investment Mapping",
+                    "Precision": precision,
+                    "Recall": recall,
+                    "F1": f1,
+                    "Support": investment_mapping_support,
+                }
+            )
+            logger.info(
+                f"Investment mapping Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {investment_mapping_support}"
+            )
+        else:
+            for data_point in data_point_list:
+                if data_point == "tor":
+                    precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
+                    tor_support = self.get_support_number(tor_true)
+                    metrics_list.append(
+                        {
+                            "Data_Point": data_point,
+                            "Precision": precision,
+                            "Recall": recall,
+                            "F1": f1,
+                            "Support": tor_support,
+                        }
+                    )
+                    logger.info(
+                        f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {tor_support}"
+                    )
+                elif data_point == "ter":
+                    precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
+                    ter_support = self.get_support_number(ter_true)
+                    metrics_list.append(
+                        {
+                            "Data_Point": data_point,
+                            "Precision": precision,
+                            "Recall": recall,
+                            "F1": f1,
+                            "Support": ter_support,
+                        }
+                    )
+                    logger.info(
+                        f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ter_support}"
+                    )
+                elif data_point == "ogc":
+                    precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
+                    ogc_support = self.get_support_number(ogc_true)
+                    metrics_list.append(
+                        {
+                            "Data_Point": data_point,
+                            "Precision": precision,
+                            "Recall": recall,
+                            "F1": f1,
+                            "Support": ogc_support,
+                        }
+                    )
+                    logger.info(
+                        f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ogc_support}"
+                    )
+                elif data_point == "performance_fee":
+                    precision, recall, f1 = self.get_specific_metrics(
+                        performance_fee_true, performance_fee_pred
+                    )
+                    performance_fee_support = self.get_support_number(performance_fee_true)
+                    metrics_list.append(
+                        {
+                            "Data_Point": data_point,
+                            "Precision": precision,
+                            "Recall": recall,
+                            "F1": f1,
+                            "Support": performance_fee_support,
+                        }
+                    )
+                    logger.info(
+                        f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {performance_fee_support}"
+                    )

         # get average metrics
         precision_list = [metric["Precision"] for metric in metrics_list]
         recall_list = [metric["Recall"] for metric in metrics_list]
         f1_list = [metric["F1"] for metric in metrics_list]
         metrics_list.append(
             {
                 "Data_Point": "Average",
                 "Precision": sum(precision_list) / len(precision_list),
                 "Recall": sum(recall_list) / len(recall_list),
                 "F1": sum(f1_list) / len(f1_list),
                 "Support": sum([metric["Support"] for metric in metrics_list]),
             }
         )
         return missing_error_list, metrics_list

     def get_support_number(self, true_data: list):

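For investment mapping, each compared row contributes a 0/1 pair to investment_mapping_true / investment_mapping_pred, and the guard that appends a matching (1, 1) pair when both lists are empty presumably keeps the metric defined (and equal to 1.0) for runs with nothing to compare. get_specific_metrics and get_support_number are not part of this diff; a plausible sketch of what they compute from those binary lists, treating 1 as the positive class:

# Sketch only, not the repository's implementation: standard precision/recall/F1
# over paired 0/1 lists.
def get_specific_metrics(true, pred):
    tp = sum(1 for t, p in zip(true, pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(true, pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(true, pred) if t == 1 and p == 0)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1


def get_support_number(true):
    # Support = number of positive ground-truth entries.
    return sum(true)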
@@ -490,6 +530,145 @@ class Metrics:

         return true_data, pred_data, missing_error_data

+    def get_investment_mapping_true_pred_data(
+        self,
+        doc_id,
+        ground_truth_data: pd.DataFrame,
+        prediction_data: pd.DataFrame,
+        data_point: str,
+    ):
+        dp_prediction = prediction_data[prediction_data["datapoint"] == data_point]
+        dp_prediction = self.modify_data(dp_prediction)
+        pred_simple_raw_names = dp_prediction["simple_raw_name"].unique().tolist()
+        pred_simple_name_unique_words_list = (
+            dp_prediction["simple_name_unique_words"].unique().tolist()
+        )
+
+        dp_ground_truth = ground_truth_data[
+            ground_truth_data["datapoint"] == data_point
+        ]
+        dp_ground_truth = self.modify_data(dp_ground_truth)
+        gt_simple_raw_names = dp_ground_truth["simple_raw_name"].unique().tolist()
+        gt_simple_name_unique_words_list = (
+            dp_ground_truth["simple_name_unique_words"].unique().tolist()
+        )
+
+        compare_data_list = []
+        for index, ground_truth in dp_ground_truth.iterrows():
+            gt_page_index = ground_truth["page_index"]
+            gt_raw_name = ground_truth["raw_name"]
+            gt_simple_raw_name = ground_truth["simple_raw_name"]
+            gt_simple_name_unique_words = ground_truth["simple_name_unique_words"]
+            gt_investment_type = ground_truth["investment_type"]
+
+            find_raw_name_in_pred = [
+                pred_raw_name
+                for pred_raw_name in pred_simple_raw_names
+                if (
+                    gt_simple_raw_name in pred_raw_name
+                    or pred_raw_name in gt_simple_raw_name
+                )
+                and pred_raw_name.endswith(gt_simple_raw_name.split()[-1])
+            ]
+
+            if (
+                gt_simple_name_unique_words in pred_simple_name_unique_words_list
+                or len(find_raw_name_in_pred) > 0
+            ):
+                # get the ground truth data with the same unique words
+                if gt_simple_name_unique_words in pred_simple_name_unique_words_list:
+                    pred_data_df = dp_prediction[
+                        dp_prediction["simple_name_unique_words"]
+                        == gt_simple_name_unique_words
+                    ]
+                    if len(pred_data_df) > 1:
+                        if (
+                            len(pred_data_df[pred_data_df["page_index"] == gt_page_index])
+                            == 0
+                        ):
+                            pred_data = pred_data_df.iloc[0]
+                        else:
+                            pred_data = pred_data_df[
+                                pred_data_df["page_index"] == gt_page_index
+                            ].iloc[0]
+                    elif len(pred_data_df) == 1:
+                        pred_data = pred_data_df.iloc[0]
+                    else:
+                        pred_data = None
+                else:
+                    pred_data_df = dp_prediction[
+                        dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
+                    ]
+                    if len(pred_data_df) > 1:
+                        if (
+                            len(pred_data_df[pred_data_df["page_index"] == gt_page_index])
+                            == 0
+                        ):
+                            pred_data = pred_data_df.iloc[0]
+                        else:
+                            pred_data = pred_data_df[
+                                pred_data_df["page_index"] == gt_page_index
+                            ].iloc[0]
+                    elif len(pred_data_df) == 1:
+                        pred_data = pred_data_df.iloc[0]
+                    else:
+                        pred_data = None
+                if pred_data is not None:
+                    compare_data = {"raw_name": gt_raw_name,
+                                    "investment_type": gt_investment_type,
+                                    "gt_investment_id": ground_truth["investment_id"],
+                                    "gt_investment_name": ground_truth["investment_name"],
+                                    "pred_investment_id": pred_data["investment_id"],
+                                    "pred_investment_name": pred_data["investment_name"]}
+                    compare_data_list.append(compare_data)
+
+        true_data = []
+        pred_data = []
+        missing_error_data = []
+
+        for compare_data in compare_data_list:
+            if compare_data["gt_investment_id"] == compare_data["pred_investment_id"]:
+                true_data.append(1)
+                pred_data.append(1)
+            else:
+                true_data.append(1)
+                pred_data.append(0)
+                error_data = {
+                    "doc_id": doc_id,
+                    "data_point": data_point,
+                    "raw_name": compare_data["raw_name"],
+                    "investment_type": compare_data["investment_type"],
+                    "error_type": "mapping missing",
+                    "error_id": compare_data["pred_investment_id"],
+                    "error_name": compare_data["pred_investment_name"],
+                    "correct_id": compare_data["gt_investment_id"],
+                    "correct_name": compare_data["gt_investment_name"]
+                }
+                missing_error_data.append(error_data)
+
+        for index, prediction in dp_prediction.iterrows():
+            pred_raw_name = prediction["raw_name"]
+            pred_investment_id = prediction["investment_id"]
+            pred_investment_name = prediction["investment_name"]
+            pred_investment_type = prediction["investment_type"]
+            gt_data_df = dp_ground_truth[dp_ground_truth["investment_id"] == pred_investment_id]
+            if len(gt_data_df) == 0:
+                true_data.append(0)
+                pred_data.append(1)
+                error_data = {
+                    "doc_id": doc_id,
+                    "data_point": data_point,
+                    "raw_name": pred_raw_name,
+                    "investment_type": pred_investment_type,
+                    "error_type": "mapping incorrect",
+                    "error_id": pred_investment_id,
+                    "error_name": pred_investment_name,
+                    "correct_id": "",
+                    "correct_name": ""
+                }
+                missing_error_data.append(error_data)
+        return true_data, pred_data, missing_error_data
+
     def modify_data(self, data: pd.DataFrame):
         data["simple_raw_name"] = ""
         data["simple_name_unique_words"] = ""
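The new get_investment_mapping_true_pred_data pairs ground-truth share classes with predictions by simplified name (or by their unique words), then scores the investment IDs. Note that a wrong ID is recorded twice: once as a "mapping missing" entry against the ground truth and once as a "mapping incorrect" entry for the prediction whose ID never appears in the ground truth. A small worked example with made-up rows shows how the binary lists come out:

# Worked example (made-up rows, not repository data) of how the logic above
# turns mapping comparisons into the binary lists fed to the metrics:
#
#   ground truth (raw_name -> investment_id):
#       "Fund A Class I" -> "SC001"
#       "Fund A Class R" -> "SC002"
#   prediction:
#       "Fund A Class I" -> "SC001"    # same id as the ground truth
#       "Fund A Class R" -> "SC009"    # different id
#
#   loop over ground truth (matched by simplified name):
#       "Fund A Class I": true=1, pred=1
#       "Fund A Class R": true=1, pred=0   -> error_type "mapping missing"
#   loop over predictions whose id is absent from the ground truth:
#       "SC009":          true=0, pred=1   -> error_type "mapping incorrect"
#
#   true = [1, 1, 0], pred = [1, 0, 1]  ->  precision = recall = 0.5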
main.py | 127
@@ -33,7 +33,9 @@ class EMEA_AR_Parsing:
         self.extract_way = extract_way
         self.output_extract_image_folder = None
         if self.extract_way == "image":
-            self.output_extract_image_folder = r"/data/emea_ar/output/extract_data/images/"
+            self.output_extract_image_folder = (
+                r"/data/emea_ar/output/extract_data/images/"
+            )
             os.makedirs(self.output_extract_image_folder, exist_ok=True)

         if output_extract_data_folder is None or len(output_extract_data_folder) == 0:
@@ -41,7 +43,9 @@ class EMEA_AR_Parsing:
         if not output_extract_data_folder.endswith("/"):
             output_extract_data_folder = f"{output_extract_data_folder}/"
         if extract_way is not None and len(extract_way) > 0:
-            output_extract_data_folder = f"{output_extract_data_folder}by_{extract_way}/"
+            output_extract_data_folder = (
+                f"{output_extract_data_folder}by_{extract_way}/"
+            )
         self.output_extract_data_folder = output_extract_data_folder
         os.makedirs(self.output_extract_data_folder, exist_ok=True)

@@ -50,7 +54,9 @@ class EMEA_AR_Parsing:
         if not output_mapping_data_folder.endswith("/"):
             output_mapping_data_folder = f"{output_mapping_data_folder}/"
         if extract_way is not None and len(extract_way) > 0:
-            output_mapping_data_folder = f"{output_mapping_data_folder}by_{extract_way}/"
+            output_mapping_data_folder = (
+                f"{output_mapping_data_folder}by_{extract_way}/"
+            )
         self.output_mapping_data_folder = output_mapping_data_folder
         os.makedirs(self.output_mapping_data_folder, exist_ok=True)

@@ -75,8 +81,10 @@ class EMEA_AR_Parsing:
         datapoints.remove("doc_id")
         return datapoints

-    def extract_data(self,
-                     re_run: bool = False,) -> list:
+    def extract_data(
+        self,
+        re_run: bool = False,
+    ) -> list:
         if not re_run:
             output_data_json_folder = os.path.join(
                 self.output_extract_data_folder, "json/"
@@ -100,7 +108,7 @@ class EMEA_AR_Parsing:
             self.datapoints,
             self.document_mapping_info_df,
             extract_way=self.extract_way,
-            output_image_folder=self.output_extract_image_folder
+            output_image_folder=self.output_extract_image_folder,
         )
         data_from_gpt = data_extraction.extract_data()
         return data_from_gpt
@@ -148,14 +156,14 @@ def extract_data(
     pdf_folder: str,
     output_data_folder: str,
     extract_way: str = "text",
-    re_run: bool = False
+    re_run: bool = False,
 ) -> None:
     logger.info(f"Extract EMEA AR data for doc_id: {doc_id}")
     emea_ar_parsing = EMEA_AR_Parsing(
         doc_id,
         pdf_folder,
         output_extract_data_folder=output_data_folder,
-        extract_way=extract_way
+        extract_way=extract_way,
     )
     data_from_gpt = emea_ar_parsing.extract_data(re_run)
     return data_from_gpt
@@ -284,7 +292,9 @@ def batch_start_job(
             result_extract_data_list.extend(doc_data_from_gpt)
             result_mapping_data_list.extend(doc_mapping_data_list)

-    if force_save_total_data or (special_doc_id_list is None or len(special_doc_id_list) == 0):
+    if force_save_total_data or (
+        special_doc_id_list is None or len(special_doc_id_list) == 0
+    ):
         result_extract_data_df = pd.DataFrame(result_extract_data_list)
         result_extract_data_df.reset_index(drop=True, inplace=True)

@@ -292,11 +302,12 @@ def batch_start_job(
         result_mappingdata_df.reset_index(drop=True, inplace=True)

         logger.info(f"Saving extract data to {output_extract_data_total_folder}")
+        unique_doc_ids = result_extract_data_df["doc_id"].unique().tolist()
         os.makedirs(output_extract_data_total_folder, exist_ok=True)
         time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
         output_file = os.path.join(
             output_extract_data_total_folder,
-            f"extract_data_info_{len(pdf_files)}_documents_by_{extract_way}_{time_stamp}.xlsx",
+            f"extract_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx",
         )
         with pd.ExcelWriter(output_file) as writer:
             result_extract_data_df.to_excel(
@@ -304,11 +315,12 @@ def batch_start_job(
             )

         logger.info(f"Saving mapping data to {output_mapping_total_folder}")
+        unique_doc_ids = result_mappingdata_df["doc_id"].unique().tolist()
         os.makedirs(output_mapping_total_folder, exist_ok=True)
         time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
         output_file = os.path.join(
             output_mapping_total_folder,
-            f"mapping_data_info_{len(pdf_files)}_documents_by_{extract_way}_{time_stamp}.xlsx",
+            f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx",
         )
         with pd.ExcelWriter(output_file) as writer:
             result_mappingdata_df.to_excel(
@@ -322,13 +334,25 @@ def batch_start_job(
         ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
         ground_truth_sheet_name = "mapping_data"
         metrics_output_folder = r"/data/emea_ar/output/metrics/"

+        logger.info(f"Calculating metrics for data extraction")
         missing_error_list, metrics_list, metrics_file = get_metrics(
             "data_extraction",
             output_file,
             prediction_sheet_name,
             ground_truth_file,
             ground_truth_sheet_name,
-            metrics_output_folder
+            metrics_output_folder,
+        )
+
+        logger.info(f"Calculating metrics for investment mapping")
+        missing_error_list, metrics_list, metrics_file = get_metrics(
+            "investment_mapping",
+            output_file,
+            prediction_sheet_name,
+            ground_truth_file,
+            ground_truth_sheet_name,
+            metrics_output_folder,
         )

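Both get_metrics calls bind the same three names, so if the data-extraction results are still needed after the investment-mapping run they have to be kept aside first. One way to do that, reusing only the variables already present in batch_start_job (a sketch, not part of the commit):

# Illustrative only: keep both metric sets instead of overwriting the locals.
results = {}
for data_type in ("data_extraction", "investment_mapping"):
    missing_error_list, metrics_list, metrics_file = get_metrics(
        data_type,
        output_file,
        prediction_sheet_name,
        ground_truth_file,
        ground_truth_sheet_name,
        metrics_output_folder,
    )
    results[data_type] = {
        "missing_errors": missing_error_list,
        "metrics": metrics_list,
        "metrics_file": metrics_file,
    }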
@@ -535,9 +559,27 @@ def test_data_extraction_metrics():
         prediction_sheet_name,
         ground_truth_file,
         ground_truth_sheet_name,
-        metrics_output_folder
+        metrics_output_folder,
     )


+def test_mapping_raw_name():
+    doc_id = "292989214"
+    raw_name = "ENBD Saudi Arabia Equity Fund Class A USD Accumulation"
+    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
+    data_mapping = DataMapping(
+        doc_id,
+        datapoints=None,
+        raw_document_data_list=None,
+        document_mapping_info_df=None,
+        output_data_folder=output_folder,
+    )
+    mapping_info = data_mapping.matching_with_database(
+        raw_name=raw_name, parent_id="FS0000B4A7", matching_type="share"
+    )
+    print(mapping_info)
+
+
 if __name__ == "__main__":
     pdf_folder = r"/data/emea_ar/small_pdf/"
     page_filter_ground_truth_file = (
@@ -579,27 +621,50 @@ if __name__ == "__main__":
     # re_run_extract_data)

     # special_doc_id_list = ["505174428", "510326848", "349679479"]
-    special_doc_id_list = []
+    check_mapping_doc_id_list = [
+        "458359181",
+        "486383912",
+        "529925114",
+        "391456740",
+        "391736837",
+        "497497599",
+        "327956364",
+        "479793787",
+        "334718372",
+        "321733631",
+        "507967525",
+        "478585901",
+        "366179419",
+        "509845549",
+        "323390570",
+        "344636875",
+        "445256897",
+        "508854243",
+        "520879048",
+        "463081566",
+    ]
+    special_doc_id_list = check_mapping_doc_id_list
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = False
-    re_run_mapping_data = False
-    force_save_total_data = False
+    re_run_mapping_data = True
+    force_save_total_data = True

     extract_ways = ["text"]
-    # for extract_way in extract_ways:
-    #     batch_start_job(
-    #         pdf_folder,
-    #         page_filter_ground_truth_file,
-    #         output_extract_data_child_folder,
-    #         output_mapping_child_folder,
-    #         output_extract_data_total_folder,
-    #         output_mapping_total_folder,
-    #         extract_way,
-    #         special_doc_id_list,
-    #         re_run_extract_data,
-    #         re_run_mapping_data,
-    #         force_save_total_data=force_save_total_data,
-    #     )
+    for extract_way in extract_ways:
+        batch_start_job(
+            pdf_folder,
+            page_filter_ground_truth_file,
+            output_extract_data_child_folder,
+            output_mapping_child_folder,
+            output_extract_data_total_folder,
+            output_mapping_total_folder,
+            extract_way,
+            special_doc_id_list,
+            re_run_extract_data,
+            re_run_mapping_data,
+            force_save_total_data=force_save_total_data,
+        )

-    test_data_extraction_metrics()
+    # test_data_extraction_metrics()
+    # test_mapping_raw_name()

@@ -23,7 +23,7 @@ def clean_text(text: str) -> str:
     return text


-def get_most_similar_name(text: str, name_list: list):
+def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list = None) -> str:
     """
     Get the most similar fund name from fund_name_list by Jaccard similarity
     """
@@ -40,6 +40,9 @@ def get_most_similar_name(text: str, name_list: list):
     common_word_list = []
     if len(name_list) > 1:
         _, common_word_list = remove_common_word(copy_fund_name_list)
+    if pre_common_word_list is not None and len(pre_common_word_list) > 0:
+        common_word_list.extend([word for word in pre_common_word_list
+                                 if word not in common_word_list])

     text = text.strip()
     text = remove_special_characters(text)
@@ -61,17 +64,13 @@ def get_most_similar_name(text: str, name_list: list):
         # remove word in fund_name_list
         for i in range(len(copy_fund_name_list)):
             temp_splits = copy_fund_name_list[i].split()
-            for temp in temp_splits:
-                if remove_special_characters(temp).lower() == word:
-                    copy_fund_name_list[i] = re.sub(r'\s+', ' ',
-                                                    copy_fund_name_list[i].replace(temp, ' '))
+            copy_fund_name_list[i] = ' '.join([split for split in temp_splits
+                                               if remove_special_characters(split).lower() != word])

     for i in range(len(copy_fund_name_list)):
         temp_splits = copy_fund_name_list[i].split()
-        for temp in temp_splits:
-            if remove_special_characters(temp).lower() in ['fund', 'portfolio', 'class', 'share', 'shares']:
-                copy_fund_name_list[i] = \
-                    re.sub(r'\s+', ' ', copy_fund_name_list[i].replace(temp, ' '))
+        copy_fund_name_list[i] = ' '.join([split for split in temp_splits
+                                           if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])

     final_splits = []
     for split in new_splits:
         if split.lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']:
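The rewrite above collapses the nested word-removal loops into list comprehensions and lets callers seed the common-word list through pre_common_word_list. remove_common_word itself is not shown in this diff (it appears to live in the same utils module as get_most_similar_name); one plausible implementation, consistent with how it is called here, returning the cleaned names plus the words it removed:

# Hedged sketch of remove_common_word (not the repository's code): drop tokens
# that appear in every name, e.g. a shared umbrella prefix, and return both the
# cleaned names and the removed words.
def remove_common_word(name_list: list):
    token_sets = [set(name.split()) for name in name_list]
    common = set.intersection(*token_sets) if token_sets else set()
    cleaned = [
        " ".join(word for word in name.split() if word not in common)
        for name in name_list
    ]
    return cleaned, sorted(common)

Seeding matters because, per the len(name_list) > 1 guard above, no common words are derived when only a single candidate is compared, which is presumably why DataMapping now computes pre_common_word_list from the document-side share-class names before matching against the provider names.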