Implement calculation of data extraction metrics.
parent 50e6c3c19d
commit 98e86a6cfd
core/metrics.py (237 changed lines)
@@ -3,6 +3,7 @@ import pandas as pd
 import time
 import json
 from sklearn.metrics import precision_score, recall_score, f1_score
+from utils.biz_utils import get_unique_words_text
 from utils.logger import logger
 
 
@@ -13,12 +14,14 @@ class Metrics:
         prediction_file: str,
         prediction_sheet_name: str = "Sheet1",
         ground_truth_file: str = None,
+        ground_truth_sheet_name: str = "Sheet1",
         output_folder: str = None,
     ) -> None:
         self.data_type = data_type
         self.prediction_file = prediction_file
         self.prediction_sheet_name = prediction_sheet_name
         self.ground_truth_file = ground_truth_file
+        self.ground_truth_sheet_name = ground_truth_sheet_name
 
         if output_folder is None or len(output_folder) == 0:
             output_folder = r"/data/emea_ar/output/metrics/"
@@ -49,27 +52,27 @@ class Metrics:
         metrics_list = [
             {"Data_Point": "NAN", "Precision": 0, "Recall": 0, "F1": 0, "Support": 0}
         ]
-        if self.data_type == "page_filter":
-            missing_error_list, metrics_list = self.get_page_filter_metrics()
-        elif self.data_type == "datapoint":
-            missing_error_list, metrics_list = self.get_datapoint_metrics()
-        else:
-            logger.error(f"Invalid data type: {self.data_type}")
+        missing_error_list, metrics_list = self.get_metrics()
 
         missing_error_df = pd.DataFrame(missing_error_list)
         missing_error_df.reset_index(drop=True, inplace=True)
 
         metrics_df = pd.DataFrame(metrics_list)
         metrics_df.reset_index(drop=True, inplace=True)
 
         with pd.ExcelWriter(self.output_file) as writer:
             missing_error_df.to_excel(writer, sheet_name="Missing_Error", index=False)
             metrics_df.to_excel(writer, sheet_name="Metrics", index=False)
         return missing_error_list, metrics_list, self.output_file
 
-    def get_page_filter_metrics(self):
-        prediction_df = pd.read_excel(self.prediction_file, sheet_name=self.prediction_sheet_name)
-        ground_truth_df = pd.read_excel(self.ground_truth_file, sheet_name="Sheet1")
+    def get_metrics(self):
+        prediction_df = pd.read_excel(
+            self.prediction_file, sheet_name=self.prediction_sheet_name
+        )
+        ground_truth_df = pd.read_excel(
+            self.ground_truth_file, sheet_name=self.ground_truth_sheet_name
+        )
         ground_truth_df = ground_truth_df[ground_truth_df["Checked"] == 1]
 
         tor_true = []
@@ -87,27 +90,61 @@ class Metrics:
         missing_error_list = []
         data_point_list = ["tor", "ter", "ogc", "performance_fee"]
 
-        for index, row in ground_truth_df.iterrows():
-            doc_id = row["doc_id"]
-            # get first row with the same doc_id
-            prediction_data = prediction_df[prediction_df["doc_id"] == doc_id].iloc[0]
-            for data_point in data_point_list:
-                true_data, pred_data, missing_error_data = self.get_true_pred_data(
-                    doc_id, row, prediction_data, data_point
-                )
-                if data_point == "tor":
-                    tor_true.extend(true_data)
-                    tor_pred.extend(pred_data)
-                elif data_point == "ter":
-                    ter_true.extend(true_data)
-                    ter_pred.extend(pred_data)
-                elif data_point == "ogc":
-                    ogc_true.extend(true_data)
-                    ogc_pred.extend(pred_data)
-                elif data_point == "performance_fee":
-                    performance_fee_true.extend(true_data)
-                    performance_fee_pred.extend(pred_data)
-                missing_error_list.append(missing_error_data)
+        if self.data_type == "page_filter":
+            for index, row in ground_truth_df.iterrows():
+                doc_id = row["doc_id"]
+                # get first row with the same doc_id
+                prediction_data = prediction_df[prediction_df["doc_id"] == doc_id].iloc[
+                    0
+                ]
+                for data_point in data_point_list:
+                    true_data, pred_data, missing_error_data = (
+                        self.get_page_filter_true_pred_data(
+                            doc_id, row, prediction_data, data_point
+                        )
+                    )
+                    if data_point == "tor":
+                        tor_true.extend(true_data)
+                        tor_pred.extend(pred_data)
+                    elif data_point == "ter":
+                        ter_true.extend(true_data)
+                        ter_pred.extend(pred_data)
+                    elif data_point == "ogc":
+                        ogc_true.extend(true_data)
+                        ogc_pred.extend(pred_data)
+                    elif data_point == "performance_fee":
+                        performance_fee_true.extend(true_data)
+                        performance_fee_pred.extend(pred_data)
+                    missing_error_list.append(missing_error_data)
+        else:
+            prediction_doc_id_list = prediction_df["doc_id"].unique().tolist()
+            ground_truth_doc_id_list = ground_truth_df["doc_id"].unique().tolist()
+            doc_id_list = list(set(prediction_doc_id_list + ground_truth_doc_id_list))
+            # order by doc_id
+            doc_id_list.sort()
+
+            for doc_id in doc_id_list:
+                prediction_data = prediction_df[prediction_df["doc_id"] == doc_id]
+                ground_truth_data = ground_truth_df[ground_truth_df["doc_id"] == doc_id]
+                for data_point in data_point_list:
+                    true_data, pred_data, missing_error_data = (
+                        self.get_data_extraction_true_pred_data(
+                            doc_id, ground_truth_data, prediction_data, data_point
+                        )
+                    )
+                    if data_point == "tor":
+                        tor_true.extend(true_data)
+                        tor_pred.extend(pred_data)
+                    elif data_point == "ter":
+                        ter_true.extend(true_data)
+                        ter_pred.extend(pred_data)
+                    elif data_point == "ogc":
+                        ogc_true.extend(true_data)
+                        ogc_pred.extend(pred_data)
+                    elif data_point == "performance_fee":
+                        performance_fee_true.extend(true_data)
+                        performance_fee_pred.extend(pred_data)
+                    missing_error_list.extend(missing_error_data)
 
         metrics_list = []
         for data_point in data_point_list:
@@ -188,14 +225,17 @@ class Metrics:
             }
         )
         return missing_error_list, metrics_list
 
     def get_support_number(self, true_data: list):
         # get the count which true_data is 1
         return sum(true_data)
 
-    def get_true_pred_data(
-        self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str
-    ):
+    def get_page_filter_true_pred_data(
+        self,
+        doc_id,
+        ground_truth_data: pd.Series,
+        prediction_data: pd.Series,
+        data_point: str,
+    ):
         ground_truth_list = ground_truth_data[data_point]
         if isinstance(ground_truth_list, str):
@@ -206,10 +246,14 @@ class Metrics:
 
         true_data = []
         pred_data = []
 
-        missing_error_data = {"doc_id": doc_id, "data_point": data_point, "missing": [], "error": []}
+        missing_error_data = {
+            "doc_id": doc_id,
+            "data_point": data_point,
+            "missing": [],
+            "error": [],
+        }
 
         missing_data = []
         error_data = []
@@ -217,7 +261,7 @@ class Metrics:
             true_data.append(1)
             pred_data.append(1)
             return true_data, pred_data, missing_error_data
 
         for prediction in prediction_list:
             if prediction in ground_truth_list:
                 true_data.append(1)
@@ -232,8 +276,115 @@ class Metrics:
                 true_data.append(1)
                 pred_data.append(0)
                 missing_data.append(ground_truth)
-        missing_error_data = {"doc_id": doc_id, "data_point": data_point, "missing": missing_data, "error": error_data}
+        missing_error_data = {
+            "doc_id": doc_id,
+            "data_point": data_point,
+            "missing": missing_data,
+            "error": error_data,
+        }
+
+        return true_data, pred_data, missing_error_data
+
+    def get_data_extraction_true_pred_data(
+        self,
+        doc_id,
+        ground_truth_data: pd.DataFrame,
+        prediction_data: pd.DataFrame,
+        data_point: str,
+    ):
+        dp_ground_truth = ground_truth_data[
+            ground_truth_data["datapoint"] == data_point
+        ]
+        dp_prediction = prediction_data[prediction_data["datapoint"] == data_point]
+
+        # add new column to store unique words for dp_ground_truth
+        dp_ground_truth["unique_words"] = dp_ground_truth["raw_name"].apply(
+            get_unique_words_text
+        )
+        ground_truth_unique_words = dp_ground_truth["unique_words"].unique().tolist()
+        # add new column to store unique words for dp_prediction
+        dp_prediction["unique_words"] = dp_prediction["raw_name"].apply(
+            get_unique_words_text
+        )
+        pred_unique_words = dp_prediction["unique_words"].unique().tolist()
+
+        true_data = []
+        pred_data = []
+
+        missing_error_data = []
+
+        if len(dp_ground_truth) == 0 and len(dp_prediction) == 0:
+            true_data.append(1)
+            pred_data.append(1)
+            return true_data, pred_data, missing_error_data
+
+        for index, prediction in dp_prediction.iterrows():
+            pred_page_index = prediction["page_index"]
+            pred_raw_name = prediction["raw_name"]
+            pred_unique_words = prediction["unique_words"]
+            pred_data_point_value = prediction["value"]
+            pred_investment_type = prediction["investment_type"]
+
+            if pred_unique_words in ground_truth_unique_words:
+                # get the ground truth data with the same unique words
+                gt_data = dp_ground_truth[
+                    dp_ground_truth["unique_words"] == pred_unique_words
+                ].iloc[0]
+                gt_data_point_value = gt_data["value"]
+                if pred_data_point_value == gt_data_point_value:
+                    true_data.append(1)
+                    pred_data.append(1)
+                else:
+                    true_data.append(0)
+                    pred_data.append(1)
+                    error_data = {
+                        "doc_id": doc_id,
+                        "data_point": data_point,
+                        "page_index": pred_page_index,
+                        "pred_raw_name": pred_raw_name,
+                        "investment_type": pred_investment_type,
+                        "error_type": "data value incorrect",
+                        "error_value": pred_data_point_value,
+                        "correct_value": gt_data_point_value,
+                    }
+                    missing_error_data.append(error_data)
+            else:
+                true_data.append(0)
+                pred_data.append(1)
+                error_data = {
+                    "doc_id": doc_id,
+                    "data_point": data_point,
+                    "page_index": pred_page_index,
+                    "pred_raw_name": pred_raw_name,
+                    "investment_type": pred_investment_type,
+                    "error_type": "raw name incorrect",
+                    "error_value": pred_raw_name,
+                    "correct_value": "",
+                }
+                missing_error_data.append(error_data)
+
+        for index, ground_truth in dp_ground_truth.iterrows():
+            gt_page_index = ground_truth["page_index"]
+            gt_raw_name = ground_truth["raw_name"]
+            gt_unique_words = ground_truth["unique_words"]
+            gt_data_point_value = ground_truth["value"]
+            gt_investment_type = ground_truth["investment_type"]
+
+            if gt_unique_words not in pred_unique_words:
+                true_data.append(1)
+                pred_data.append(0)
+                error_data = {
+                    "doc_id": doc_id,
+                    "data_point": data_point,
+                    "page_index": gt_page_index,
+                    "pred_raw_name": "",
+                    "investment_type": gt_investment_type,
+                    "error_type": "raw name missing",
+                    "error_value": pred_data_point_value,
+                    "correct_value": gt_raw_name,
+                }
                missing_error_data.append(error_data)
 
         return true_data, pred_data, missing_error_data
 
     def get_specific_metrics(self, true_data: list, pred_data: list):
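For context: the new get_data_extraction_true_pred_data encodes each comparison as a pair of binary labels. A correctly extracted value contributes (1, 1), an extracted but wrong value or unmatched raw name contributes (0, 1), and a ground-truth row with no matching prediction contributes (1, 0). The body of get_specific_metrics is not part of this diff, so how exactly it consumes these lists is an assumption, but given the sklearn imports at the top of core/metrics.py a minimal sketch with made-up lists would look like this:

from sklearn.metrics import precision_score, recall_score, f1_score

# Hypothetical label lists shaped like the ones built per data point above:
# (1, 1) = value extracted and correct, (0, 1) = extracted but wrong,
# (1, 0) = present in ground truth but missed by the extraction.
true_data = [1, 1, 0, 1]
pred_data = [1, 1, 1, 0]

precision = precision_score(true_data, pred_data)  # 2 of 3 predicted positives correct -> 2/3
recall = recall_score(true_data, pred_data)        # 2 of 3 ground-truth positives found -> 2/3
f1 = f1_score(true_data, pred_data)                # harmonic mean of the two -> 2/3
support = sum(true_data)                           # same idea as get_support_number -> 3
print(precision, recall, f1, support)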
(prompt configuration JSON; file name not shown in this view)

@@ -8,10 +8,20 @@
     },
     "data_business_features": {
         "common": [
-            "Most of cases, the data is in the table(s) of context.",
-            "Fund name: a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.\nb. The sub-fund name may be as the first column values in the table.",
+            "General rules:",
+            "- Most of cases, the data is in the table(s) of context.",
+            "- Fund name: ",
+            "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.",
+            "b. The sub-fund name may be as the first column or first row values in the table.",
+            "b.1 fund name example:",
+            "- context:",
+            "Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
+            "fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+            "- Only extract the latest data from context:",
             "If with multiple data values in same row, please extract the latest.",
-            "Only output the values which with significant reported names.\nPlease exclude below reported names and relevant values: \"Management Fees\", \"Management\", \"Management Fees p.a.\", \"Taxe d Abonnement in % p.a.\".\nDON'T EXTRACT MANAGEMENT FEES!",
+            "- Reported names:",
+            "Only output the values which with significant reported names.",
+            "Please exclude below reported names and relevant values: \"Management Fees\", \"Management\", \"Management Fees p.a.\", \"Taxe d Abonnement in % p.a.\".\nDON'T EXTRACT MANAGEMENT FEES!",
             "One fund could be with multiple share classes and relevant share class level data values."
         ],
         "investment_level": {
@@ -106,7 +116,7 @@
             "Only output the data point which with relevant value.",
             "Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
             "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
-            "Ignore the data point which with -, N/A, N/A%, N/A %, NONE, etc.",
+            "Ignore the data point which value with -, *, **, N/A, N/A%, N/A %, NONE, etc.",
             "Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
             "The output should be JSON format, the format is like below example(s):"
         ],
main.py (16 changed lines)
@@ -217,6 +217,7 @@ def batch_start_job(
     special_doc_id_list: list = None,
     re_run_extract_data: bool = False,
     re_run_mapping_data: bool = False,
+    force_save_total_data: bool = False,
 ):
     pdf_files = glob(pdf_folder + "*.pdf")
     doc_list = []
@@ -249,14 +250,14 @@ def batch_start_job(
         )
         result_extract_data_list.extend(doc_data_from_gpt)
         result_mapping_data_list.extend(doc_mapping_data_list)
 
-    if special_doc_id_list is None or len(special_doc_id_list) == 0:
+    if force_save_total_data or (special_doc_id_list is None or len(special_doc_id_list) == 0):
         result_extract_data_df = pd.DataFrame(result_extract_data_list)
         result_extract_data_df.reset_index(drop=True, inplace=True)
 
         result_mappingdata_df = pd.DataFrame(result_mapping_data_list)
         result_mappingdata_df.reset_index(drop=True, inplace=True)
 
         logger.info(f"Saving extract data to {output_extract_data_total_folder}")
         os.makedirs(output_extract_data_total_folder, exist_ok=True)
         time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
@@ -283,7 +284,7 @@ def batch_start_job(
             result_extract_data_df.to_excel(
                 writer, index=False, sheet_name="extract_data"
             )
 
 
 def batch_filter_pdf_files(
     pdf_folder: str,
@@ -505,10 +506,14 @@ if __name__ == "__main__":
 
     # doc_id = "476492237"
     # extract_data(doc_id, pdf_folder, output_extract_data_child_folder, re_run)
-    special_doc_id_list = ["508854243"]
+    special_doc_id_list = [
+        "525574973",
+    ]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_mapping_data = True
 
+    force_save_total_data = False
     batch_start_job(
         pdf_folder,
         page_filter_ground_truth_file,
@@ -519,4 +524,5 @@ if __name__ == "__main__":
         special_doc_id_list,
         re_run_extract_data,
         re_run_mapping_data,
+        force_save_total_data=force_save_total_data,
     )
(document statistics script; file name not shown in this view)

@@ -113,7 +113,11 @@ def analyze_json_error():
 
 
 def statistics_document(
-    pdf_folder: str, doc_mapping_file_path: str, output_folder: str
+    pdf_folder: str,
+    doc_mapping_file_path: str,
+    sheet_name: str = "all_data",
+    output_folder: str = "/data/emea_ar/basic_information/English/",
+    output_file: str = "doc_mapping_statistics_data.xlsx"
 ):
     if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
         logger.error(f"Invalid pdf_folder: {pdf_folder}")
@@ -132,7 +136,7 @@ def statistics_document(
 
     describe_stat_df_list = []
     # statistics document mapping information
-    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name="all_data")
+    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=sheet_name)
 
     # statistics doc_mapping_data for counting FundId count based on DocumentId
     logger.info(
@@ -172,15 +176,15 @@ def statistics_document(
     )
     describe_stat_df_list.append(doc_share_class_count_stat_df)
 
-    # statistics doc_mapping_data for counting FundId count based on ProviderCompanyId and CompanyName
+    # statistics doc_mapping_data for counting FundId count based on CompanyId and CompanyName
     logger.info(
-        "statistics doc_mapping_data for counting FundId count based on ProviderCompanyId and CompanyName"
+        "statistics doc_mapping_data for counting FundId count based on CompanyId and CompanyName"
     )
     provider_fund_id_df = doc_mapping_data[
-        ["ProviderCompanyId", "CompanyName", "FundId"]
+        ["CompanyId", "CompanyName", "FundId"]
     ].drop_duplicates()
     provider_fund_count = (
-        provider_fund_id_df.groupby(["ProviderCompanyId", "CompanyName"])
+        provider_fund_id_df.groupby(["CompanyId", "CompanyName"])
         .size()
         .reset_index(name="fund_count")
     )
@@ -194,15 +198,15 @@ def statistics_document(
     )
     describe_stat_df_list.append(provider_fund_count_stat_df)
 
-    # statistics doc_mapping_data for counting FundClassId count based on ProviderCompanyId
+    # statistics doc_mapping_data for counting FundClassId count based on CompanyId
     logger.info(
-        "statistics doc_mapping_data for counting FundClassId count based on ProviderCompanyId"
+        "statistics doc_mapping_data for counting FundClassId count based on CompanyId"
     )
     provider_share_class_id_df = doc_mapping_data[
-        ["ProviderCompanyId", "CompanyName", "FundClassId"]
+        ["CompanyId", "CompanyName", "FundClassId"]
     ].drop_duplicates()
     provider_share_class_count = (
-        provider_share_class_id_df.groupby(["ProviderCompanyId", "CompanyName"])
+        provider_share_class_id_df.groupby(["CompanyId", "CompanyName"])
         .size()
         .reset_index(name="share_class_count")
     )
@@ -238,13 +242,18 @@ def statistics_document(
     )
     describe_stat_df_list.append(fund_share_class_count_stat_df)
 
-    stat_file = os.path.join(output_folder, "doc_mapping_statistics_data.xlsx")
+    stat_file = os.path.join(output_folder, output_file)
 
+    doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()]
     # statistics document page number
     pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
     logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
     logger.info("statistics document page number")
     doc_page_num_list = []
     for pdf_file in tqdm(pdf_files):
+        pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
+        if pdf_base_name not in doc_id_list:
+            continue
         docid = os.path.basename(pdf_file).split(".")[0]
         doc = fitz.open(pdf_file)
         page_num = doc.page_count
@@ -829,6 +838,46 @@ def pickup_document_from_top_100_providers():
     )
 
 
+def compare_records_count_by_document_id():
+    data_from_document = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
+    sheet_name = "mapping_data"
+    data_from_document_df = pd.read_excel(data_from_document, sheet_name=sheet_name)
+    data_from_document_df.rename(
+        columns={"doc_id": "DocumentId"}, inplace=True
+    )
+    # get the count of records by DocumentId
+    document_records_count = data_from_document_df.groupby("DocumentId").size().reset_index(name="records_count")
+
+    data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
+    sheet_name = "random_small_document_all_data"
+    data_from_database_df = pd.read_excel(data_from_database, sheet_name=sheet_name)
+    database_records_count = data_from_database_df.groupby("DocumentId").size().reset_index(name="records_count")
+
+    # merge document_records_count with database_records_count
+    records_count_compare = pd.merge(
+        document_records_count,
+        database_records_count,
+        on=["DocumentId"],
+        how="left",
+    )
+    records_count_compare["records_count_diff"] = records_count_compare["records_count_x"] - records_count_compare["records_count_y"]
+    records_count_compare = records_count_compare.sort_values(by="records_count_diff", ascending=False)
+    # rename records_count_x to records_count_document, records_count_y to records_count_database
+    records_count_compare.rename(
+        columns={"records_count_x": "records_count_document",
+                 "records_count_y": "records_count_database"}, inplace=True
+    )
+    records_count_compare.reset_index(drop=True, inplace=True)
+
+    records_count_compare_file = (
+        r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database.xlsx"
+    )
+    with pd.ExcelWriter(records_count_compare_file) as writer:
+        records_count_compare.to_excel(
+            writer, sheet_name="records_count_compare", index=False
+        )
+
+
 if __name__ == "__main__":
     doc_provider_file_path = (
         r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
@@ -845,22 +894,35 @@ if __name__ == "__main__":
     output_folder = r"/data/emea_ar/output/"
     # get_unique_docids_from_doc_provider_data(doc_provider_file_path)
     # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
-    pdf_folder = r"/data/emea_ar/small_pdf/"
+    # pdf_folder = r"/data/emea_ar/small_pdf/"
     output_folder = r"/data/emea_ar/small_pdf_txt/"
     random_small_document_data_file = (
         r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
     )
 
     # download_pdf(random_small_document_data_file, 'random_small_document', pdf_folder)
     # output_pdf_page_text(pdf_folder, output_folder)
 
     # extract_pdf_table(pdf_folder, output_folder)
     # analyze_json_error()
 
-    # statistics_document(pdf_folder, doc_mapping_file_path, basic_info_folder)
+    latest_top_100_provider_ar_data_file = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/lux_english_ar_from_top_100_provider_latest_document_with_most_mappings.xlsx"
+    # download_pdf(latest_top_100_provider_ar_data_file,
+    #              'latest_ar_document_most_mapping',
+    #              pdf_folder)
+
+    output_data_folder = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/"
+    statistics_document(pdf_folder=pdf_folder,
+                        doc_mapping_file_path=latest_top_100_provider_ar_data_file,
+                        sheet_name="latest_doc_ar_data",
+                        output_folder=output_data_folder,
+                        output_file="latest_doc_ar_mapping_statistics.xlsx")
+
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,
     #     output_folder=basic_info_folder,
     # )
     # statistics_document_fund_share_count(doc_mapping_from_top_100_provider_file)
-    pickup_document_from_top_100_providers()
+    # pickup_document_from_top_100_providers()
+    # compare_records_count_by_document_id()
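A note on the merge in compare_records_count_by_document_id: both inputs carry a records_count column, so the code relies on pandas appending its default _x/_y suffixes to the overlapping names before they are renamed. A small self-contained sketch of that behaviour, using made-up counts rather than the workbooks referenced above:

import pandas as pd

# Two frames with the same value column name, as in the merge above.
document_records_count = pd.DataFrame(
    {"DocumentId": [1, 2], "records_count": [10, 5]}
)
database_records_count = pd.DataFrame(
    {"DocumentId": [1, 2], "records_count": [8, 5]}
)

merged = pd.merge(
    document_records_count, database_records_count, on=["DocumentId"], how="left"
)
# Overlapping column names get the default suffixes, hence records_count_x / _y.
merged["records_count_diff"] = merged["records_count_x"] - merged["records_count_y"]
print(merged.columns.tolist())
# ['DocumentId', 'records_count_x', 'records_count_y', 'records_count_diff']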
(text utility functions; file name not shown in this view)

@@ -165,6 +165,16 @@ def remove_special_characters(text):
     text = text.strip()
     return text
 
 
+def get_unique_words_text(text):
+    text = remove_special_characters(text)
+    text = text.lower()
+    text_split = text.split()
+    text_split = list(set(text_split))
+    # sort the list
+    text_split.sort()
+    return_text = ' '.join(text_split)
+    return return_text
+
+
 def remove_numeric_characters(text):
     # remove numeric characters
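This word-set normalisation is what lets get_data_extraction_true_pred_data in core/metrics.py pair prediction and ground-truth rows whose raw names differ only in punctuation or word order. A small sketch of the matching effect follows; remove_special_characters is not shown in this diff, so a simplified stand-in is used, and the share-class names are made up for illustration:

import re


def remove_special_characters(text):
    # Simplified stand-in for the helper defined above this hunk,
    # which is only partially visible in the diff.
    text = re.sub(r"[^A-Za-z0-9 ]+", " ", text)
    return text.strip()


def get_unique_words_text(text):
    # Same logic as the function added in the hunk above.
    text = remove_special_characters(text)
    text = text.lower()
    text_split = text.split()
    text_split = list(set(text_split))
    text_split.sort()
    return " ".join(text_split)


# Hypothetical raw names: both normalise to the same sorted word set,
# so the prediction row and the ground-truth row can be matched.
print(get_unique_words_text("Class A (EUR) - Accumulation"))  # a accumulation class eur
print(get_unique_words_text("Accumulation Class A, EUR"))     # a accumulation class eur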