Support rewriting share class names from the database into a more readable form.
Examples (document 532422720): "M&G European Credit Investment Fund A CHFH Acc" -> "M&G European Credit Investment Fund A CHF H Accumulation"; "M&G European Credit Investment Fund A CHFHInc" -> "M&G European Credit Investment Fund A CHF H Income"; "M&G European High Yield Credit Investment Fund E GBPHedgedAcc" -> "M&G European High Yield Credit Investment Fund E GBP Hedged Accumulation".
This commit is contained in:
parent
2645d528b1
commit
81a424b00d
22
main.py
22
main.py
|
|
@ -636,7 +636,25 @@ def test_translate_pdf():
|
||||||
translate_pdf.start_job()
|
translate_pdf.start_job()
|
||||||
|
|
||||||
|
|
||||||
|
def test_replace_abbrevation():
    """
    Manual smoke check for replace_abbrevation.

    Runs a fixed set of M&G share class names through replace_abbrevation
    and logs each original/replaced pair. Nothing is asserted; the output
    is meant to be inspected in the log.
    """
    from utils.biz_utils import replace_abbrevation

    sample_names = (
        "M&G European Credit Investment Fund A CHFH Acc",
        "M&G European Credit Investment Fund A CHFHInc",
        "M&G European Credit Investment Fund A USDHAcc",
        "M&G European High Yield Credit Investment Fund E GBPHedgedAcc",
        "M&G Sustainable European Credit Investment Fd Cl L GBPH Acc",
        "M&G Sustainable Total Return Credit Investment Fd AI HGBPInc",
        "M&G Total Return Credit Investment Fund Class WI GBPHedgedInc",
        "M&G Total Return Credit Investment Fund Class W GBP HedgedInc",
        "M&G Total Return Credit Investment Fund Class P CHF H Acc",
        "M&G Total Return Credit Investment Fund P EUR Inc",
    )
    for original_name in sample_names:
        replaced_name = replace_abbrevation(original_name)
        logger.info(f"Original text: {original_name}, replaced text: {replaced_name}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# test_replace_abbrevation()
|
||||||
# test_translate_pdf()
|
# test_translate_pdf()
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
|
|
@ -905,10 +923,10 @@ if __name__ == "__main__":
|
||||||
"536343790"
|
"536343790"
|
||||||
]
|
]
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["534547266"]
|
special_doc_id_list = ["532422720"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
|
||||||
330
prepare_data.py
330
prepare_data.py
|
|
@ -975,7 +975,327 @@ def concat_mapping(mapping_folder: str,
|
||||||
all_data.to_excel(f, index=False)
|
all_data.to_excel(f, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def calc_typical_doc_metrics_v2(
    result_file: str = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx",
    sheet_name: str = "record_level_Results",
    final_data_file: str = r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics_v2.xlsx",
    extra_share_false_positives: int = 6,
):
    """
    Statistics metrics for typical document.
    1. Fund level datapoint: TOR
    2. Share level datapoint: OGC, TER, Performance fees
    3. Only count the records that have a document investment mapping

    Reads the record-level result sheet, keeps the rows whose "valid"
    column equals 1, expands the per-row counts into 0/1 label lists and
    writes accuracy / precision / recall / F1 / support for the four
    groups (fund raw, fund mapping, share raw, share mapping) to an Excel
    sheet named "metrics". The same figures are logged.

    Args:
        result_file: Excel workbook holding the record-level results.
        sheet_name: Sheet of result_file to read.
        final_data_file: Excel file the metrics table is written to.
        extra_share_false_positives: Number of (ground truth 0,
            prediction 1) pairs appended to both share-level label
            lists. The default of 6 reproduces the previously hard-coded
            padding.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    def expand_counts(gt_count, infer_count):
        """Expand a (ground truth, inferred) count pair into aligned 0/1 label lists."""
        # Blank cells were turned into "" by fillna; such a row contributes
        # nothing instead of reusing stale lists from the previous row.
        if len(str(gt_count)) == 0:
            return [], []
        gt_total = int(gt_count)
        infer_total = int(infer_count)

        gt_labels = [1] * gt_total
        pred_labels = [1] * infer_total  # empty when infer_total <= 0
        if len(pred_labels) < len(gt_labels):
            # Ground-truth records that were not inferred: false negatives.
            pred_labels.extend([0] * (len(gt_labels) - len(pred_labels)))
        elif len(gt_labels) < len(pred_labels):
            # Surplus inferred records: false positives. Without this padding
            # the two lists differ in length and sklearn rejects them.
            gt_labels.extend([0] * (len(pred_labels) - len(gt_labels)))
        return gt_labels, pred_labels

    def score(gt_labels, pred_labels):
        """Compute the four sklearn metrics for one label pair."""
        return {
            "accuracy": accuracy_score(gt_labels, pred_labels),
            "precision": precision_score(gt_labels, pred_labels),
            "recall": recall_score(gt_labels, pred_labels),
            "f1": f1_score(gt_labels, pred_labels),
        }

    data = pd.read_excel(result_file, sheet_name=sheet_name)
    data.fillna("", inplace=True)
    # filter data which valid is 1
    data = data[data["valid"] == 1]

    fund_raw_data_gt, fund_raw_data_pred = [], []
    fund_mapping_data_gt, fund_mapping_data_pred = [], []
    share_raw_data_gt, share_raw_data_pred = [], []
    share_mapping_data_gt, share_mapping_data_pred = [], []

    for _, row in data.iterrows():
        raw_gt_list, raw_pred_list = expand_counts(
            row["Raw data in Doc"], row["Raw data in Inference"]
        )
        mapping_gt_list, mapping_pred_list = expand_counts(
            row["data in DB"], row["data in Inferencce"]
        )

        # Every data_level other than "fund" is accumulated as share level.
        if row["data_level"] == "fund":
            fund_raw_data_gt.extend(raw_gt_list)
            fund_raw_data_pred.extend(raw_pred_list)
            fund_mapping_data_gt.extend(mapping_gt_list)
            fund_mapping_data_pred.extend(mapping_pred_list)
        else:
            share_raw_data_gt.extend(raw_gt_list)
            share_raw_data_pred.extend(raw_pred_list)
            share_mapping_data_gt.extend(mapping_gt_list)
            share_mapping_data_pred.extend(mapping_pred_list)

    # NOTE(review): fixed padding of share-level false positives, appended
    # once after all rows are processed; where the count comes from is not
    # visible in this file - confirm before changing the default.
    share_raw_data_gt.extend([0] * extra_share_false_positives)
    share_raw_data_pred.extend([1] * extra_share_false_positives)
    share_mapping_data_gt.extend([0] * extra_share_false_positives)
    share_mapping_data_pred.extend([1] * extra_share_false_positives)

    groups = [
        ("Fund_Datapoint_Raw_Data", "fund_raw_data", fund_raw_data_gt, fund_raw_data_pred),
        ("Fund_Datapoint_Mapping_Data", "fund_mapping_data", fund_mapping_data_gt, fund_mapping_data_pred),
        ("Share_Datapoint_Raw_Data", "share_raw_data", share_raw_data_gt, share_raw_data_pred),
        ("Share_Datapoint_Mapping_Data", "share_mapping_data", share_mapping_data_gt, share_mapping_data_pred),
    ]
    # Score every group before logging anything, so a scoring failure
    # surfaces before partial results are reported.
    scored_groups = [
        (title, prefix, score(gt_labels, pred_labels), len(gt_labels))
        for title, prefix, gt_labels, pred_labels in groups
    ]

    final_data = []
    for title, prefix, metrics, support in scored_groups:
        final_data.append({"title": title, **metrics, "support": support})
        for metric_name, metric_value in metrics.items():
            logger.info(f"{prefix}_{metric_name}: {metric_value}")
        logger.info(f"{prefix}_support: {support}")

    final_data_df = pd.DataFrame(final_data)
    # set column order as title, accuracy, f1, precision, recall, support
    final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]]
    # output to excel
    with pd.ExcelWriter(final_data_file) as writer:
        final_data_df.to_excel(writer, sheet_name="metrics", index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def calc_typical_doc_metrics_v1(
    result_file: str = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx",
    sheet_name: str = "record_level_Results",
    final_data_file: str = r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics.xlsx",
):
    """
    Compute fund-level and share-level metrics from the record-level sheet.

    Fund-level labels come from the verdict text in the "Raw Mapping" and
    "Share Mapping" columns; share-level labels are expanded from the
    per-row count columns. Accuracy / precision / recall / F1 / support
    for the four groups are logged (support excepted) and written to an
    Excel sheet named "metrics".

    Args:
        result_file: Excel workbook holding the record-level results.
        sheet_name: Sheet of result_file to read.
        final_data_file: Excel file the metrics table is written to.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    def labels_from_verdicts(verdicts, correct_text, incorrect_text):
        """Map verdict strings to label lists; any other value is ignored."""
        gt_labels, pred_labels = [], []
        for verdict in verdicts:
            if verdict == correct_text:
                gt_labels.append(1)
                pred_labels.append(1)
            elif verdict == incorrect_text:
                gt_labels.append(1)
                pred_labels.append(0)
        return gt_labels, pred_labels

    def labels_from_counts(gt_count, infer_count):
        """Expand one row's (ground truth, inferred) counts into aligned label lists."""
        # Rows without a ground-truth count are skipped entirely.
        if gt_count is None or len(str(gt_count)) == 0:
            return [], []
        gt_total = int(gt_count)
        infer_total = int(infer_count)

        if infer_total > 0:
            gt_labels = [1] * gt_total
            pred_labels = [1] * infer_total
        else:
            # NOTE(review): a non-positive inferred count is recorded as two
            # false positives; the reason for this choice is not visible
            # here - confirm with the author.
            pred_labels = [1, 1]
            gt_labels = [0, 0]
        if len(pred_labels) < len(gt_labels):
            # Ground-truth records that were not inferred: false negatives.
            pred_labels.extend([0] * (len(gt_labels) - len(pred_labels)))
        elif len(gt_labels) < len(pred_labels):
            # Surplus inferred records: false positives. Without this padding
            # the two lists differ in length and sklearn rejects them.
            gt_labels.extend([0] * (len(pred_labels) - len(gt_labels)))
        return gt_labels, pred_labels

    def score(gt_labels, pred_labels):
        """Compute the four sklearn metrics for one label pair."""
        return {
            "accuracy": accuracy_score(gt_labels, pred_labels),
            "precision": precision_score(gt_labels, pred_labels),
            "recall": recall_score(gt_labels, pred_labels),
            "f1": f1_score(gt_labels, pred_labels),
        }

    data = pd.read_excel(result_file, sheet_name=sheet_name)
    data.fillna("", inplace=True)

    scored_groups = []

    fund_raw_data_gt, fund_raw_data_pred = labels_from_verdicts(
        data["Raw Mapping"].tolist(), "Correct Raw mapping", "Incorrect Raw mapping"
    )
    scored_groups.append(
        ("Fund_Raw_Data", "fund_raw_data",
         score(fund_raw_data_gt, fund_raw_data_pred), len(fund_raw_data_gt))
    )

    fund_mapping_data_gt, fund_mapping_data_pred = labels_from_verdicts(
        data["Share Mapping"].tolist(), "Correct share mapping", "Incorrect share mapping"
    )
    scored_groups.append(
        ("Fund_Mapping_Data", "fund_mapping_data",
         score(fund_mapping_data_gt, fund_mapping_data_pred), len(fund_mapping_data_gt))
    )

    share_raw_data_gt, share_raw_data_pred = [], []
    share_mapping_data_gt, share_mapping_data_pred = [], []
    for _, row in data.iterrows():
        gt_list, pred_list = labels_from_counts(
            row["Raw Share in Doc"], row["Raw Share in Inference"]
        )
        share_raw_data_gt.extend(gt_list)
        share_raw_data_pred.extend(pred_list)

        gt_list, pred_list = labels_from_counts(
            row["share in DB"], row["share in Inferencce"]
        )
        share_mapping_data_gt.extend(gt_list)
        share_mapping_data_pred.extend(pred_list)

    scored_groups.append(
        ("Share_Raw_Data", "share_raw_data",
         score(share_raw_data_gt, share_raw_data_pred), len(share_raw_data_gt))
    )
    scored_groups.append(
        ("Share_Mapping_Data", "share_mapping_data",
         score(share_mapping_data_gt, share_mapping_data_pred), len(share_mapping_data_gt))
    )

    final_data = []
    for title, prefix, metrics, support in scored_groups:
        final_data.append({"title": title, **metrics, "support": support})
        for metric_name, metric_value in metrics.items():
            logger.info(f"{prefix}_{metric_name}: {metric_value}")

    final_data_df = pd.DataFrame(final_data)
    # set column order as title, accuracy, f1, precision, recall, support
    final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]]
    # output to excel
    with pd.ExcelWriter(final_data_file) as writer:
        final_data_df.to_excel(writer, sheet_name="metrics", index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# calc_typical_doc_metrics_v1()
|
||||||
|
calc_typical_doc_metrics_v2()
|
||||||
|
|
||||||
doc_provider_file_path = (
|
doc_provider_file_path = (
|
||||||
r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
|
r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
|
||||||
)
|
)
|
||||||
|
|
@ -999,11 +1319,11 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
|
doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
download_pdf(
|
# download_pdf(
|
||||||
doc_provider_file_path=doc_provider_file_path,
|
# doc_provider_file_path=doc_provider_file_path,
|
||||||
sheet_name="Sheet1",
|
# sheet_name="Sheet1",
|
||||||
doc_id_column="Document Id",
|
# doc_id_column="Document Id",
|
||||||
pdf_path=pdf_folder)
|
# pdf_path=pdf_folder)
|
||||||
# output_pdf_page_text(pdf_folder, output_folder)
|
# output_pdf_page_text(pdf_folder, output_folder)
|
||||||
|
|
||||||
# extract_pdf_table(pdf_folder, output_folder)
|
# extract_pdf_table(pdf_folder, output_folder)
|
||||||
|
|
|
||||||
|
|
@ -825,8 +825,50 @@ def replace_abbrevation(text: str):
|
||||||
elif split.lower() in ['net', 'unhgd']:
|
elif split.lower() in ['net', 'unhgd']:
|
||||||
new_text_splits.append('')
|
new_text_splits.append('')
|
||||||
else:
|
else:
|
||||||
|
split = split_short_name_with_share_features(split)
|
||||||
new_text_splits.append(split)
|
new_text_splits.append(split)
|
||||||
|
|
||||||
new_text = ' '.join(new_text_splits)
|
new_text = ' '.join(new_text_splits)
|
||||||
new_text = re.sub(r'\s+', ' ', new_text).strip()
|
new_text = re.sub(r'\s+', ' ', new_text).strip()
|
||||||
return new_text
|
return new_text
|
||||||
|
|
||||||
|
|
||||||
|
def split_short_name_with_share_features(text: str, currency_list=None):
    """
    Split a single glued share-class token into currency, remainder and
    share feature, expanding the feature abbreviation.

    Examples (document mapping for 532422720):
        CHFHInc      -> CHF H Income
        USDHAcc      -> USD H Accumulation
        GBPHInc      -> GBP H Income
        HAcc         -> H Accumulation
        GBPHedgedAcc -> GBP Hedged Accumulation
        HGBPInc      -> H GBP Income
        HNOKAcc      -> H NOK Accumulation

    Args:
        text: One whitespace-free token. None, blank text and text that
            already contains several words are returned unchanged.
        currency_list: Currency codes to look for, tried in order; the
            first one found in the token is split out. Defaults to the
            module-level total_currency_list.

    Returns:
        The token with the currency code and the expanded share feature
        separated by single spaces, in their original order.
    """
    if text is None or len(text.strip()) == 0:
        return text
    if len(text.split()) > 1:
        return text
    text = text.strip()
    if currency_list is None:
        currency_list = total_currency_list

    share_features = {
        'Acc': 'Accumulation',
        'Inc': 'Income',
        'Dist': 'Distribution',
        'Div': 'Dividend',
    }
    feature_name = ""
    for key, value in share_features.items():
        # The token must be longer than the abbreviation, so a bare "Acc"
        # is left alone.
        if len(text) > len(key) and text.endswith(key):
            feature_name = value
            # Strip only the trailing abbreviation; str.replace would also
            # delete earlier occurrences inside the token.
            text = text[:-len(key)]
            break

    parts = [text]
    for currency in currency_list:
        if currency and len(text) > len(currency) and currency in text:
            # Split around the first occurrence so the currency keeps its
            # position ("HGBP" -> "H GBP", "GBPH" -> "GBP H").
            before, _, after = text.partition(currency)
            parts = [before, currency, after]
            break
    parts.append(feature_name)

    new_text = ' '.join(parts)
    new_text = re.sub(r'\s+', ' ', new_text).strip()
    return new_text
|
||||||
Loading…
Reference in New Issue