Support replacing share class names from the database with more readable forms.

Examples from document 532422720
M&G European Credit Investment Fund A CHFH Acc -> M&G European Credit Investment Fund A CHF H Accumulation

M&G European Credit Investment Fund A CHFHInc -> M&G European Credit Investment Fund A CHF H Income

M&G European High Yield Credit Investment Fund E GBPHedgedAcc -> M&G European High Yield Credit Investment Fund E GBP Hedged Accumulation
This commit is contained in:
Blade He 2024-11-05 11:14:56 -06:00
parent 2645d528b1
commit 81a424b00d
3 changed files with 387 additions and 7 deletions

22
main.py
View File

@ -634,9 +634,27 @@ def test_translate_pdf():
output_folder = r"/data/translate/output/"
translate_pdf = Translate_PDF(pdf_file, output_folder)
translate_pdf.start_job()
def test_replace_abbrevation():
    """Smoke-check share-class abbreviation expansion on sample M&G fund names.

    Logs the original and expanded form of each sample; inspection is manual.
    """
    from utils.biz_utils import replace_abbrevation

    samples = (
        "M&G European Credit Investment Fund A CHFH Acc",
        "M&G European Credit Investment Fund A CHFHInc",
        "M&G European Credit Investment Fund A USDHAcc",
        "M&G European High Yield Credit Investment Fund E GBPHedgedAcc",
        "M&G Sustainable European Credit Investment Fd Cl L GBPH Acc",
        "M&G Sustainable Total Return Credit Investment Fd AI HGBPInc",
        "M&G Total Return Credit Investment Fund Class WI GBPHedgedInc",
        "M&G Total Return Credit Investment Fund Class W GBP HedgedInc",
        "M&G Total Return Credit Investment Fund Class P CHF H Acc",
        "M&G Total Return Credit Investment Fund P EUR Inc",
    )
    for original in samples:
        expanded = replace_abbrevation(original)
        logger.info(f"Original text: {original}, replaced text: {expanded}")
if __name__ == "__main__":
# test_replace_abbrevation()
# test_translate_pdf()
pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = (
@ -905,10 +923,10 @@ if __name__ == "__main__":
"536343790"
]
special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["534547266"]
special_doc_id_list = ["532422720"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True
re_run_extract_data = False
re_run_mapping_data = True
force_save_total_data = False
calculate_metrics = False

View File

@ -973,9 +973,329 @@ def concat_mapping(mapping_folder: str,
all_data.reset_index(drop=True, inplace=True)
with open(output_file, "wb") as f:
all_data.to_excel(f, index=False)
def _presence_lists(gt_count, infer_count):
    """Convert a ground-truth / inference record-count pair into aligned
    binary label lists.

    Every expected record contributes a 1 to the ground-truth list.  The
    prediction list carries a 1 per inferred record and is padded with 0s
    for the records the inference missed.  A blank ground-truth cell (empty
    string after ``fillna``) yields two empty lists so the row contributes
    nothing to the score.
    """
    if len(str(gt_count)) == 0:
        return [], []
    gt_list = [1] * int(gt_count)
    inferred = int(infer_count)
    pred_list = [1] * inferred if inferred > 0 else []
    # Pad misses with 0 (false negatives).  NOTE(review): if inference ever
    # reports MORE records than the ground truth, the lists stay unequal and
    # sklearn will raise — same as before; confirm over-prediction cannot
    # occur in this sheet.
    if len(pred_list) < len(gt_list):
        pred_list.extend([0] * (len(gt_list) - len(pred_list)))
    return gt_list, pred_list


def _score_group(title, log_prefix, gt_list, pred_list):
    """Compute accuracy / precision / recall / F1 for one datapoint group,
    log each value under ``log_prefix`` and return the summary row dict."""
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    entry = {"title": title,
             "accuracy": accuracy_score(gt_list, pred_list),
             "precision": precision_score(gt_list, pred_list),
             "recall": recall_score(gt_list, pred_list),
             "f1": f1_score(gt_list, pred_list),
             "support": len(gt_list)}
    logger.info(f"{log_prefix}_accuracy: {entry['accuracy']}")
    logger.info(f"{log_prefix}_precision: {entry['precision']}")
    logger.info(f"{log_prefix}_recall: {entry['recall']}")
    logger.info(f"{log_prefix}_f1: {entry['f1']}")
    logger.info(f"{log_prefix}_support: {entry['support']}")
    return entry


def calc_typical_doc_metrics_v2():
    """
    Statistics metrics for typical document.
    1. Fund level datapoint: TOR
    2. Share level datapoint: OGC, TER, Performance fees
    3. Only statistics the record which with document investment mapping

    Reads the reviewed accuracy workbook, scores fund-level and share-level
    raw/mapping extraction as binary classification, logs all metrics and
    writes a summary sheet to Excel.

    Bug fix: rows whose ground-truth count cell is blank previously
    re-appended the label lists left over from the preceding row (the loop
    variables leaked across iterations; a blank first row even raised
    NameError).  Blank rows now contribute empty lists instead.
    """
    result_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx"
    sheet_name = "record_level_Results"
    data = pd.read_excel(result_file, sheet_name=sheet_name)
    data.fillna("", inplace=True)
    # Keep only the rows the reviewer marked as valid.
    data = data[data["valid"] == 1]
    fund_raw_gt, fund_raw_pred = [], []
    fund_map_gt, fund_map_pred = [], []
    share_raw_gt, share_raw_pred = [], []
    share_map_gt, share_map_pred = [], []
    for _, row in data.iterrows():
        raw_gt, raw_pred = _presence_lists(row["Raw data in Doc"],
                                           row["Raw data in Inference"])
        # "Inferencce" matches the (misspelled) column header in the workbook.
        map_gt, map_pred = _presence_lists(row["data in DB"],
                                           row["data in Inferencce"])
        if row["data_level"] == "fund":
            fund_raw_gt.extend(raw_gt)
            fund_raw_pred.extend(raw_pred)
            fund_map_gt.extend(map_gt)
            fund_map_pred.extend(map_pred)
        else:
            share_raw_gt.extend(raw_gt)
            share_raw_pred.extend(raw_pred)
            share_map_gt.extend(map_gt)
            share_map_pred.extend(map_pred)
    # Hard-coded penalty: six share records known to be false positives that
    # have no row in the sheet.  NOTE(review): confirm these still apply.
    share_raw_gt.extend([0] * 6)
    share_raw_pred.extend([1] * 6)
    share_map_gt.extend([0] * 6)
    share_map_pred.extend([1] * 6)
    final_data = [
        _score_group("Fund_Datapoint_Raw_Data", "fund_raw_data",
                     fund_raw_gt, fund_raw_pred),
        _score_group("Fund_Datapoint_Mapping_Data", "fund_mapping_data",
                     fund_map_gt, fund_map_pred),
        _score_group("Share_Datapoint_Raw_Data", "share_raw_data",
                     share_raw_gt, share_raw_pred),
        _score_group("Share_Datapoint_Mapping_Data", "share_mapping_data",
                     share_map_gt, share_map_pred),
    ]
    final_data_df = pd.DataFrame(final_data)
    # set column order as title, accuracy, f1, precision, recall
    final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]]
    # output to excel
    final_data_file = (
        r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics_v2.xlsx"
    )
    with pd.ExcelWriter(final_data_file) as writer:
        final_data_df.to_excel(
            writer, sheet_name="metrics", index=False
        )
def calc_typical_doc_metrics_v1():
    """
    Statistics metrics for typical document (v1, superseded by
    calc_typical_doc_metrics_v2).

    Fund-level results are scored from the textual verdict columns
    ("Raw Mapping" / "Share Mapping"); share-level results are scored by
    comparing record counts between the document ground truth and the
    inference output.  Writes a metrics summary sheet to Excel.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    result_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx"
    sheet_name = "record_level_Results"
    data = pd.read_excel(result_file, sheet_name=sheet_name)
    # Blank cells become empty strings so the length checks below work.
    data.fillna("", inplace=True)
    # --- fund-level raw data: verdict strings -> binary gt/pred pairs ---
    fund_raw_data_list = data["Raw Mapping"].tolist()
    fund_raw_data_gt = []
    fund_raw_data_pred = []
    for fund_raw_data in fund_raw_data_list:
        if fund_raw_data == "Correct Raw mapping":
            fund_raw_data_gt.append(1)
            fund_raw_data_pred.append(1)
        elif fund_raw_data == "Incorrect Raw mapping":
            fund_raw_data_gt.append(1)
            fund_raw_data_pred.append(0)
        else:
            # Rows without a verdict are skipped entirely.
            pass
    fund_raw_data_accuracy = accuracy_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_precision = precision_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_recall = recall_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_f1 = f1_score(fund_raw_data_gt, fund_raw_data_pred)
    # --- fund-level mapping data: same scheme on the "Share Mapping" column ---
    fund_mapping_data_list = data["Share Mapping"].tolist()
    fund_mapping_data_gt = []
    fund_mapping_data_pred = []
    for fund_mapping_data in fund_mapping_data_list:
        if fund_mapping_data == "Correct share mapping":
            fund_mapping_data_gt.append(1)
            fund_mapping_data_pred.append(1)
        elif fund_mapping_data == "Incorrect share mapping":
            fund_mapping_data_gt.append(1)
            fund_mapping_data_pred.append(0)
        else:
            pass
    fund_mapping_data_accuracy = accuracy_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_precision = precision_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_recall = recall_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_f1 = f1_score(fund_mapping_data_gt, fund_mapping_data_pred)
    # --- share-level: compare record counts per row ---
    share_raw_data_gt = []
    share_raw_data_pred = []
    share_mapping_data_gt = []
    share_mapping_data_pred = []
    for idx, row in data.iterrows():
        share_raw_data_infer_count = row["Raw Share in Inference"]
        share_raw_data_gt_count = row["Raw Share in Doc"]
        # Only score rows that carry a ground-truth count.
        if share_raw_data_gt_count is not None and \
           len(str(share_raw_data_gt_count)) > 0:
            share_raw_data_gt_count = int(share_raw_data_gt_count)
            share_raw_data_infer_count = int(share_raw_data_infer_count)
            gt_list = [1 for i in range(share_raw_data_gt_count)]
            if share_raw_data_infer_count > 0:
                pred_list = [1 for i in range(share_raw_data_infer_count)]
            else:
                # NOTE(review): when inference found nothing, the ground
                # truth is discarded and replaced by two false positives
                # ([0,0] vs [1,1]) instead of recording misses — looks like a
                # deliberate penalty, but verify the intent.
                pred_list = [1, 1]
                gt_list = [0, 0]
            # Pad missed records with 0 (false negatives).
            if len(pred_list) < len(gt_list):
                pred_list.extend([0 for i in range(len(gt_list) - len(pred_list))])
            share_raw_data_gt.extend(gt_list)
            share_raw_data_pred.extend(pred_list)
        # "Inferencce" matches the (misspelled) column header in the workbook.
        share_mapping_data_infer_count = row["share in Inferencce"]
        share_mapping_data_gt_count = row["share in DB"]
        if share_mapping_data_gt_count is not None and \
           len(str(share_mapping_data_gt_count)) > 0:
            share_mapping_data_gt_count = int(share_mapping_data_gt_count)
            share_mapping_data_infer_count = int(share_mapping_data_infer_count)
            gt_list = [1 for i in range(share_mapping_data_gt_count)]
            if share_mapping_data_infer_count > 0:
                pred_list = [1 for i in range(share_mapping_data_infer_count)]
            else:
                # Same two-false-positive substitution as above.
                pred_list = [1, 1]
                gt_list = [0, 0]
            if len(pred_list) < len(gt_list):
                pred_list.extend([0 for i in range(len(gt_list) - len(pred_list))])
            share_mapping_data_gt.extend(gt_list)
            share_mapping_data_pred.extend(pred_list)
    share_raw_data_accuracy = accuracy_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_precision = precision_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_recall = recall_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_f1 = f1_score(share_raw_data_gt, share_raw_data_pred)
    share_mapping_data_accuracy = accuracy_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_precision = precision_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_recall = recall_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_f1 = f1_score(share_mapping_data_gt, share_mapping_data_pred)
    # --- assemble the summary rows, logging each metric as we go ---
    final_data = []
    fund_raw_data_metrics = {"title": "Fund_Raw_Data",
                             "accuracy": fund_raw_data_accuracy,
                             "precision": fund_raw_data_precision,
                             "recall": fund_raw_data_recall,
                             "f1": fund_raw_data_f1,
                             "support": len(fund_raw_data_gt)}
    final_data.append(fund_raw_data_metrics)
    logger.info(f"fund_raw_data_accuracy: {fund_raw_data_accuracy}")
    logger.info(f"fund_raw_data_precision: {fund_raw_data_precision}")
    logger.info(f"fund_raw_data_recall: {fund_raw_data_recall}")
    logger.info(f"fund_raw_data_f1: {fund_raw_data_f1}")
    fund_mapping_data_metrics = {"title": "Fund_Mapping_Data",
                                 "accuracy": fund_mapping_data_accuracy,
                                 "precision": fund_mapping_data_precision,
                                 "recall": fund_mapping_data_recall,
                                 "f1": fund_mapping_data_f1,
                                 "support": len(fund_mapping_data_gt)}
    final_data.append(fund_mapping_data_metrics)
    logger.info(f"fund_mapping_data_accuracy: {fund_mapping_data_accuracy}")
    logger.info(f"fund_mapping_data_precision: {fund_mapping_data_precision}")
    logger.info(f"fund_mapping_data_recall: {fund_mapping_data_recall}")
    logger.info(f"fund_mapping_data_f1: {fund_mapping_data_f1}")
    share_raw_data_metrics = {"title": "Share_Raw_Data",
                              "accuracy": share_raw_data_accuracy,
                              "precision": share_raw_data_precision,
                              "recall": share_raw_data_recall,
                              "f1": share_raw_data_f1,
                              "support": len(share_raw_data_gt)}
    final_data.append(share_raw_data_metrics)
    logger.info(f"share_raw_data_accuracy: {share_raw_data_accuracy}")
    logger.info(f"share_raw_data_precision: {share_raw_data_precision}")
    logger.info(f"share_raw_data_recall: {share_raw_data_recall}")
    logger.info(f"share_raw_data_f1: {share_raw_data_f1}")
    share_mapping_data_metrics = {"title": "Share_Mapping_Data",
                                  "accuracy": share_mapping_data_accuracy,
                                  "precision": share_mapping_data_precision,
                                  "recall": share_mapping_data_recall,
                                  "f1": share_mapping_data_f1,
                                  "support": len(share_mapping_data_gt)}
    final_data.append(share_mapping_data_metrics)
    logger.info(f"share_mapping_data_accuracy: {share_mapping_data_accuracy}")
    logger.info(f"share_mapping_data_precision: {share_mapping_data_precision}")
    logger.info(f"share_mapping_data_recall: {share_mapping_data_recall}")
    logger.info(f"share_mapping_data_f1: {share_mapping_data_f1}")
    final_data_df = pd.DataFrame(final_data)
    # set column order as title, accuracy, f1, precision, recall
    final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]]
    # output to excel
    final_data_file = (
        r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics.xlsx"
    )
    with pd.ExcelWriter(final_data_file) as writer:
        final_data_df.to_excel(
            writer, sheet_name="metrics", index=False
        )
if __name__ == "__main__":
# calc_typical_doc_metrics_v1()
calc_typical_doc_metrics_v2()
doc_provider_file_path = (
r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
)
@ -999,11 +1319,11 @@ if __name__ == "__main__":
doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
pdf_folder = r"/data/emea_ar/pdf/"
download_pdf(
doc_provider_file_path=doc_provider_file_path,
sheet_name="Sheet1",
doc_id_column="Document Id",
pdf_path=pdf_folder)
# download_pdf(
# doc_provider_file_path=doc_provider_file_path,
# sheet_name="Sheet1",
# doc_id_column="Document Id",
# pdf_path=pdf_folder)
# output_pdf_page_text(pdf_folder, output_folder)
# extract_pdf_table(pdf_folder, output_folder)

View File

@ -825,8 +825,50 @@ def replace_abbrevation(text: str):
elif split.lower() in ['net', 'unhgd']:
new_text_splits.append('')
else:
split = split_short_name_with_share_features(split)
new_text_splits.append(split)
new_text = ' '.join(new_text_splits)
new_text = re.sub(r'\s+', ' ', new_text).strip()
return new_text
def split_short_name_with_share_features(text: str):
    """
    Split short name with share features.

    Expands a compact single-token share-class fragment into readable words,
    for examples (document mapping for 532422720):
        CHFHInc       -> CHF H Income
        USDHAcc       -> USD H Accumulation
        GBPHInc       -> GBP H Income
        HAcc          -> H Accumulation
        GBPHedgedAcc  -> GBP Hedged Accumulation
        HNOKAcc       -> NOK H Accumulation

    NOTE(review): the currency is always emitted first, so an H-prefixed
    token like HGBPInc becomes "GBP H Income" (not "H GBP Income") — confirm
    this ordering is intended.

    None, empty/whitespace-only, or already multi-word input is returned
    unchanged.
    """
    if text is None or len(text.strip()) == 0:
        return text
    # Already split into words: nothing to expand.
    if len(text.split()) > 1:
        return text
    text = text.strip()
    share_features = {'Acc': 'Accumulation',
                      'Inc': 'Income',
                      'Dist': 'Distribution',
                      'Div': 'Dividend',}
    feature_name = ""
    for key, value in share_features.items():
        # len(text) > len(key): a bare feature word ("Acc") stays untouched.
        if len(text) > len(key) and text.endswith(key):
            feature_name = value
            # Strip only the verified suffix; replace() would also delete any
            # earlier occurrence of the abbreviation inside the token.
            text = text[:-len(key)]
            break
    currency_text = ""
    for currency in total_currency_list:
        if len(text) > len(currency) and currency in text:
            currency_text = currency
            # Remove only the first occurrence, keeping the remainder intact.
            text = text.replace(currency, '', 1)
            break
    new_text = currency_text + ' ' + text + ' ' + feature_name
    new_text = re.sub(r'\s+', ' ', new_text).strip()
    return new_text