From 81a424b00d5476c1b8a1d494f96afcb5f4302db6 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 5 Nov 2024 11:14:56 -0600 Subject: [PATCH] Support replacing share class names in the database to make them more readable. Examples (document 532422720): M&G European Credit Investment Fund A CHFH Acc -> M&G European Credit Investment Fund A CHF H Accumulation M&G European Credit Investment Fund A CHFHInc -> M&G European Credit Investment Fund A CHF H Income M&G European High Yield Credit Investment Fund E GBPHedgedAcc -> M&G European High Yield Credit Investment Fund E GBP Hedged Accumulation --- main.py | 22 ++- prepare_data.py | 330 ++++++++++++++++++++++++++++++++++++++++++++- utils/biz_utils.py | 42 ++++++ 3 files changed, 387 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 8aa0ab8..aba2060 100644 --- a/main.py +++ b/main.py @@ -634,9 +634,27 @@ def test_translate_pdf(): output_folder = r"/data/translate/output/" translate_pdf = Translate_PDF(pdf_file, output_folder) translate_pdf.start_job() + + +def test_replace_abbrevation(): + from utils.biz_utils import replace_abbrevation + text_list= ["M&G European Credit Investment Fund A CHFH Acc", + "M&G European Credit Investment Fund A CHFHInc", + "M&G European Credit Investment Fund A USDHAcc", + "M&G European High Yield Credit Investment Fund E GBPHedgedAcc", + "M&G Sustainable European Credit Investment Fd Cl L GBPH Acc", + "M&G Sustainable Total Return Credit Investment Fd AI HGBPInc", + "M&G Total Return Credit Investment Fund Class WI GBPHedgedInc", + "M&G Total Return Credit Investment Fund Class W GBP HedgedInc", + "M&G Total Return Credit Investment Fund Class P CHF H Acc", + "M&G Total Return Credit Investment Fund P EUR Inc"] + for text in text_list: + result = replace_abbrevation(text) + logger.info(f"Original text: {text}, replaced text: {result}") if __name__ == "__main__": + # test_replace_abbrevation() # test_translate_pdf() pdf_folder = r"/data/emea_ar/pdf/" page_filter_ground_truth_file = ( @@ 
-905,10 +923,10 @@ if __name__ == "__main__": "536343790" ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["534547266"] + special_doc_id_list = ["532422720"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = True + re_run_extract_data = False re_run_mapping_data = True force_save_total_data = False calculate_metrics = False diff --git a/prepare_data.py b/prepare_data.py index 5da0a56..c728819 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -973,9 +973,329 @@ def concat_mapping(mapping_folder: str, all_data.reset_index(drop=True, inplace=True) with open(output_file, "wb") as f: all_data.to_excel(f, index=False) + + +def calc_typical_doc_metrics_v2(): + """ + Statistics metrics for typical document. + 1. Fund level datapoint: TOR + 2. Share level datapoint: OGC, TER, Performance fees + 3. Only statistics the record which with document investment mapping + """ + from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score + result_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx" + sheet_name = "record_level_Results" + data = pd.read_excel(result_file, sheet_name=sheet_name) + data.fillna("", inplace=True) + # filter data which valid is 1 + data = data[data["valid"] == 1] + + fund_raw_data_gt = [] + fund_raw_data_pred = [] + + fund_mapping_data_gt = [] + fund_mapping_data_pred = [] + + share_raw_data_gt = [] + share_raw_data_pred = [] + + share_mapping_data_gt = [] + share_mapping_data_pred = [] + + for idx, row in data.iterrows(): + raw_data_gt_count = row["Raw data in Doc"] + raw_data_infer_count = row["Raw data in Inference"] + if len(str(raw_data_gt_count)) > 0: + raw_data_gt_count = int(raw_data_gt_count) + raw_data_infer_count = int(raw_data_infer_count) + + raw_gt_list = [1 for i in range(raw_data_gt_count)] + 
raw_pred_list = [] + if raw_data_infer_count > 0: + raw_pred_list = [1 for i in range(raw_data_infer_count)] + if len(raw_pred_list) < len(raw_gt_list): + raw_pred_list.extend([0 for i in range(len(raw_gt_list) - len(raw_pred_list))]) + + mapping_data_gt_count = row["data in DB"] + mapping_data_infer_count = row["data in Inferencce"] + if len(str(mapping_data_gt_count)) > 0: + mapping_data_gt_count = int(mapping_data_gt_count) + mapping_data_infer_count = int(mapping_data_infer_count) + + mapping_gt_list = [1 for i in range(mapping_data_gt_count)] + mapping_pred_list = [] + if mapping_data_infer_count > 0: + mapping_pred_list = [1 for i in range(mapping_data_infer_count)] + if len(mapping_pred_list) < len(mapping_gt_list): + mapping_pred_list.extend([0 for i in range(len(mapping_gt_list) - len(mapping_pred_list))]) + + + data_level = row["data_level"] + if data_level == "fund": + fund_raw_data_gt.extend(raw_gt_list) + fund_raw_data_pred.extend(raw_pred_list) + + fund_mapping_data_gt.extend(mapping_gt_list) + fund_mapping_data_pred.extend(mapping_pred_list) + else: + share_raw_data_gt.extend(raw_gt_list) + share_raw_data_pred.extend(raw_pred_list) + + share_mapping_data_gt.extend(mapping_gt_list) + share_mapping_data_pred.extend(mapping_pred_list) + + share_raw_data_gt.extend([0, 0, 0, 0, 0, 0]) + share_raw_data_pred.extend([1, 1, 1, 1, 1, 1]) + + share_mapping_data_gt.extend([0, 0, 0, 0, 0, 0]) + share_mapping_data_pred.extend([1, 1, 1, 1, 1, 1]) + + fund_raw_data_accuracy = accuracy_score(fund_raw_data_gt, fund_raw_data_pred) + fund_raw_data_precision = precision_score(fund_raw_data_gt, fund_raw_data_pred) + fund_raw_data_recall = recall_score(fund_raw_data_gt, fund_raw_data_pred) + fund_raw_data_f1 = f1_score(fund_raw_data_gt, fund_raw_data_pred) + + fund_mapping_data_accuracy = accuracy_score(fund_mapping_data_gt, fund_mapping_data_pred) + fund_mapping_data_precision = precision_score(fund_mapping_data_gt, fund_mapping_data_pred) + fund_mapping_data_recall = 
recall_score(fund_mapping_data_gt, fund_mapping_data_pred) + fund_mapping_data_f1 = f1_score(fund_mapping_data_gt, fund_mapping_data_pred) + + share_raw_data_accuracy = accuracy_score(share_raw_data_gt, share_raw_data_pred) + share_raw_data_precision = precision_score(share_raw_data_gt, share_raw_data_pred) + share_raw_data_recall = recall_score(share_raw_data_gt, share_raw_data_pred) + share_raw_data_f1 = f1_score(share_raw_data_gt, share_raw_data_pred) + + share_mapping_data_accuracy = accuracy_score(share_mapping_data_gt, share_mapping_data_pred) + share_mapping_data_precision = precision_score(share_mapping_data_gt, share_mapping_data_pred) + share_mapping_data_recall = recall_score(share_mapping_data_gt, share_mapping_data_pred) + share_mapping_data_f1 = f1_score(share_mapping_data_gt, share_mapping_data_pred) + + final_data = [] + + fund_raw_data_metrics = {"title": "Fund_Datapoint_Raw_Data", + "accuracy": fund_raw_data_accuracy, + "precision": fund_raw_data_precision, + "recall": fund_raw_data_recall, + "f1": fund_raw_data_f1, + "support": len(fund_raw_data_gt)} + final_data.append(fund_raw_data_metrics) + logger.info(f"fund_raw_data_accuracy: {fund_raw_data_accuracy}") + logger.info(f"fund_raw_data_precision: {fund_raw_data_precision}") + logger.info(f"fund_raw_data_recall: {fund_raw_data_recall}") + logger.info(f"fund_raw_data_f1: {fund_raw_data_f1}") + logger.info(f"fund_raw_data_support: {len(fund_raw_data_gt)}") + + fund_mapping_data_metrics = {"title": "Fund_Datapoint_Mapping_Data", + "accuracy": fund_mapping_data_accuracy, + "precision": fund_mapping_data_precision, + "recall": fund_mapping_data_recall, + "f1": fund_mapping_data_f1, + "support": len(fund_mapping_data_gt)} + final_data.append(fund_mapping_data_metrics) + logger.info(f"fund_mapping_data_accuracy: {fund_mapping_data_accuracy}") + logger.info(f"fund_mapping_data_precision: {fund_mapping_data_precision}") + logger.info(f"fund_mapping_data_recall: {fund_mapping_data_recall}") + 
logger.info(f"fund_mapping_data_f1: {fund_mapping_data_f1}") + logger.info(f"fund_mapping_data_support: {len(fund_mapping_data_gt)}") + + share_raw_data_metrics = {"title": "Share_Datapoint_Raw_Data", + "accuracy": share_raw_data_accuracy, + "precision": share_raw_data_precision, + "recall": share_raw_data_recall, + "f1": share_raw_data_f1, + "support": len(share_raw_data_gt)} + final_data.append(share_raw_data_metrics) + logger.info(f"share_raw_data_accuracy: {share_raw_data_accuracy}") + logger.info(f"share_raw_data_precision: {share_raw_data_precision}") + logger.info(f"share_raw_data_recall: {share_raw_data_recall}") + logger.info(f"share_raw_data_f1: {share_raw_data_f1}") + logger.info(f"share_raw_data_support: {len(share_raw_data_gt)}") + + share_mapping_data_metrics = {"title": "Share_Datapoint_Mapping_Data", + "accuracy": share_mapping_data_accuracy, + "precision": share_mapping_data_precision, + "recall": share_mapping_data_recall, + "f1": share_mapping_data_f1, + "support": len(share_mapping_data_gt)} + final_data.append(share_mapping_data_metrics) + logger.info(f"share_mapping_data_accuracy: {share_mapping_data_accuracy}") + logger.info(f"share_mapping_data_precision: {share_mapping_data_precision}") + logger.info(f"share_mapping_data_recall: {share_mapping_data_recall}") + logger.info(f"share_mapping_data_f1: {share_mapping_data_f1}") + logger.info(f"share_mapping_data_support: {len(share_mapping_data_gt)}") + + final_data_df = pd.DataFrame(final_data) + # set column order as title, accuracy, f1, precision, recall + final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]] + # output to excel + final_data_file = ( + r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics_v2.xlsx" + ) + with pd.ExcelWriter(final_data_file) as writer: + final_data_df.to_excel( + writer, sheet_name="metrics", index=False + ) + + +def calc_typical_doc_metrics_v1(): + from sklearn.metrics import 
accuracy_score, precision_score, recall_score, f1_score + result_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx" + sheet_name = "record_level_Results" + data = pd.read_excel(result_file, sheet_name=sheet_name) + data.fillna("", inplace=True) + fund_raw_data_list = data["Raw Mapping"].tolist() + fund_raw_data_gt = [] + fund_raw_data_pred = [] + for fund_raw_data in fund_raw_data_list: + if fund_raw_data == "Correct Raw mapping": + fund_raw_data_gt.append(1) + fund_raw_data_pred.append(1) + elif fund_raw_data == "Incorrect Raw mapping": + fund_raw_data_gt.append(1) + fund_raw_data_pred.append(0) + else: + pass + fund_raw_data_accuracy = accuracy_score(fund_raw_data_gt, fund_raw_data_pred) + fund_raw_data_precision = precision_score(fund_raw_data_gt, fund_raw_data_pred) + fund_raw_data_recall = recall_score(fund_raw_data_gt, fund_raw_data_pred) + fund_raw_data_f1 = f1_score(fund_raw_data_gt, fund_raw_data_pred) + + fund_mapping_data_list = data["Share Mapping"].tolist() + fund_mapping_data_gt = [] + fund_mapping_data_pred = [] + for fund_mapping_data in fund_mapping_data_list: + if fund_mapping_data == "Correct share mapping": + fund_mapping_data_gt.append(1) + fund_mapping_data_pred.append(1) + elif fund_mapping_data == "Incorrect share mapping": + fund_mapping_data_gt.append(1) + fund_mapping_data_pred.append(0) + else: + pass + fund_mapping_data_accuracy = accuracy_score(fund_mapping_data_gt, fund_mapping_data_pred) + fund_mapping_data_precision = precision_score(fund_mapping_data_gt, fund_mapping_data_pred) + fund_mapping_data_recall = recall_score(fund_mapping_data_gt, fund_mapping_data_pred) + fund_mapping_data_f1 = f1_score(fund_mapping_data_gt, fund_mapping_data_pred) + + share_raw_data_gt = [] + share_raw_data_pred = [] + + share_mapping_data_gt = [] + share_mapping_data_pred = [] + for idx, row in data.iterrows(): + share_raw_data_infer_count = row["Raw Share in Inference"] + 
share_raw_data_gt_count = row["Raw Share in Doc"] + if share_raw_data_gt_count is not None and \ + len(str(share_raw_data_gt_count)) > 0: + share_raw_data_gt_count = int(share_raw_data_gt_count) + share_raw_data_infer_count = int(share_raw_data_infer_count) + + gt_list = [1 for i in range(share_raw_data_gt_count)] + if share_raw_data_infer_count > 0: + pred_list = [1 for i in range(share_raw_data_infer_count)] + else: + pred_list = [1, 1] + gt_list = [0, 0] + if len(pred_list) < len(gt_list): + pred_list.extend([0 for i in range(len(gt_list) - len(pred_list))]) + share_raw_data_gt.extend(gt_list) + share_raw_data_pred.extend(pred_list) + + share_mapping_data_infer_count = row["share in Inferencce"] + share_mapping_data_gt_count = row["share in DB"] + if share_mapping_data_gt_count is not None and \ + len(str(share_mapping_data_gt_count)) > 0: + share_mapping_data_gt_count = int(share_mapping_data_gt_count) + share_mapping_data_infer_count = int(share_mapping_data_infer_count) + + gt_list = [1 for i in range(share_mapping_data_gt_count)] + if share_mapping_data_infer_count > 0: + pred_list = [1 for i in range(share_mapping_data_infer_count)] + else: + pred_list = [1, 1] + gt_list = [0, 0] + if len(pred_list) < len(gt_list): + pred_list.extend([0 for i in range(len(gt_list) - len(pred_list))]) + share_mapping_data_gt.extend(gt_list) + share_mapping_data_pred.extend(pred_list) + share_raw_data_accuracy = accuracy_score(share_raw_data_gt, share_raw_data_pred) + share_raw_data_precision = precision_score(share_raw_data_gt, share_raw_data_pred) + share_raw_data_recall = recall_score(share_raw_data_gt, share_raw_data_pred) + share_raw_data_f1 = f1_score(share_raw_data_gt, share_raw_data_pred) + + share_mapping_data_accuracy = accuracy_score(share_mapping_data_gt, share_mapping_data_pred) + share_mapping_data_precision = precision_score(share_mapping_data_gt, share_mapping_data_pred) + share_mapping_data_recall = recall_score(share_mapping_data_gt, share_mapping_data_pred) 
+ share_mapping_data_f1 = f1_score(share_mapping_data_gt, share_mapping_data_pred) + + final_data = [] + + fund_raw_data_metrics = {"title": "Fund_Raw_Data", + "accuracy": fund_raw_data_accuracy, + "precision": fund_raw_data_precision, + "recall": fund_raw_data_recall, + "f1": fund_raw_data_f1, + "support": len(fund_raw_data_gt)} + final_data.append(fund_raw_data_metrics) + logger.info(f"fund_raw_data_accuracy: {fund_raw_data_accuracy}") + logger.info(f"fund_raw_data_precision: {fund_raw_data_precision}") + logger.info(f"fund_raw_data_recall: {fund_raw_data_recall}") + logger.info(f"fund_raw_data_f1: {fund_raw_data_f1}") + + fund_mapping_data_metrics = {"title": "Fund_Mapping_Data", + "accuracy": fund_mapping_data_accuracy, + "precision": fund_mapping_data_precision, + "recall": fund_mapping_data_recall, + "f1": fund_mapping_data_f1, + "support": len(fund_mapping_data_gt)} + final_data.append(fund_mapping_data_metrics) + logger.info(f"fund_mapping_data_accuracy: {fund_mapping_data_accuracy}") + logger.info(f"fund_mapping_data_precision: {fund_mapping_data_precision}") + logger.info(f"fund_mapping_data_recall: {fund_mapping_data_recall}") + logger.info(f"fund_mapping_data_f1: {fund_mapping_data_f1}") + + share_raw_data_metrics = {"title": "Share_Raw_Data", + "accuracy": share_raw_data_accuracy, + "precision": share_raw_data_precision, + "recall": share_raw_data_recall, + "f1": share_raw_data_f1, + "support": len(share_raw_data_gt)} + final_data.append(share_raw_data_metrics) + logger.info(f"share_raw_data_accuracy: {share_raw_data_accuracy}") + logger.info(f"share_raw_data_precision: {share_raw_data_precision}") + logger.info(f"share_raw_data_recall: {share_raw_data_recall}") + logger.info(f"share_raw_data_f1: {share_raw_data_f1}") + + share_mapping_data_metrics = {"title": "Share_Mapping_Data", + "accuracy": share_mapping_data_accuracy, + "precision": share_mapping_data_precision, + "recall": share_mapping_data_recall, + "f1": share_mapping_data_f1, + "support": 
len(share_mapping_data_gt)} + final_data.append(share_mapping_data_metrics) + logger.info(f"share_mapping_data_accuracy: {share_mapping_data_accuracy}") + logger.info(f"share_mapping_data_precision: {share_mapping_data_precision}") + logger.info(f"share_mapping_data_recall: {share_mapping_data_recall}") + logger.info(f"share_mapping_data_f1: {share_mapping_data_f1}") + + final_data_df = pd.DataFrame(final_data) + # set column order as title, accuracy, f1, precision, recall + final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]] + # output to excel + final_data_file = ( + r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics.xlsx" + ) + with pd.ExcelWriter(final_data_file) as writer: + final_data_df.to_excel( + writer, sheet_name="metrics", index=False + ) + if __name__ == "__main__": + # calc_typical_doc_metrics_v1() + calc_typical_doc_metrics_v2() + doc_provider_file_path = ( r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx" ) @@ -999,11 +1319,11 @@ if __name__ == "__main__": doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx" pdf_folder = r"/data/emea_ar/pdf/" - download_pdf( - doc_provider_file_path=doc_provider_file_path, - sheet_name="Sheet1", - doc_id_column="Document Id", - pdf_path=pdf_folder) + # download_pdf( + # doc_provider_file_path=doc_provider_file_path, + # sheet_name="Sheet1", + # doc_id_column="Document Id", + # pdf_path=pdf_folder) # output_pdf_page_text(pdf_folder, output_folder) # extract_pdf_table(pdf_folder, output_folder) diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 22e780c..2ade806 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -825,8 +825,50 @@ def replace_abbrevation(text: str): elif split.lower() in ['net', 'unhgd']: new_text_splits.append('') else: + split = split_short_name_with_share_features(split) 
new_text_splits.append(split) new_text = ' '.join(new_text_splits) new_text = re.sub(r'\s+', ' ', new_text).strip() + return new_text + + +def split_short_name_with_share_features(text: str): + """ + Split short name with share features, + for examples: + Document mapping for 532422720 + CHFHInc to be CHF H Income + USDHAcc to be USD H Accumulation + GBPHInc to be GBP H Income + HAcc to be H Accumulation + GBPHedgedAcc to be GBP Hedged Accumulation + HGBPInc to be H GBP Income + HNOKAcc to be H NOK Accumulation + """ + if text is None or len(text.strip()) == 0: + return text + if len(text.split()) > 1: + return text + text = text.strip() + share_features = {'Acc': 'Accumulation', + 'Inc': 'Income', + 'Dist': 'Distribution', + 'Div': 'Dividend',} + feature_name = "" + for key, value in share_features.items(): + if len(text) > len(key) and text.endswith(key): + feature_name = value + text = text.replace(key, '') + break + + currency_text = "" + for currency in total_currency_list: + if len(text) > len(currency) and currency in text: + currency_text = currency + text = text.replace(currency, '') + break + + new_text = currency_text + ' ' + text + ' ' + feature_name + new_text = re.sub(r'\s+', ' ', new_text).strip() return new_text \ No newline at end of file