From d92053a16eef19bf5c790b0fa28f19dd74b36fce Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 1 Oct 2024 12:19:45 -0500 Subject: [PATCH] optimize mapping metrics algorithm --- core/metrics.py | 19 +++++++++++++++---- main.py | 3 ++- utils/biz_utils.py | 17 +++++++++++++++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/core/metrics.py b/core/metrics.py index be31215..43707d0 100644 --- a/core/metrics.py +++ b/core/metrics.py @@ -3,7 +3,7 @@ import pandas as pd import time import json from sklearn.metrics import precision_score, recall_score, f1_score -from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters +from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters, simple_most_similarity_name from utils.sql_query_util import query_document_fund_mapping from utils.logger import logger @@ -813,9 +813,20 @@ class Metrics: else: pred_data = None else: - pred_data_df = dp_prediction[ - dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0] - ] + if len(find_raw_name_in_pred) > 1: + max_similarity_name, max_similarity = simple_most_similarity_name(gt_raw_name, find_raw_name_in_pred) + if max_similarity_name is not None and len(max_similarity_name) > 0: + pred_data_df = dp_prediction[ + dp_prediction["simple_raw_name"] == max_similarity_name + ] + else: + pred_data_df = dp_prediction[ + dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0] + ] + else: + pred_data_df = dp_prediction[ + dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0] + ] if len(pred_data_df) > 1: if ( len(pred_data_df[pred_data_df["page_index"] == gt_page_index]) diff --git a/main.py b/main.py index 8934327..5728e12 100644 --- a/main.py +++ b/main.py @@ -700,6 +700,7 @@ if __name__ == "__main__": "448265376", "449555622", "449623976", + "469138353" ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list @@ -707,7 +708,7 
def simple_most_similarity_name(text: str, name_list: list) -> tuple:
    """
    Pick the entry of name_list that is most similar to text.

    Each candidate is scored with get_jacard_similarity(text, candidate) and
    the candidate with the strictly highest score wins, so on a tie the
    earliest candidate is kept. A score of 1 is treated as a perfect match
    and stops the search early.

    Args:
        text: the reference name to compare against.
        name_list: candidate names to choose from.

    Returns:
        (best_name, best_score). (None, 0.0) is returned when text is not a
        non-blank string, when name_list is None or empty, or when no
        candidate scores above 0.
    """
    # Non-string input (None, or e.g. a NaN read from a DataFrame cell)
    # cannot be stripped or compared, so it is reported as "no match"
    # instead of raising.
    if not isinstance(text, str) or len(text.strip()) == 0:
        return None, 0.0
    # len() is used rather than truthiness so array-like inputs keep working.
    if name_list is None or len(name_list) == 0:
        return None, 0.0

    # Start from a float so the "no match" score has the same type as the
    # guard-clause result above.
    max_similarity = 0.0
    max_similarity_name = None
    for full_name in name_list:
        similarity = get_jacard_similarity(text, full_name)
        if similarity > max_similarity:
            max_similarity = similarity
            max_similarity_name = full_name
            # The best score only changes here, so this is the only place
            # the perfect-match early exit needs to be checked.
            if max_similarity == 1:
                break
    return max_similarity_name, max_similarity