optimize mapping metrics algorithm

This commit is contained in:
Blade He 2024-10-01 12:19:45 -05:00
parent 18174bf1cf
commit d92053a16e
3 changed files with 34 additions and 5 deletions

View File

@ -3,7 +3,7 @@ import pandas as pd
import time import time
import json import json
from sklearn.metrics import precision_score, recall_score, f1_score from sklearn.metrics import precision_score, recall_score, f1_score
from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters, simple_most_similarity_name
from utils.sql_query_util import query_document_fund_mapping from utils.sql_query_util import query_document_fund_mapping
from utils.logger import logger from utils.logger import logger
@ -812,6 +812,17 @@ class Metrics:
pred_data = pred_data_df.iloc[0] pred_data = pred_data_df.iloc[0]
else: else:
pred_data = None pred_data = None
else:
if len(find_raw_name_in_pred) > 1:
max_similarity_name, max_similarity = simple_most_similarity_name(gt_raw_name, find_raw_name_in_pred)
if max_similarity_name is not None and len(max_similarity_name) > 0:
pred_data_df = dp_prediction[
dp_prediction["simple_raw_name"] == max_similarity_name
]
else:
pred_data_df = dp_prediction[
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
]
else: else:
pred_data_df = dp_prediction[ pred_data_df = dp_prediction[
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0] dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]

View File

@ -700,6 +700,7 @@ if __name__ == "__main__":
"448265376", "448265376",
"449555622", "449555622",
"449623976", "449623976",
"469138353"
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
@ -707,7 +708,7 @@ if __name__ == "__main__":
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = False
force_save_total_data = True force_save_total_data = True
extract_ways = ["text"] extract_ways = ["text"]

View File

@ -478,6 +478,23 @@ def get_jacard_similarity(text_left,
else: else:
return 0 return 0
def simple_most_similarity_name(text: str, name_list: list) -> tuple:
    """
    Pick the entry of name_list that scores highest against text.

    Each candidate is scored with get_jacard_similarity(text, candidate).
    A candidate only replaces the current best when its score is strictly
    greater, so on ties the earliest candidate in name_list wins.

    :param text: reference name to compare every candidate with
    :param name_list: candidate names to choose from
    :return: (best_name, best_score). (None, 0.0) is returned when text is
        None or blank, when name_list is None or empty, or when no
        candidate scores above 0.
    """
    # Nothing to compare: report "no match" instead of calling the scorer.
    if text is None or not text.strip():
        return None, 0.0
    if name_list is None or len(name_list) == 0:
        return None, 0.0

    # Start from a float so the score has the same type on every return
    # path (the guard clauses above already return 0.0).
    max_similarity = 0.0
    max_similarity_name = None
    for full_name in name_list:
        similarity = get_jacard_similarity(text, full_name)
        if similarity > max_similarity:
            max_similarity = similarity
            max_similarity_name = full_name
            # A score of 1 is treated as a perfect match, so the
            # remaining candidates do not need to be scored.
            if max_similarity == 1:
                break
    return max_similarity_name, max_similarity
def get_beginning_common_words(text_list: list): def get_beginning_common_words(text_list: list):
""" """
Get the beginning common words in text_list Get the beginning common words in text_list