Optimize mapping metrics: pick the most similar predicted raw name when several candidates match
This commit is contained in:
parent
18174bf1cf
commit
d92053a16e
|
|
@ -3,7 +3,7 @@ import pandas as pd
|
|||
import time
|
||||
import json
|
||||
from sklearn.metrics import precision_score, recall_score, f1_score
|
||||
from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters
|
||||
from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters, simple_most_similarity_name
|
||||
from utils.sql_query_util import query_document_fund_mapping
|
||||
from utils.logger import logger
|
||||
|
||||
|
|
@ -812,6 +812,17 @@ class Metrics:
|
|||
pred_data = pred_data_df.iloc[0]
|
||||
else:
|
||||
pred_data = None
|
||||
else:
|
||||
if len(find_raw_name_in_pred) > 1:
|
||||
max_similarity_name, max_similarity = simple_most_similarity_name(gt_raw_name, find_raw_name_in_pred)
|
||||
if max_similarity_name is not None and len(max_similarity_name) > 0:
|
||||
pred_data_df = dp_prediction[
|
||||
dp_prediction["simple_raw_name"] == max_similarity_name
|
||||
]
|
||||
else:
|
||||
pred_data_df = dp_prediction[
|
||||
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
|
||||
]
|
||||
else:
|
||||
pred_data_df = dp_prediction[
|
||||
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
|
||||
|
|
|
|||
3
main.py
3
main.py
|
|
@ -700,6 +700,7 @@ if __name__ == "__main__":
|
|||
"448265376",
|
||||
"449555622",
|
||||
"449623976",
|
||||
"469138353"
|
||||
]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
|
|
@ -707,7 +708,7 @@ if __name__ == "__main__":
|
|||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = True
|
||||
re_run_mapping_data = False
|
||||
force_save_total_data = True
|
||||
|
||||
extract_ways = ["text"]
|
||||
|
|
|
|||
|
|
@ -478,6 +478,23 @@ def get_jacard_similarity(text_left,
|
|||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def simple_most_similarity_name(text: str, name_list: list):
    """
    Find the entry of name_list that is most similar to text.

    Each candidate is scored with get_jacard_similarity(text, candidate) and
    the highest-scoring candidate wins. On ties the earliest candidate in
    name_list is kept. The scan stops early once a candidate scores exactly 1.

    Args:
        text: reference text the candidates are compared against.
        name_list: candidate names to score.

    Returns:
        A tuple (best_name, best_similarity). (None, 0.0) is returned when
        text is None or blank, when name_list is None or empty, or when no
        candidate scores above 0.
    """
    if text is None or len(text.strip()) == 0 or \
            name_list is None or len(name_list) == 0:
        return None, 0.0
    # Start from a float so the "nothing matched" result has the same type
    # as the guard-clause result above.
    max_similarity = 0.0
    max_similarity_name = None
    for full_name in name_list:
        similarity = get_jacard_similarity(text, full_name)
        # Strict comparison: a later candidate must beat, not equal, the best.
        if similarity > max_similarity:
            max_similarity = similarity
            max_similarity_name = full_name
            # A score of 1 cannot be beaten under the strict comparison
            # above, so there is no point scanning the remaining candidates.
            if max_similarity == 1:
                break
    return max_similarity_name, max_similarity
|
||||
|
||||
|
||||
def get_beginning_common_words(text_list: list):
|
||||
"""
|
||||
Get the beginning common words in text_list
|
||||
|
|
|
|||
Loading…
Reference in New Issue