optimize mapping metrics algorithm
This commit is contained in:
parent
18174bf1cf
commit
d92053a16e
|
|
@ -3,7 +3,7 @@ import pandas as pd
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
from sklearn.metrics import precision_score, recall_score, f1_score
|
from sklearn.metrics import precision_score, recall_score, f1_score
|
||||||
from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters
|
from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters, simple_most_similarity_name
|
||||||
from utils.sql_query_util import query_document_fund_mapping
|
from utils.sql_query_util import query_document_fund_mapping
|
||||||
from utils.logger import logger
|
from utils.logger import logger
|
||||||
|
|
||||||
|
|
@ -812,6 +812,17 @@ class Metrics:
|
||||||
pred_data = pred_data_df.iloc[0]
|
pred_data = pred_data_df.iloc[0]
|
||||||
else:
|
else:
|
||||||
pred_data = None
|
pred_data = None
|
||||||
|
else:
|
||||||
|
if len(find_raw_name_in_pred) > 1:
|
||||||
|
max_similarity_name, max_similarity = simple_most_similarity_name(gt_raw_name, find_raw_name_in_pred)
|
||||||
|
if max_similarity_name is not None and len(max_similarity_name) > 0:
|
||||||
|
pred_data_df = dp_prediction[
|
||||||
|
dp_prediction["simple_raw_name"] == max_similarity_name
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
pred_data_df = dp_prediction[
|
||||||
|
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
pred_data_df = dp_prediction[
|
pred_data_df = dp_prediction[
|
||||||
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
|
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
|
||||||
|
|
|
||||||
3
main.py
3
main.py
|
|
@ -700,6 +700,7 @@ if __name__ == "__main__":
|
||||||
"448265376",
|
"448265376",
|
||||||
"449555622",
|
"449555622",
|
||||||
"449623976",
|
"449623976",
|
||||||
|
"469138353"
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
|
|
@ -707,7 +708,7 @@ if __name__ == "__main__":
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = False
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
|
|
@ -478,6 +478,23 @@ def get_jacard_similarity(text_left,
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def simple_most_similarity_name(text: str, name_list: list):
    """
    Pick the entry of name_list that scores highest against text.

    Each candidate is scored with get_jacard_similarity(text, candidate).
    A candidate only replaces the current best when its score is strictly
    greater, so the earliest candidate wins a tie and a candidate that never
    scores above 0 is never selected.

    Args:
        text: the reference name to compare against.
        name_list: candidate names to score.

    Returns:
        A tuple (best_name, best_score).
        (None, 0.0) when text is None/blank or name_list is None/empty;
        (None, 0) when no candidate scores above 0.
    """
    # Guard clauses: nothing to compare against.
    if text is None or not text.strip():
        return None, 0.0
    if name_list is None or len(name_list) == 0:
        return None, 0.0

    best_name = None
    best_score = 0
    for candidate in name_list:
        score = get_jacard_similarity(text, candidate)
        if score <= best_score:
            continue
        best_name, best_score = candidate, score
        # A score of 1 cannot be beaten, so stop scanning early.
        if best_score == 1:
            break
    return best_name, best_score
|
||||||
|
|
||||||
|
|
||||||
def get_beginning_common_words(text_list: list):
|
def get_beginning_common_words(text_list: list):
|
||||||
"""
|
"""
|
||||||
Get the beginning common words in text_list
|
Get the beginning common words in text_list
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue