optimize mapping metrics algorithm

This commit is contained in:
Blade He 2024-10-01 12:19:45 -05:00
parent 18174bf1cf
commit d92053a16e
3 changed files with 34 additions and 5 deletions

View File

@@ -3,7 +3,7 @@ import pandas as pd
import time
import json
from sklearn.metrics import precision_score, recall_score, f1_score
from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters
from utils.biz_utils import get_unique_words_text, get_beginning_common_words, remove_special_characters, simple_most_similarity_name
from utils.sql_query_util import query_document_fund_mapping
from utils.logger import logger
@@ -813,9 +813,20 @@ class Metrics:
else:
pred_data = None
else:
pred_data_df = dp_prediction[
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
]
if len(find_raw_name_in_pred) > 1:
max_similarity_name, max_similarity = simple_most_similarity_name(gt_raw_name, find_raw_name_in_pred)
if max_similarity_name is not None and len(max_similarity_name) > 0:
pred_data_df = dp_prediction[
dp_prediction["simple_raw_name"] == max_similarity_name
]
else:
pred_data_df = dp_prediction[
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
]
else:
pred_data_df = dp_prediction[
dp_prediction["simple_raw_name"] == find_raw_name_in_pred[0]
]
if len(pred_data_df) > 1:
if (
len(pred_data_df[pred_data_df["page_index"] == gt_page_index])

View File

@@ -700,6 +700,7 @@ if __name__ == "__main__":
"448265376",
"449555622",
"449623976",
"469138353"
]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list
@@ -707,7 +708,7 @@ if __name__ == "__main__":
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False
re_run_mapping_data = True
re_run_mapping_data = False
force_save_total_data = True
extract_ways = ["text"]

View File

@@ -478,6 +478,23 @@ def get_jacard_similarity(text_left,
else:
return 0
def simple_most_similarity_name(text: str, name_list: list) -> tuple:
    """
    Pick the entry of name_list that is most similar to text.

    Each candidate is scored with get_jacard_similarity(text, candidate)
    and the highest-scoring candidate is returned together with its score.

    Args:
        text: reference text the candidates are compared against.
        name_list: candidate names to score.

    Returns:
        A (best_name, best_similarity) tuple. (None, 0.0) is returned when
        text is None or blank, when name_list is None or empty, or when no
        candidate scores above 0. On ties the earliest candidate wins,
        because only a strictly greater score replaces the current best.
    """
    if text is None or len(text.strip()) == 0:
        return None, 0.0
    # Use len() instead of truthiness so array-like inputs keep working.
    if name_list is None or len(name_list) == 0:
        return None, 0.0

    # Start from a float so every return path yields the same score type.
    max_similarity = 0.0
    max_similarity_name = None
    for full_name in name_list:
        similarity = get_jacard_similarity(text, full_name)
        if similarity > max_similarity:
            max_similarity = similarity
            max_similarity_name = full_name
            # A score of 1 is treated as a perfect match: stop searching.
            # NOTE(review): assumes get_jacard_similarity never exceeds 1.
            if max_similarity == 1:
                break
    return max_similarity_name, max_similarity
def get_beginning_common_words(text_list: list):
"""
Get the beginning common words in text_list