optimize mapping algorithm

This commit is contained in:
Blade He 2024-10-01 16:46:59 -05:00
parent 3adbd7631a
commit 035f028155
2 changed files with 12 additions and 3 deletions

View File

@@ -704,7 +704,7 @@ if __name__ == "__main__":
]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["333207452"]
# special_doc_id_list = ["479042269"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False

View File

@@ -49,7 +49,7 @@ total_currency_list = [
"XFO",
]
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Dividend', 'Investor', 'Institutional', 'Admin', 'Advantage']
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage']
share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']
@@ -101,6 +101,10 @@ def get_most_similar_name(text: str,
common_word_list.extend([word for word in pre_common_word_list
if word not in common_word_list])
if len(common_word_list) > 0:
common_word_list = [word for word in common_word_list
if len(word) > 1 and word.upper() not in total_currency_list]
text = text.strip()
text = remove_special_characters(text)
text = replace_abbrevation(text)
@@ -224,6 +228,7 @@ def get_most_similar_name(text: str,
if text_feature is not None and len(text_feature) > 0 and \
copy_name_feature is not None and len(copy_name_feature) > 0:
if text_feature != copy_name_feature:
if copy_name_feature.lower() not in text.lower().split():
continue
if matching_type == "share":
if text_share_short_name is not None and len(text_share_short_name) > 0 and \
@@ -263,6 +268,10 @@ def get_share_part_list(text_list: list):
text_split = text.split("funds")
if len(text_split) == 1:
text_split = text.split("Portfolio")
if len(text_split) == 1:
text_split = text.split("Bond")
if len(text_split) == 1:
text_split = text.split("Bonds")
if len(text_split) > 1:
share_part_list.append(text_split[-1].strip())
else: