1. Update for mapping multilingual share class names.

2. Optimize the logic for getting the currency from text.
This commit is contained in:
Blade He 2024-11-21 11:37:58 -06:00
parent 843bbbd13f
commit 5b9f9416de
2 changed files with 37 additions and 7 deletions

View File

@ -1203,10 +1203,10 @@ if __name__ == "__main__":
"501380497", "501380497",
"514636959", "514636959",
"508981020"] "508981020"]
special_doc_id_list = ["514636993"] special_doc_id_list = ["514636953"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = False
calculate_metrics = False calculate_metrics = False

View File

@ -97,6 +97,8 @@ def get_most_similar_name(text: str,
for i in range(len(copy_name_list)): for i in range(len(copy_name_list)):
copy_name = copy_name_list[i] copy_name = copy_name_list[i]
if matching_type == "share":
copy_name, _ = replace_share_name_for_multilingual(copy_name, None)
share_part = get_share_part_list([copy_name])[0] share_part = get_share_part_list([copy_name])[0]
if '-' in share_part: if '-' in share_part:
copy_name = copy_name.replace('-', ' ') copy_name = copy_name.replace('-', ' ')
@ -128,6 +130,9 @@ def get_most_similar_name(text: str,
share_name = remove_special_characters(share_name) share_name = remove_special_characters(share_name)
share_name = replace_abbrevation(share_name) share_name = replace_abbrevation(share_name)
text, share_name = replace_share_name_for_multilingual(text, share_name)
text_splits = text.split() text_splits = text.split()
if len(text_splits) == 1: if len(text_splits) == 1:
text = split_words_without_space(text) text = split_words_without_space(text)
@ -332,6 +337,23 @@ def get_most_similar_name(text: str,
return None, 0.0 return None, 0.0
def replace_share_name_for_multilingual(text: str, share_name: str):
    """Normalize a non-English "share class" term to the English word "Class".

    The first known term (in list order) that occurs in ``text`` is replaced
    by "Class" everywhere in ``text`` and, when ``share_name`` is a non-blank
    string, everywhere in ``share_name`` as well. Only that one term is
    handled per call.

    Matching is case-insensitive, so ``text`` and ``share_name`` are both
    normalized even when they spell the term with different casing
    (e.g. "Kategorie anteile" vs "Kategorie Anteile").

    Args:
        text: The string that decides which term (if any) is replaced.
            ``None`` or a blank string is returned untouched.
        share_name: Optional companion string normalized with the same term.
            ``None`` or a blank string is returned untouched.

    Returns:
        A ``(text, share_name)`` tuple with the replacements applied.
    """
    # Local import: the file's import section is not visible from this block.
    import re

    if text is None or len(text.strip()) == 0:
        return text, share_name

    # One entry per term; casing variants are covered by re.IGNORECASE.
    multilingual_share_list = [
        "Catégorie de parts",
        "Classe di quote",
        "Kategorie Anteile",
        "Clase de participaciones",
        "Aandelenklasse",
        "Anteilklasse",
    ]
    for multilingual_share in multilingual_share_list:
        pattern = re.compile(re.escape(multilingual_share), re.IGNORECASE)
        if pattern.search(text) is None:
            continue
        text = pattern.sub("Class", text)
        if share_name is not None and len(share_name.strip()) > 0:
            share_name = pattern.sub("Class", share_name)
        # Only the first matching term is normalized.
        break
    return text, share_name
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list): def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
copy_text_short_name_list = deepcopy(text_short_name_list) copy_text_short_name_list = deepcopy(text_short_name_list)
copy_compare_short_name_list = deepcopy(compare_short_name_list) copy_compare_short_name_list = deepcopy(compare_short_name_list)
@ -448,12 +470,20 @@ def get_currency_from_text(text: str):
text = text.strip() text = text.strip()
text_split = text.split() text_split = text.split()
count = 0 count = 0
currency_list = []
for split in text_split[::-1]: for split in text_split[::-1]:
if count == 4: if count == 4:
break break
if split.upper() in total_currency_list: if split.upper() in total_currency_list:
return split currency_list.append(split.upper())
count += 1 count += 1
if len(currency_list) > 1:
# remove the first currency from currency list
currency_list.pop(0)
return currency_list[0]
elif len(currency_list) == 1:
return currency_list[0]
else:
return None return None