1. Update the name-mapping logic to handle multilingual share class names.

2. Optimize the logic for getting the currency from text.
This commit is contained in:
Blade He 2024-11-21 11:37:58 -06:00
parent 843bbbd13f
commit 5b9f9416de
2 changed files with 37 additions and 7 deletions

View File

@ -1203,10 +1203,10 @@ if __name__ == "__main__":
"501380497",
"514636959",
"508981020"]
special_doc_id_list = ["514636993"]
special_doc_id_list = ["514636953"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True
re_run_extract_data = False
re_run_mapping_data = True
force_save_total_data = False
calculate_metrics = False

View File

@ -97,9 +97,11 @@ def get_most_similar_name(text: str,
for i in range(len(copy_name_list)):
copy_name = copy_name_list[i]
share_part = get_share_part_list([copy_name])[0]
if '-' in share_part:
copy_name = copy_name.replace('-', ' ')
if matching_type == "share":
copy_name, _ = replace_share_name_for_multilingual(copy_name, None)
share_part = get_share_part_list([copy_name])[0]
if '-' in share_part:
copy_name = copy_name.replace('-', ' ')
copy_name = replace_abbrevation(copy_name)
copy_name_list[i] = copy_name
@ -127,6 +129,9 @@ def get_most_similar_name(text: str,
if share_name is not None:
share_name = remove_special_characters(share_name)
share_name = replace_abbrevation(share_name)
text, share_name = replace_share_name_for_multilingual(text, share_name)
text_splits = text.split()
if len(text_splits) == 1:
@ -332,6 +337,23 @@ def get_most_similar_name(text: str,
return None, 0.0
def replace_share_name_for_multilingual(text: str, share_name: str):
    """Normalize a non-English "share class" phrase to the English "Class".

    The known phrases are tried in a fixed order. The first phrase that
    occurs in ``text`` (compared case-insensitively) is replaced by
    ``"Class"`` everywhere in ``text`` and, when ``share_name`` is a
    non-blank string, everywhere in ``share_name`` as well. No further
    phrases are tried after the first hit.

    Args:
        text: The name to normalize. ``None`` or a blank string is
            returned unchanged.
        share_name: The share-name part that accompanies ``text``. May be
            ``None`` (callers pass ``None`` when only ``text`` matters).

    Returns:
        A ``(text, share_name)`` tuple with the replacement applied, or the
        inputs unchanged when no known phrase occurs in ``text``.
    """
    # Function-scope import keeps this block self-contained regardless of
    # what the module already imports at the top of the file.
    import re

    if text is None or not text.strip():
        return text, share_name

    # One entry per phrase; letter case is handled by the IGNORECASE match
    # below instead of listing every capitalization variant by hand.
    multilingual_share_phrases = (
        "Catégorie de parts",
        "Classe di quote",
        "Kategorie Anteile",
        "Clase de participaciones",
        "Aandelenklasse",
        "Anteilklasse",
    )
    for phrase in multilingual_share_phrases:
        pattern = re.compile(re.escape(phrase), flags=re.IGNORECASE)
        if pattern.search(text) is None:
            continue
        text = pattern.sub("Class", text)
        if share_name is not None and share_name.strip():
            share_name = pattern.sub("Class", share_name)
        # Only the first phrase found in the text is normalized.
        break
    return text, share_name
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
copy_text_short_name_list = deepcopy(text_short_name_list)
copy_compare_short_name_list = deepcopy(compare_short_name_list)
@ -448,13 +470,21 @@ def get_currency_from_text(text: str):
text = text.strip()
text_split = text.split()
count = 0
currency_list = []
for split in text_split[::-1]:
if count == 4:
break
if split.upper() in total_currency_list:
return split
currency_list.append(split.upper())
count += 1
return None
if len(currency_list) > 1:
# remove the first currency from currency list
currency_list.pop(0)
return currency_list[0]
elif len(currency_list) == 1:
return currency_list[0]
else:
return None
def update_for_currency(text: str, share_name: str, compare_list: list):