1. Update the name mapping to handle multilingual share class names.
2. Optimize the logic for extracting the currency from text.
This commit is contained in:
parent
843bbbd13f
commit
5b9f9416de
4
main.py
4
main.py
|
|
@ -1203,10 +1203,10 @@ if __name__ == "__main__":
|
||||||
"501380497",
|
"501380497",
|
||||||
"514636959",
|
"514636959",
|
||||||
"508981020"]
|
"508981020"]
|
||||||
special_doc_id_list = ["514636993"]
|
special_doc_id_list = ["514636953"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
|
||||||
|
|
@ -97,9 +97,11 @@ def get_most_similar_name(text: str,
|
||||||
|
|
||||||
for i in range(len(copy_name_list)):
|
for i in range(len(copy_name_list)):
|
||||||
copy_name = copy_name_list[i]
|
copy_name = copy_name_list[i]
|
||||||
share_part = get_share_part_list([copy_name])[0]
|
if matching_type == "share":
|
||||||
if '-' in share_part:
|
copy_name, _ = replace_share_name_for_multilingual(copy_name, None)
|
||||||
copy_name = copy_name.replace('-', ' ')
|
share_part = get_share_part_list([copy_name])[0]
|
||||||
|
if '-' in share_part:
|
||||||
|
copy_name = copy_name.replace('-', ' ')
|
||||||
copy_name = replace_abbrevation(copy_name)
|
copy_name = replace_abbrevation(copy_name)
|
||||||
copy_name_list[i] = copy_name
|
copy_name_list[i] = copy_name
|
||||||
|
|
||||||
|
|
@ -128,6 +130,9 @@ def get_most_similar_name(text: str,
|
||||||
share_name = remove_special_characters(share_name)
|
share_name = remove_special_characters(share_name)
|
||||||
share_name = replace_abbrevation(share_name)
|
share_name = replace_abbrevation(share_name)
|
||||||
|
|
||||||
|
text, share_name = replace_share_name_for_multilingual(text, share_name)
|
||||||
|
|
||||||
|
|
||||||
text_splits = text.split()
|
text_splits = text.split()
|
||||||
if len(text_splits) == 1:
|
if len(text_splits) == 1:
|
||||||
text = split_words_without_space(text)
|
text = split_words_without_space(text)
|
||||||
|
|
@ -332,6 +337,23 @@ def get_most_similar_name(text: str,
|
||||||
return None, 0.0
|
return None, 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def replace_share_name_for_multilingual(text: str, share_name: str) -> tuple:
    """Normalise non-English "share class" phrases to the English word "Class".

    Known French, Italian, German, Spanish and Dutch phrases for "share
    class" are replaced with "Class" in ``text`` and, when ``text`` contained
    at least one such phrase, in ``share_name`` as well.

    Matching is case-insensitive and every known phrase is replaced, so
    inputs such as "ANTEILKLASSE A" or "Classe Di Quote R" are handled even
    though those exact casings are not listed.

    Args:
        text: Full name to normalise. ``None`` or a blank string is returned
            unchanged, together with ``share_name``.
        share_name: Share name to normalise alongside ``text``. May be
            ``None`` or blank, in which case it is returned unchanged.

    Returns:
        A ``(text, share_name)`` tuple with the phrases replaced.
    """
    # Local import keeps this function self-contained; `re` caches compiled
    # patterns, so repeated calls do not recompile.
    import re

    if text is None or not text.strip():
        return text, share_name

    # One canonical spelling per phrase; case variants are covered by
    # re.IGNORECASE instead of being listed by hand.
    multilingual_share_phrases = (
        "Catégorie de parts",
        "Classe di quote",
        "Kategorie Anteile",
        "Clase de participaciones",
        "Aandelenklasse",
        "Anteilklasse",
    )
    pattern = re.compile(
        "|".join(re.escape(phrase) for phrase in multilingual_share_phrases),
        re.IGNORECASE,
    )

    new_text, replaced_count = pattern.subn("Class", text)
    if replaced_count == 0:
        # Preserve the existing contract: share_name is only touched when
        # text itself contained a multilingual phrase.
        return text, share_name

    if share_name is not None and share_name.strip():
        share_name = pattern.sub("Class", share_name)
    return new_text, share_name
|
||||||
|
|
||||||
|
|
||||||
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
|
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
|
||||||
copy_text_short_name_list = deepcopy(text_short_name_list)
|
copy_text_short_name_list = deepcopy(text_short_name_list)
|
||||||
copy_compare_short_name_list = deepcopy(compare_short_name_list)
|
copy_compare_short_name_list = deepcopy(compare_short_name_list)
|
||||||
|
|
@ -448,13 +470,21 @@ def get_currency_from_text(text: str):
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
text_split = text.split()
|
text_split = text.split()
|
||||||
count = 0
|
count = 0
|
||||||
|
currency_list = []
|
||||||
for split in text_split[::-1]:
|
for split in text_split[::-1]:
|
||||||
if count == 4:
|
if count == 4:
|
||||||
break
|
break
|
||||||
if split.upper() in total_currency_list:
|
if split.upper() in total_currency_list:
|
||||||
return split
|
currency_list.append(split.upper())
|
||||||
count += 1
|
count += 1
|
||||||
return None
|
if len(currency_list) > 1:
|
||||||
|
# remove the first currency from currency list
|
||||||
|
currency_list.pop(0)
|
||||||
|
return currency_list[0]
|
||||||
|
elif len(currency_list) == 1:
|
||||||
|
return currency_list[0]
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def update_for_currency(text: str, share_name: str, compare_list: list):
|
def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue