diff --git a/main.py b/main.py index 541de17..6061731 100644 --- a/main.py +++ b/main.py @@ -1203,10 +1203,10 @@ if __name__ == "__main__": "501380497", "514636959", "508981020"] - special_doc_id_list = ["514636993"] + special_doc_id_list = ["514636953"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = True + re_run_extract_data = False re_run_mapping_data = True force_save_total_data = False calculate_metrics = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index ecdb4ab..a1d8b7e 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -97,9 +97,11 @@ def get_most_similar_name(text: str, for i in range(len(copy_name_list)): copy_name = copy_name_list[i] - share_part = get_share_part_list([copy_name])[0] - if '-' in share_part: - copy_name = copy_name.replace('-', ' ') + if matching_type == "share": + copy_name, _ = replace_share_name_for_multilingual(copy_name, None) + share_part = get_share_part_list([copy_name])[0] + if '-' in share_part: + copy_name = copy_name.replace('-', ' ') copy_name = replace_abbrevation(copy_name) copy_name_list[i] = copy_name @@ -127,6 +129,9 @@ def get_most_similar_name(text: str, if share_name is not None: share_name = remove_special_characters(share_name) share_name = replace_abbrevation(share_name) + + text, share_name = replace_share_name_for_multilingual(text, share_name) + text_splits = text.split() if len(text_splits) == 1: @@ -332,6 +337,23 @@ def get_most_similar_name(text: str, return None, 0.0 +def replace_share_name_for_multilingual(text: str, share_name: str): + if text is None or len(text.strip()) == 0: + return text, share_name + + multilingual_share_list = ["Catégorie de parts", "Classe di quote", + "Kategorie Anteile", "Kategorie anteile", + "Clase de participaciones", "Aandelenklasse", + "aandelenklasse", "Anteilklasse", "anteilklasse"] + for multilingual_share in multilingual_share_list: + if multilingual_share in text: + text = text.replace(multilingual_share, "Class") + if share_name is not None and len(share_name.strip()) > 0: + share_name = share_name.replace(multilingual_share, "Class") + break + return text, share_name + + def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list): copy_text_short_name_list = deepcopy(text_short_name_list) copy_compare_short_name_list = deepcopy(compare_short_name_list) @@ -448,13 +470,21 @@ def get_currency_from_text(text: str): text = text.strip() text_split = text.split() count = 0 + currency_list = [] for split in text_split[::-1]: if count == 4: break if split.upper() in total_currency_list: - return split + currency_list.append(split.upper()) count += 1 - return None + if len(currency_list) > 1: + # remove the first currency from currency list + currency_list.pop(0) + return currency_list[0] + elif len(currency_list) == 1: + return currency_list[0] + else: + return None def update_for_currency(text: str, share_name: str, compare_list: list):