diff --git a/core/data_mapping.py b/core/data_mapping.py index 870bc64..eaa5cdc 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -312,7 +312,10 @@ class DataMapping: max_similarity_name, max_similarity = get_most_similar_name( raw_name, provider_compare_name_list, pre_common_word_list=pre_common_word_list ) - if max_similarity is not None and max_similarity >= 0.5: + threshold = 0.7 + if matching_type == "share": + threshold = 0.5 + if max_similarity is not None and max_similarity >= threshold: data_info["id"] = provider_compare_mapping[ provider_compare_mapping[compare_name_dp] == max_similarity_name ][compare_id_dp].values[0] diff --git a/main.py b/main.py index 15527af..0323f90 100644 --- a/main.py +++ b/main.py @@ -642,8 +642,10 @@ if __name__ == "__main__": "508854243", "520879048", "463081566", + "389171486" ] special_doc_id_list = check_mapping_doc_id_list + # special_doc_id_list = ["445256897"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 285ff9d..1ca1697 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1,5 +1,6 @@ import re from copy import deepcopy +from traceback import print_exc def add_slash_to_text_as_regex(text: str): if text is None or len(text) == 0: @@ -79,6 +80,8 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list text = ' '.join(final_splits) max_similarity = 0 max_similarity_fund_name = None + text = remove_special_characters(text) + text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list) for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list): copy_fund_name = remove_special_characters(copy_fund_name) copy_fund_name = split_words_without_space(copy_fund_name) @@ -95,9 +98,76 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list return 
# Currency codes recognised when they appear as a stand-alone,
# whitespace-delimited token of a name (compared case-insensitively).
_CURRENCY_CODES = frozenset({
    'USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
    'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
    'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
    'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
    'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
    'ETB', 'DZD', 'XEU', 'XFO',
})

# Currency appended to a name that does not state one explicitly.
_DEFAULT_CURRENCY = 'USD'


def _currency_tokens(name: str) -> list:
    """Return the tokens of *name* that are known currency codes, in order."""
    return [token for token in name.split() if token.upper() in _CURRENCY_CODES]


def update_for_currency(text: str, compare_list: list):
    """Align currency information between a name and its candidate names.

    When either side mentions a currency code, names that lack one are padded
    with a currency token so that later token-based comparison sees a currency
    on both sides.

    Rules:
      * Neither ``text`` nor any candidate has a currency: nothing changes.
      * ``text`` has a currency: every candidate without one gets ``USD``
        appended.
      * ``text`` has no currency but at least one candidate does: every
        candidate without one gets ``USD`` appended, and ``text`` gets a
        currency appended. That currency is ``USD`` unless the last token of
        ``text`` is a short (< 4 chars) upper-case token that occurs in
        exactly one currency-bearing candidate and in no currency-less
        candidate; in that case the candidate's last currency token is used.

    Args:
        text: The name being matched. Empty, whitespace-only or ``None`` input
            is returned untouched together with the unmodified candidates.
        compare_list: Candidate names. NOTE: this list is modified in place
            (and also returned), matching the original behaviour.

    Returns:
        A ``(text, compare_list)`` tuple with the possibly updated name and
        the same list object that was passed in.
    """
    text_split = text.split() if text else []
    if not text_split:
        # Nothing to align; also avoids indexing the last token of an empty
        # token list below.
        return text, compare_list

    with_currency = bool(_currency_tokens(text))

    # Partition candidate indexes by whether the candidate names a currency.
    with_currency_list = []
    without_currency_list = []
    for index, compare in enumerate(compare_list):
        if _currency_tokens(compare):
            with_currency_list.append(index)
        else:
            without_currency_list.append(index)

    if not with_currency and not with_currency_list:
        # No currency information anywhere: leave everything as is.
        return text, compare_list

    if with_currency:
        # NOTE(review): the original guarded this loop with
        # `len(without_currency_list) == 0`, which made it a no-op. The
        # padding is applied to every currency-less candidate here, mirroring
        # the branch below -- confirm this is the intended rule.
        for index in without_currency_list:
            compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY
        return text, compare_list

    # From here on: text has no currency, at least one candidate has one.
    last_split = text_split[-1]
    text_currency = None
    if len(last_split) < 4 and last_split.upper() == last_split:
        # Short upper-case trailing token (checked before any padding so the
        # candidates are still in their original form).
        if any(last_split in compare_list[index].split()
               for index in without_currency_list):
            # A currency-less candidate carries the same token, so the text
            # is treated as being in the default currency too.
            text_currency = _DEFAULT_CURRENCY
        else:
            # Otherwise borrow the currency of the single currency-bearing
            # candidate that carries the token; ambiguous (0 or 2+) matches
            # fall through to the default.
            currency_list = [
                _currency_tokens(compare_list[index])[-1]
                for index in with_currency_list
                if last_split in compare_list[index].split()
            ]
            if len(currency_list) == 1:
                text_currency = currency_list[0]

    for index in without_currency_list:
        compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY

    if text_currency is None:
        text_currency = _DEFAULT_CURRENCY
    return text + ' ' + text_currency, compare_list