diff --git a/main.py b/main.py index df8ed8e..d3891f2 100644 --- a/main.py +++ b/main.py @@ -564,10 +564,8 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "344636875" - raw_fund_name = "" - raw_share_name = "" - raw_name = "Aberdeen Standard Alpha Global Loans I QInc USD" + doc_id = "481475385" + raw_name = "Emerging Markets Fund Y-DIST Shares (USD)" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -578,7 +576,7 @@ def test_mapping_raw_name(): ) mapping_info = data_mapping.matching_with_database( raw_name=raw_name, - parent_id="FS0000DA0E", + parent_id=None, matching_type="share" ) print(mapping_info) @@ -657,16 +655,15 @@ if __name__ == "__main__": "486378555", "506559375", "479793787", - "333207452", "471641628", ] special_doc_id_list = check_mapping_doc_id_list - # special_doc_id_list = ["402181770"] + special_doc_id_list = ["402113224"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False extract_ways = ["text"] for extract_way in extract_ways: diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 1ca1697..346fd1d 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -2,6 +2,54 @@ import re from copy import deepcopy from traceback import print_exc + +total_currency_list = [ + "USD", + "EUR", + "AUD", + "JPY", + "CHF", + "GBP", + "SEK", + "CNY", + "NZD", + "CNH", + "NOK", + "SGD", + "HKD", + "ZAR", + "PLN", + "CAD", + "CZK", + "HUF", + "DKK", + "BRL", + "SKK", + "RON", + "TRY", + "BGN", + "CUP", + "MXN", + "CLF", + "XCD", + "ISK", + "IDR", + "MNT", + "AED", + "AFN", + "INR", + "ESP", + "RUB", + "CLP", + "KRW", + "ETB", + "DZD", + "XEU", + "XFO", +] + +share_features = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Capitalisation', 'Admin', 'Advantage'] + def add_slash_to_text_as_regex(text: str): if text is None or len(text) == 0: return text @@ -29,18 +77,18 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list Get the most similar fund name from fund_name_list by jacard similarity """ try: - copy_fund_name_list = deepcopy(name_list) + copy_name_list = deepcopy(name_list) if text is None or len(text.split()) == 0 or \ - copy_fund_name_list is None or len(copy_fund_name_list) == 0: + copy_name_list is None or len(copy_name_list) == 0: return None, None - copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name - in copy_fund_name_list] + copy_name_list = [replace_abbrevation(copy_name) for copy_name + in copy_name_list] # get common words in fund_name_list common_word_list = [] if len(name_list) > 1: - _, common_word_list = remove_common_word(copy_fund_name_list) + _, common_word_list = remove_common_word(copy_name_list) if pre_common_word_list is not None and len(pre_common_word_list) > 0: common_word_list.extend([word for word in pre_common_word_list if word not in common_word_list]) @@ -63,14 +111,14 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list for word in common_word_list: if word not in lower_new_splits: # remove word in fund_name_list - for i in range(len(copy_fund_name_list)): - temp_splits = copy_fund_name_list[i].split() - copy_fund_name_list[i] = ' '.join([split for split in temp_splits + for i in range(len(copy_name_list)): + temp_splits = copy_name_list[i].split() + copy_name_list[i] = ' '.join([split for split in temp_splits if remove_special_characters(split).lower() != word]) - for i in range(len(copy_fund_name_list)): - temp_splits = copy_fund_name_list[i].split() - copy_fund_name_list[i] = ' '.join([split for split in temp_splits + for i in range(len(copy_name_list)): + temp_splits = copy_name_list[i].split() + copy_name_list[i] = ' '.join([split for split in temp_splits if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']]) final_splits = [] for split in new_splits: @@ -79,38 +127,72 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list text = ' '.join(final_splits) max_similarity = 0 - max_similarity_fund_name = None + max_similarity_full_name = None text = remove_special_characters(text) - text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list) - for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list): - copy_fund_name = remove_special_characters(copy_fund_name) - copy_fund_name = split_words_without_space(copy_fund_name) + text, copy_name_list = update_for_currency(text, copy_name_list) + text_currencty = get_currency_from_text(text) + text_feature = get_share_feature_from_text(text) + for full_name, copy_name in zip(name_list , copy_name_list): + copy_name = remove_special_characters(copy_name) + copy_name = split_words_without_space(copy_name) similarity = get_jacard_similarity(text, - copy_fund_name, + copy_name, need_remove_numeric_characters=False) + copy_name_2 = replace_abbrevation(copy_name) + if copy_name != copy_name_2: + similarity_2 = get_jacard_similarity(text, + copy_name_2, + need_remove_numeric_characters=False) + if similarity_2 > similarity: + similarity = similarity_2 if similarity > max_similarity: + copy_name_currency = get_currency_from_text(copy_name) + if text_currencty is not None and copy_name_currency is not None: + if text_currencty != copy_name_currency: + continue + copy_name_feature = get_share_feature_from_text(copy_name) + if text_feature is not None and copy_name_feature is not None: + if text_feature != copy_name_feature: + continue max_similarity = similarity - max_similarity_fund_name = fund_name + max_similarity_full_name = full_name if max_similarity == 1: break if max_similarity < 0.35: return None, max_similarity - return max_similarity_fund_name, max_similarity + return max_similarity_full_name, max_similarity except Exception as e: print(e) print_exc() return None, 0.0 +def get_share_feature_from_text(text: str): + if text is None or len(text.strip()) == 0: + return None + text = text.strip() + text = text.lower() + text_split = text.split() + temp_share_features = [feature.lower() for feature in share_features] + for split in text_split[::-1]: + if split in temp_share_features: + return split + return None + +def get_currency_from_text(text: str): + if text is None or len(text.strip()) == 0: + return None + text = text.strip() + text = text.lower() + text_split = text.split() + for split in text_split[::-1]: + if split.upper() in total_currency_list: + return split + return None + def update_for_currency(text: str, compare_list: list): text_split = text.split() with_currency = False - total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY', - 'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD', - 'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN', - 'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR', - 'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW', - 'ETB', 'DZD', 'XEU', 'XFO'] for split in text_split: if split.upper() in total_currency_list: with_currency = True @@ -198,6 +280,16 @@ def remove_common_word(text_list: list): else: common_word_list = list( set(common_word_list).intersection(set(new_text_splits_list[j]))) + + remove_list = [] + # if exists the share name and currency name, remove from the list + for word in common_word_list: + if word.upper() in total_currency_list: + remove_list.append(word) + for remove in remove_list: + if remove in common_word_list: + common_word_list.remove(remove) + common_word_list = list(set(common_word_list)) for i in range(len(new_text_splits_list)): for common_word in common_word_list: @@ -219,12 +311,22 @@ def split_words_without_space(text: str): # if len(splits) > 1: # return text # find all words with capital letter + lower letter - regex = r'[A-Z][a-z]+' + regex = r"[A-Z][a-z]+" + regex2 = r"[A-Z]{2,}[a-z]+" word_list = re.findall(regex, text) + word_list2 = re.findall(regex2, text) if len(word_list) > 0: for word in word_list: - text = text.replace(word, ' ' + word + ' ') - text = re.sub(r'(\s)+', ' ', text) + if len(word_list2) > 0: + word_exists_in_word2 = False + for word2 in word_list2: + if word in word2: + word_exists_in_word2 = True + break + if word_exists_in_word2: + continue + text = text.replace(word, " " + word + " ") + text = re.sub(r"(\s)+", " ", text) return text.strip() @@ -332,6 +434,8 @@ def replace_abbrevation(text: str): text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE) elif 'swedish kronor' in text.lower(): text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE) + elif "GPB" in text.split(): + text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE) elif 'sterling' in text.lower().split(): text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE) elif 'euro' in text.lower().split(): @@ -342,7 +446,7 @@ def replace_abbrevation(text: str): text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE) elif '£' in text.lower().split(): text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE) - elif 'RMB' in text.lower().split(): + elif 'RMB' in text.split(): text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE) else: pass