Optimize the mapping algorithm; this is the fixed version used to confirm the mapping metrics.

2024-09-27 09:25:11 -05:00 · 2024-09-27 09:25:11 -05:00 · 0c4c541319
parent 7eba9a52ae
commit 0c4c541319
2 changed files with 138 additions and 37 deletions
--- a/main.py
+++ b/main.py
@ -564,10 +564,8 @@ def test_data_extraction_metrics():


 def test_mapping_raw_name():
-    doc_id = "344636875"
-    raw_fund_name = ""
-    raw_share_name = ""
-    raw_name = "Aberdeen Standard Alpha Global Loans I QInc USD"
+    doc_id = "481475385"
+    raw_name = "Emerging Markets Fund Y-DIST Shares (USD)"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@ -578,7 +576,7 @@ def test_mapping_raw_name():
    )
    mapping_info = data_mapping.matching_with_database(
        raw_name=raw_name,
-        parent_id="FS0000DA0E", 
+        parent_id=None, 
        matching_type="share"
    )
    print(mapping_info)
@ -657,16 +655,15 @@ if __name__ == "__main__":
        "486378555",
        "506559375",
        "479793787",
-        "333207452",
        "471641628",
    ]
    special_doc_id_list = check_mapping_doc_id_list
-    # special_doc_id_list = ["402181770"]
+    special_doc_id_list = ["402113224"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
    re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False

    extract_ways = ["text"]
    for extract_way in extract_ways:
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -2,6 +2,54 @@ import re
 from copy import deepcopy
 from traceback import print_exc

+
+total_currency_list = [
+    "USD",
+    "EUR",
+    "AUD",
+    "JPY",
+    "CHF",
+    "GBP",
+    "SEK",
+    "CNY",
+    "NZD",
+    "CNH",
+    "NOK",
+    "SGD",
+    "HKD",
+    "ZAR",
+    "PLN",
+    "CAD",
+    "CZK",
+    "HUF",
+    "DKK",
+    "BRL",
+    "SKK",
+    "RON",
+    "TRY",
+    "BGN",
+    "CUP",
+    "MXN",
+    "CLF",
+    "XCD",
+    "ISK",
+    "IDR",
+    "MNT",
+    "AED",
+    "AFN",
+    "INR",
+    "ESP",
+    "RUB",
+    "CLP",
+    "KRW",
+    "ETB",
+    "DZD",
+    "XEU",
+    "XFO",
+]
+
+share_features = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Capitalisation', 'Admin', 'Advantage']
+
 def add_slash_to_text_as_regex(text: str):
    if text is None or len(text) == 0:
        return text
@ -29,18 +77,18 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
    Get the name in name_list most similar to text, by Jaccard similarity
    """
    try:
-        copy_fund_name_list = deepcopy(name_list)
+        copy_name_list = deepcopy(name_list)
        if text is None or len(text.split()) == 0 or \
-                copy_fund_name_list is None or len(copy_fund_name_list) == 0:
+                copy_name_list is None or len(copy_name_list) == 0:
            return None, None
        
-        copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name 
-                               in copy_fund_name_list]
+        copy_name_list = [replace_abbrevation(copy_name) for copy_name 
+                               in copy_name_list]

        # get common words in fund_name_list
        common_word_list = []
        if len(name_list) > 1:
-            _, common_word_list = remove_common_word(copy_fund_name_list)
+            _, common_word_list = remove_common_word(copy_name_list)
        if pre_common_word_list is not None and len(pre_common_word_list) > 0:
            common_word_list.extend([word for word in pre_common_word_list
                                     if word not in common_word_list])
@ -63,14 +111,14 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
            for word in common_word_list:
                if word not in lower_new_splits:
                    # remove word in fund_name_list
-                    for i in range(len(copy_fund_name_list)):
-                        temp_splits = copy_fund_name_list[i].split()
-                        copy_fund_name_list[i] = ' '.join([split for split in temp_splits 
+                    for i in range(len(copy_name_list)):
+                        temp_splits = copy_name_list[i].split()
+                        copy_name_list[i] = ' '.join([split for split in temp_splits 
                                                           if remove_special_characters(split).lower() != word])

-            for i in range(len(copy_fund_name_list)):
-                temp_splits = copy_fund_name_list[i].split()
-                copy_fund_name_list[i] = ' '.join([split for split in temp_splits
+            for i in range(len(copy_name_list)):
+                temp_splits = copy_name_list[i].split()
+                copy_name_list[i] = ' '.join([split for split in temp_splits
                                                   if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
            final_splits = []
            for split in new_splits:
@ -79,38 +127,72 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list

            text = ' '.join(final_splits)
        max_similarity = 0
-        max_similarity_fund_name = None
+        max_similarity_full_name = None
        text = remove_special_characters(text)
-        text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
-        for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
-            copy_fund_name = remove_special_characters(copy_fund_name)
-            copy_fund_name = split_words_without_space(copy_fund_name)
+        text, copy_name_list = update_for_currency(text, copy_name_list)
+        text_currencty = get_currency_from_text(text)
+        text_feature = get_share_feature_from_text(text)
+        for full_name, copy_name in zip(name_list , copy_name_list):
+            copy_name = remove_special_characters(copy_name)
+            copy_name = split_words_without_space(copy_name)
            similarity = get_jacard_similarity(text,
-                                            copy_fund_name,
+                                            copy_name,
                                            need_remove_numeric_characters=False)
+            copy_name_2 = replace_abbrevation(copy_name)
+            if copy_name != copy_name_2:
+                similarity_2 = get_jacard_similarity(text,
+                                            copy_name_2,
+                                            need_remove_numeric_characters=False)
+                if similarity_2 > similarity:
+                    similarity = similarity_2
            if similarity > max_similarity:
+                copy_name_currency = get_currency_from_text(copy_name)
+                if text_currencty is not None and copy_name_currency is not None:
+                    if text_currencty != copy_name_currency:
+                        continue
+                copy_name_feature = get_share_feature_from_text(copy_name)
+                if text_feature is not None and copy_name_feature is not None:
+                    if text_feature != copy_name_feature:
+                        continue
                max_similarity = similarity
-                max_similarity_fund_name = fund_name
+                max_similarity_full_name = full_name
            if max_similarity == 1:
                break
        if max_similarity < 0.35:
            return None, max_similarity
-        return max_similarity_fund_name, max_similarity
+        return max_similarity_full_name, max_similarity
    except Exception as e:
        print(e)
        print_exc()
        return None, 0.0

+def get_share_feature_from_text(text: str):
+    if text is None or len(text.strip()) == 0:
+        return None
+    text = text.strip()
+    text = text.lower()
+    text_split = text.split()
+    temp_share_features = [feature.lower() for feature in share_features]
+    for split in text_split[::-1]:
+        if split in temp_share_features:
+            return split
+    return None
+
+def get_currency_from_text(text: str):
+    if text is None or len(text.strip()) == 0:
+        return None
+    text = text.strip()
+    text = text.lower()
+    text_split = text.split()
+    for split in text_split[::-1]:
+        if split.upper() in total_currency_list:
+            return split
+    return None
+

 def update_for_currency(text: str, compare_list: list):
    text_split = text.split()
    with_currency = False
-    total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY', 
-                           'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD', 
-                           'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN', 
-                           'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR', 
-                           'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW', 
-                           'ETB', 'DZD', 'XEU', 'XFO']
    for split in text_split:
        if split.upper() in total_currency_list:
            with_currency = True
@ -198,6 +280,16 @@ def remove_common_word(text_list: list):
            else:
                common_word_list = list(
                    set(common_word_list).intersection(set(new_text_splits_list[j])))
+    
+    remove_list = []
+    # currency codes must not count as common words: drop them from the list
+    for word in common_word_list:
+        if word.upper() in total_currency_list:
+            remove_list.append(word)
+    for remove in remove_list:
+        if remove in common_word_list:
+            common_word_list.remove(remove)
+    
    common_word_list = list(set(common_word_list))
    for i in range(len(new_text_splits_list)):
        for common_word in common_word_list:
@ -219,12 +311,22 @@ def split_words_without_space(text: str):
    # if len(splits) > 1:
    #     return text
    # find all words with capital letter + lower letter
-    regex = r'[A-Z][a-z]+'
+    regex = r"[A-Z][a-z]+"
+    regex2 = r"[A-Z]{2,}[a-z]+"
    word_list = re.findall(regex, text)
+    word_list2 = re.findall(regex2, text)
    if len(word_list) > 0:
        for word in word_list:
-            text = text.replace(word, ' ' + word + ' ')
-        text = re.sub(r'(\s)+', ' ', text)
+            if len(word_list2) > 0:
+                word_exists_in_word2 = False
+                for word2 in word_list2:
+                    if word in word2:
+                        word_exists_in_word2 = True
+                        break
+                if word_exists_in_word2:
+                    continue
+            text = text.replace(word, " " + word + " ")
+        text = re.sub(r"(\s)+", " ", text)
    return text.strip()


@ -332,6 +434,8 @@ def replace_abbrevation(text: str):
        text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
    elif 'swedish kronor' in text.lower():
        text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
+    elif "GPB" in text.split():
+        text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
    elif 'sterling' in text.lower().split():
        text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
    elif 'euro' in text.lower().split():
@ -342,7 +446,7 @@ def replace_abbrevation(text: str):
        text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
    elif '£' in text.lower().split():
        text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
-    elif 'RMB' in text.lower().split():
+    elif 'RMB' in text.split():
        text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
    else:
        pass