Optimize the mapping algorithm; this is the fixed version used to confirm the mapping metrics.

2024-09-27 09:25:11 -05:00 · 2024-09-27 09:25:11 -05:00 · 0c4c541319
parent 7eba9a52ae
commit 0c4c541319
2 changed files with 138 additions and 37 deletions
--- a/main.py
+++ b/main.py
@ -564,10 +564,8 @@ def test_data_extraction_metrics():
 def test_mapping_raw_name():
-    doc_id = "344636875"
+    doc_id = "481475385"
-    raw_fund_name = ""
+    raw_name = "Emerging Markets Fund Y-DIST Shares (USD)"
    raw_share_name = ""
    raw_name = "Aberdeen Standard Alpha Global Loans I QInc USD"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@ -578,7 +576,7 @@ def test_mapping_raw_name():
    )
    mapping_info = data_mapping.matching_with_database(
        raw_name=raw_name,
-        parent_id="FS0000DA0E", 
+        parent_id=None, 
        matching_type="share"
    )
    print(mapping_info)
@ -657,16 +655,15 @@ if __name__ == "__main__":
        "486378555",
        "506559375",
        "479793787",
        "333207452",
        "471641628",
    ]
    special_doc_id_list = check_mapping_doc_id_list
-    # special_doc_id_list = ["402181770"]
+    special_doc_id_list = ["402113224"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
    re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
    extract_ways = ["text"]
    for extract_way in extract_ways:
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -2,6 +2,54 @@ import re
 from copy import deepcopy
 from traceback import print_exc
 # Three-letter, upper-case currency codes recognised by the name-matching helpers.
 # Tokens are upper-cased before being tested for membership in this list
 # (see get_currency_from_text and remove_common_word).
 # NOTE(review): update_for_currency defines its own local list with the same name
 # that additionally contains 'TOP' and 'ILS' -- confirm whether the two lists
 # are meant to stay in sync.
 total_currency_list = [
    "USD",
    "EUR",
    "AUD",
    "JPY",
    "CHF",
    "GBP",
    "SEK",
    "CNY",
    "NZD",
    "CNH",
    "NOK",
    "SGD",
    "HKD",
    "ZAR",
    "PLN",
    "CAD",
    "CZK",
    "HUF",
    "DKK",
    "BRL",
    "SKK",
    "RON",
    "TRY",
    "BGN",
    "CUP",
    "MXN",
    "CLF",
    "XCD",
    "ISK",
    "IDR",
    "MNT",
    "AED",
    "AFN",
    "INR",
    "ESP",
    "RUB",
    "CLP",
    "KRW",
    "ETB",
    "DZD",
    "XEU",
    "XFO",
 ]
 # Share-class feature keywords; get_share_feature_from_text lower-cases these
 # and compares them against whole, lower-cased tokens of a name.
 share_features = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Capitalisation', 'Admin', 'Advantage']
 def add_slash_to_text_as_regex(text: str):
    if text is None or len(text) == 0:
        return text
@ -29,18 +77,18 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
    Get the most similar name from name_list by Jaccard similarity
    """
    try:
-        copy_fund_name_list = deepcopy(name_list)
+        copy_name_list = deepcopy(name_list)
        if text is None or len(text.split()) == 0 or \
-                copy_fund_name_list is None or len(copy_fund_name_list) == 0:
+                copy_name_list is None or len(copy_name_list) == 0:
            return None, None
-        copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name 
+        copy_name_list = [replace_abbrevation(copy_name) for copy_name 
-                               in copy_fund_name_list]
+                               in copy_name_list]
        # get common words in fund_name_list
        common_word_list = []
        if len(name_list) > 1:
-            _, common_word_list = remove_common_word(copy_fund_name_list)
+            _, common_word_list = remove_common_word(copy_name_list)
        if pre_common_word_list is not None and len(pre_common_word_list) > 0:
            common_word_list.extend([word for word in pre_common_word_list
                                     if word not in common_word_list])
@ -63,14 +111,14 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
            for word in common_word_list:
                if word not in lower_new_splits:
                    # remove word in fund_name_list
-                    for i in range(len(copy_fund_name_list)):
+                    for i in range(len(copy_name_list)):
-                        temp_splits = copy_fund_name_list[i].split()
+                        temp_splits = copy_name_list[i].split()
-                        copy_fund_name_list[i] = ' '.join([split for split in temp_splits 
+                        copy_name_list[i] = ' '.join([split for split in temp_splits 
                                                           if remove_special_characters(split).lower() != word])
-            for i in range(len(copy_fund_name_list)):
+            for i in range(len(copy_name_list)):
-                temp_splits = copy_fund_name_list[i].split()
+                temp_splits = copy_name_list[i].split()
-                copy_fund_name_list[i] = ' '.join([split for split in temp_splits
+                copy_name_list[i] = ' '.join([split for split in temp_splits
                                                   if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
            final_splits = []
            for split in new_splits:
@ -79,38 +127,72 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
            text = ' '.join(final_splits)
        max_similarity = 0
-        max_similarity_fund_name = None
+        max_similarity_full_name = None
        text = remove_special_characters(text)
-        text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
+        text, copy_name_list = update_for_currency(text, copy_name_list)
-        for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
+        text_currencty = get_currency_from_text(text)
-            copy_fund_name = remove_special_characters(copy_fund_name)
+        text_feature = get_share_feature_from_text(text)
-            copy_fund_name = split_words_without_space(copy_fund_name)
+        for full_name, copy_name in zip(name_list , copy_name_list):
            copy_name = remove_special_characters(copy_name)
            copy_name = split_words_without_space(copy_name)
            similarity = get_jacard_similarity(text,
-                                            copy_fund_name,
+                                            copy_name,
                                            need_remove_numeric_characters=False)
            copy_name_2 = replace_abbrevation(copy_name)
            if copy_name != copy_name_2:
                similarity_2 = get_jacard_similarity(text,
                                            copy_name_2,
                                            need_remove_numeric_characters=False)
                if similarity_2 > similarity:
                    similarity = similarity_2
            if similarity > max_similarity:
                copy_name_currency = get_currency_from_text(copy_name)
                if text_currencty is not None and copy_name_currency is not None:
                    if text_currencty != copy_name_currency:
                        continue
                copy_name_feature = get_share_feature_from_text(copy_name)
                if text_feature is not None and copy_name_feature is not None:
                    if text_feature != copy_name_feature:
                        continue
                max_similarity = similarity
-                max_similarity_fund_name = fund_name
+                max_similarity_full_name = full_name
            if max_similarity == 1:
                break
        if max_similarity < 0.35:
            return None, max_similarity
-        return max_similarity_fund_name, max_similarity
+        return max_similarity_full_name, max_similarity
    except Exception as e:
        print(e)
        print_exc()
        return None, 0.0
 def get_share_feature_from_text(text: str):
    """Return the last share-feature keyword found in ``text``.

    The text is stripped, lower-cased and split on whitespace; the tokens
    are then scanned from the end towards the start and the first token
    equal to a lower-cased entry of ``share_features`` is returned (in
    lower case).

    Returns ``None`` when ``text`` is ``None``, blank, or holds no such
    keyword.
    """
    if text is None:
        return None
    cleaned = text.strip()
    if len(cleaned) == 0:
        return None
    tokens = cleaned.lower().split()
    known_features = [name.lower() for name in share_features]
    # Scan right-to-left so the right-most keyword is the one returned.
    return next(
        (token for token in reversed(tokens) if token in known_features),
        None,
    )
 def get_currency_from_text(text: str):
    """Return the last currency token found in ``text``, in lower case.

    The text is stripped, lower-cased and split on whitespace. Tokens are
    examined from the end towards the start; the first one whose
    upper-cased form appears in ``total_currency_list`` is returned as it
    stands after lower-casing (e.g. ``"usd"``).

    Returns ``None`` when ``text`` is ``None``, blank, or contains no
    currency token.
    """
    if text is None:
        return None
    cleaned = text.strip()
    if len(cleaned) == 0:
        return None
    tokens = cleaned.lower().split()
    # Walk backwards so the right-most currency token is the one returned.
    position = len(tokens) - 1
    while position >= 0:
        candidate = tokens[position]
        if candidate.upper() in total_currency_list:
            return candidate
        position -= 1
    return None
 def update_for_currency(text: str, compare_list: list):
    text_split = text.split()
    with_currency = False
    total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY', 
                           'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD', 
                           'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN', 
                           'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR', 
                           'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW', 
                           'ETB', 'DZD', 'XEU', 'XFO']
    for split in text_split:
        if split.upper() in total_currency_list:
            with_currency = True
@ -198,6 +280,16 @@ def remove_common_word(text_list: list):
            else:
                common_word_list = list(
                    set(common_word_list).intersection(set(new_text_splits_list[j])))
    remove_list = []
    # Currency codes must not be treated as common words: drop any of them from the common-word list
    for word in common_word_list:
        if word.upper() in total_currency_list:
            remove_list.append(word)
    for remove in remove_list:
        if remove in common_word_list:
            common_word_list.remove(remove)
    common_word_list = list(set(common_word_list))
    for i in range(len(new_text_splits_list)):
        for common_word in common_word_list:
@ -219,12 +311,22 @@ def split_words_without_space(text: str):
    # if len(splits) > 1:
    #     return text
    # find all words with capital letter + lower letter
-    regex = r'[A-Z][a-z]+'
+    regex = r"[A-Z][a-z]+"
    regex2 = r"[A-Z]{2,}[a-z]+"
    word_list = re.findall(regex, text)
    word_list2 = re.findall(regex2, text)
    if len(word_list) > 0:
        for word in word_list:
-            text = text.replace(word, ' ' + word + ' ')
+            if len(word_list2) > 0:
-        text = re.sub(r'(\s)+', ' ', text)
+                word_exists_in_word2 = False
                for word2 in word_list2:
                    if word in word2:
                        word_exists_in_word2 = True
                        break
                if word_exists_in_word2:
                    continue
            text = text.replace(word, " " + word + " ")
        text = re.sub(r"(\s)+", " ", text)
    return text.strip()
@ -332,6 +434,8 @@ def replace_abbrevation(text: str):
        text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
    elif 'swedish kronor' in text.lower():
        text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
    elif "GPB" in text.split():
        text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
    elif 'sterling' in text.lower().split():
        text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
    elif 'euro' in text.lower().split():
@ -342,7 +446,7 @@ def replace_abbrevation(text: str):
        text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
    elif '£' in text.lower().split():
        text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
-    elif 'RMB' in text.lower().split():
+    elif 'RMB' in text.split():
        text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
    else:
        pass