Restore the algorithm to the better version

2024-09-26 19:25:17 -05:00 · 2024-09-26 19:25:17 -05:00 · 7eba9a52ae
parent d25bae936c
commit 7eba9a52ae
2 changed files with 145 additions and 249 deletions
--- a/main.py
+++ b/main.py
@ -564,8 +564,10 @@ def test_data_extraction_metrics():


 def test_mapping_raw_name():
-    doc_id = "391456740"
-    raw_name = "Robeco Multi Asset Sustainable D EUR"
+    doc_id = "344636875"
+    raw_fund_name = ""
+    raw_share_name = ""
+    raw_name = "Aberdeen Standard Alpha Global Loans I QInc USD"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@ -575,7 +577,9 @@ def test_mapping_raw_name():
        output_data_folder=output_folder,
    )
    mapping_info = data_mapping.matching_with_database(
-        raw_name=raw_name, parent_id=None, matching_type="share"
+        raw_name=raw_name,
+        parent_id="FS0000DA0E", 
+        matching_type="share"
    )
    print(mapping_info)

@ -653,10 +657,11 @@ if __name__ == "__main__":
        "486378555",
        "506559375",
        "479793787",
-        "333207452"
+        "333207452",
+        "471641628",
    ]
    special_doc_id_list = check_mapping_doc_id_list
-    # special_doc_id_list = ["333207452"]
+    # special_doc_id_list = ["402181770"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -2,55 +2,6 @@ import re
 from copy import deepcopy
 from traceback import print_exc

-
-total_currency_list = [
-    "USD",
-    "EUR",
-    "AUD",
-    "JPY",
-    "CHF",
-    "GBP",
-    "SEK",
-    "CNY",
-    "NZD",
-    "CNH",
-    "NOK",
-    "SGD",
-    "HKD",
-    "ZAR",
-    "PLN",
-    "CAD",
-    "CZK",
-    "HUF",
-    "DKK",
-    "BRL",
-    "SKK",
-    "RON",
-    "TRY",
-    "BGN",
-    "CUP",
-    "MXN",
-    "TOP",
-    "ILS",
-    "CLF",
-    "XCD",
-    "ISK",
-    "IDR",
-    "MNT",
-    "AED",
-    "AFN",
-    "INR",
-    "ESP",
-    "RUB",
-    "CLP",
-    "KRW",
-    "ETB",
-    "DZD",
-    "XEU",
-    "XFO",
-]
-
-
 def add_slash_to_text_as_regex(text: str):
    if text is None or len(text) == 0:
        return text
@ -68,49 +19,35 @@ def add_slash_to_text_as_regex(text: str):
 def clean_text(text: str) -> str:
    # text = text.lower()
    # replace literal escape sequences that begin with \u (e.g. \u2004 or \u00a0) with a space
-    text = re.sub(r"\\u[A-Z0-9a-z]{4}", " ", text)
-    text = re.sub(r"( ){2,}", " ", text.strip())
+    text = re.sub(r"\\u[A-Z0-9a-z]{4}", ' ', text)
+    text = re.sub(r"( ){2,}", ' ', text.strip())
    return text


-def get_most_similar_name(
-    text: str, name_list: list, pre_common_word_list: list = None
-) -> str:
+def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list = None) -> str:
    """
    Get the most similar fund name from fund_name_list by jacard similarity
    """
    try:
        copy_fund_name_list = deepcopy(name_list)
-        if (
-            text is None
-            or len(text.split()) == 0
-            or copy_fund_name_list is None
-            or len(copy_fund_name_list) == 0
-        ):
+        if text is None or len(text.split()) == 0 or \
+                copy_fund_name_list is None or len(copy_fund_name_list) == 0:
            return None, None
        
-        copy_fund_name_list = [
-            replace_abbrevation(copy_fund_name)
-            for copy_fund_name in copy_fund_name_list
-        ]
-        
-        copy_fund_name_list = [
-            replace_abbrevation(remove_special_characters(copy_fund_name))
-            for copy_fund_name in copy_fund_name_list
-        ]
+        copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name 
+                               in copy_fund_name_list]

        # get common words in fund_name_list
        common_word_list = []
        if len(name_list) > 1:
            _, common_word_list = remove_common_word(copy_fund_name_list)
        if pre_common_word_list is not None and len(pre_common_word_list) > 0:
-            common_word_list.extend(
-                [word for word in pre_common_word_list if word not in common_word_list]
-            )
+            common_word_list.extend([word for word in pre_common_word_list
+                                     if word not in common_word_list])

        text = text.strip()
+        text = remove_special_characters(text)
        text = replace_abbrevation(text)
-        text = replace_abbrevation(remove_special_characters(text))
        text_splits = text.split()
        if len(text_splits) == 1:
            text = split_words_without_space(text)
@ -128,46 +65,29 @@ def get_most_similar_name(
                    # remove word in fund_name_list
                    for i in range(len(copy_fund_name_list)):
                        temp_splits = copy_fund_name_list[i].split()
-                        copy_fund_name_list[i] = " ".join(
-                            [
-                                split
-                                for split in temp_splits
-                                if remove_special_characters(split).lower() != word
-                            ]
-                        )
+                        copy_fund_name_list[i] = ' '.join([split for split in temp_splits 
+                                                           if remove_special_characters(split).lower() != word])

            for i in range(len(copy_fund_name_list)):
                temp_splits = copy_fund_name_list[i].split()
-                copy_fund_name_list[i] = " ".join(
-                    [
-                        split
-                        for split in temp_splits
-                        if remove_special_characters(split).lower()
-                        not in ["fund", "portfolio", "class", "share", "shares"]
-                    ]
-                )
+                copy_fund_name_list[i] = ' '.join([split for split in temp_splits
+                                                   if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
            final_splits = []
            for split in new_splits:
-                if split.lower() not in [
-                    "fund",
-                    "portfolio",
-                    "class",
-                    "share",
-                    "shares",
-                ]:
+                if split.lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']:
                    final_splits.append(split)

-            text = " ".join(final_splits)
+            text = ' '.join(final_splits)
        max_similarity = 0
        max_similarity_fund_name = None
        text = remove_special_characters(text)
        text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
-        for fund_name, copy_fund_name in zip(name_list, copy_fund_name_list):
+        for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
            copy_fund_name = remove_special_characters(copy_fund_name)
            copy_fund_name = split_words_without_space(copy_fund_name)
-            similarity = get_jacard_similarity(
-                text, copy_fund_name, need_remove_numeric_characters=False
-            )
+            similarity = get_jacard_similarity(text,
+                                            copy_fund_name,
+                                            need_remove_numeric_characters=False)
            if similarity > max_similarity:
                max_similarity = similarity
                max_similarity_fund_name = fund_name
@ -185,6 +105,12 @@ def get_most_similar_name(
 def update_for_currency(text: str, compare_list: list):
    text_split = text.split()
    with_currency = False
+    total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY', 
+                           'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD', 
+                           'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN', 
+                           'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR', 
+                           'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW', 
+                           'ETB', 'DZD', 'XEU', 'XFO']
    for split in text_split:
        if split.upper() in total_currency_list:
            with_currency = True
@ -212,7 +138,7 @@ def update_for_currency(text: str, compare_list: list):
            if len(without_currency_list) > 0:
                for index in without_currency_list:
                    if last_split in compare_list[index].split():
-                        text = text + " " + "USD"
+                        text = text + ' ' + 'USD'
                        updated = True
                        break
            if not updated:
@ -220,26 +146,23 @@ def update_for_currency(text: str, compare_list: list):
                for index in with_currency_list:
                    compare_split = compare_list[index].split()
                    if last_split in compare_split:
-                        current_currency_list = [
-                            split
-                            for split in compare_split
-                            if split.upper() in total_currency_list
-                        ]
+                        current_currency_list = [split for split in compare_split 
+                                                 if split.upper() in total_currency_list]
                        if len(current_currency_list) > 0:
                            currency_list.append(current_currency_list[-1])
                if len(currency_list) == 1:
-                    text = text + " " + currency_list[0]
+                    text = text + ' ' + currency_list[0]
                    updated = True        
                        
        for index in without_currency_list:
-            compare_list[index] = compare_list[index] + " " + "USD"
+            compare_list[index] = compare_list[index] + ' ' + 'USD'
            
        if not updated:
-            text = text + " " + "USD"
+            text = text + ' ' + 'USD'
        return text, compare_list
    elif with_currency and len(without_currency_list) == 0:
        for index in without_currency_list:
-            compare_list[index] = compare_list[index] + " " + "USD"
+            compare_list[index] = compare_list[index] + ' ' + 'USD'
        return text, compare_list
    else:
        return text, compare_list
@ -253,60 +176,35 @@ def remove_common_word(text_list: list):
        text = text.lower()
        text = remove_special_characters(text)
        text_splits = text.split()
-        while "fund" in text_splits:
-            text_splits.remove("fund")
-        while "portfolio" in text_splits:
-            text_splits.remove("portfolio")
-        while "share" in text_splits:
-            text_splits.remove("share")
-        while "class" in text_splits:
-            text_splits.remove("class")
-        text = " ".join(text_splits)
+        while 'fund' in text_splits:
+            text_splits.remove('fund')
+        while 'portfolio' in text_splits:
+            text_splits.remove('portfolio')
+        while 'share' in text_splits:
+            text_splits.remove('share')
+        while 'class' in text_splits:
+            text_splits.remove('class')
+        text = ' '.join(text_splits)
        new_text_list.append(text)
    # remove common word in new_text_list, such as 'Blackrock Global Fund' and 'Blackrock Growth Fund', then 'Blackrock', 'Fund' are common words
    # the result is ['Global', 'Growth']
    common_word_list = []
    new_text_splits_list = [text.split() for text in new_text_list]
    for i in range(len(new_text_splits_list)):
-        for j in range(i + 1, len(new_text_splits_list)):
+        for j in range(i+1, len(new_text_splits_list)):
            if common_word_list is None or len(common_word_list) == 0:
                common_word_list = list(
-                    set(new_text_splits_list[i]).intersection(
-                        set(new_text_splits_list[j])
-                    )
-                )
+                    set(new_text_splits_list[i]).intersection(set(new_text_splits_list[j])))
            else:
                common_word_list = list(
-                    set(common_word_list).intersection(set(new_text_splits_list[j]))
-                )
+                    set(common_word_list).intersection(set(new_text_splits_list[j])))
    common_word_list = list(set(common_word_list))
-    
-    remove_list = []
-    # if exists the share name and currency name, remove from the list
-    for word in common_word_list:
-        if word.upper() in total_currency_list:
-            remove_list.append(word)
-    for text in new_text_list:
-        text_splits = text.split()
-        if len(text_splits) < 4:
-            continue
-        # get last 3 words from text_splits
-        last_three_words = text_splits[-3:]
-        for word in common_word_list:
-            if word not in remove_list and \
-                word.upper() == word and \
-                word in last_three_words:
-                remove_list.append(word)
-    for remove in remove_list:
-        if remove in common_word_list:
-            common_word_list.remove(remove)
-    
    for i in range(len(new_text_splits_list)):
        for common_word in common_word_list:
            if common_word in new_text_splits_list[i]:
                new_text_splits_list[i].remove(common_word)
-    new_text_list = [" ".join(text_splits) for text_splits in new_text_splits_list]
-    
+    new_text_list = [' '.join(text_splits)
+                     for text_splits in new_text_splits_list]
    return new_text_list, common_word_list


@ -321,22 +219,21 @@ def split_words_without_space(text: str):
    # if len(splits) > 1:
    #     return text
    # find all words with capital letter + lower letter
-    regex = r"[A-Z][a-z]+"
+    regex = r'[A-Z][a-z]+'
    word_list = re.findall(regex, text)
    if len(word_list) > 0:
        for word in word_list:
-            text = text.replace(word, " " + word + " ")
-        text = re.sub(r"(\s)+", " ", text)
+            text = text.replace(word, ' ' + word + ' ')
+        text = re.sub(r'(\s)+', ' ', text)
    return text.strip()


 def remove_special_characters(text):
-    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
-    text = re.sub(r"\s+", " ", text)
+    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

-
 def get_unique_words_text(text):
    text = remove_special_characters(text)
    text = text.lower()
@ -344,24 +241,22 @@ def get_unique_words_text(text):
    text_split = list(set(text_split))
    # sort the list
    text_split.sort()
-    return_text = " ".join(text_split)
+    return_text = ' '.join(text_split)
    return return_text


 def remove_numeric_characters(text):
    # remove numeric characters
-    text = re.sub(r"\d+", " ", text)
-    text = re.sub(r"\s+", " ", text)
+    text = re.sub(r'\d+', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


-def get_jacard_similarity(
-    text_left,
+def get_jacard_similarity(text_left,
                          text_right,
                          need_remove_special_characters=True,
-    need_remove_numeric_characters=True,
-):
+                          need_remove_numeric_characters=True):
    if need_remove_special_characters:
        text_left = remove_special_characters(text_left)
        text_right = remove_special_characters(text_right)
@ -379,7 +274,6 @@ def get_jacard_similarity(
    else:
        return 0

-
 def get_beginning_common_words(text_list: list):
    """
    Get the beginning common words in text_list
@ -404,89 +298,86 @@ def get_beginning_common_words(text_list: list):
        else:
            break
    
-    return " ".join(common_words_list).strip()
-
+    return ' '.join(common_words_list).strip()

 def replace_abbrevation(text: str):
    if text is None or len(text.strip()) == 0:
        return text
    text = text.strip()
-    if "swiss franc" in text.lower():
-        text = re.sub(r"swiss\s+franc", "CHF", text, flags=re.IGNORECASE)
-    elif "us dollar" in text.lower():
-        text = re.sub(r"us\s+dollar", "USD", text, flags=re.IGNORECASE)
-    elif "singapore dollar" in text.lower():
-        text = re.sub(r"singapore\s+dollar", "SGD", text, flags=re.IGNORECASE)
-    elif "hong kong dollar" in text.lower():
-        text = re.sub(r"hong\s+kong\s+dollar", "HKD", text, flags=re.IGNORECASE)
-    elif "hongkong dollar" in text.lower():
-        text = re.sub(r"hongkong\s+dollar", "HKD", text, flags=re.IGNORECASE)
-    elif "australian dollar" in text.lower():
-        text = re.sub(r"australian\s+dollar", "AUD", text, flags=re.IGNORECASE)
-    elif "japanese yen" in text.lower():
-        text = re.sub(r"japanese\s+yen", "JPY", text, flags=re.IGNORECASE)
-    elif "south african rand" in text.lower():
-        text = re.sub(r"South\s+African\s+rand", "ZAR", text, flags=re.IGNORECASE)
-    elif "canadian dollar" in text.lower():
-        text = re.sub(r"canadian\s+dollar", "CAD", text, flags=re.IGNORECASE)
-    elif "new zealand dollar" in text.lower():
-        text = re.sub(r"new\s+zealand\s+dollar", "NZD", text, flags=re.IGNORECASE)
-    elif "norwegian krone" in text.lower():
-        text = re.sub(r"norwegian\s+krone", "NOK", text, flags=re.IGNORECASE)
-    elif "danish krone" in text.lower():
-        text = re.sub(r"danish\s+krone", "DKK", text, flags=re.IGNORECASE)
-    elif "swedish krona" in text.lower():
-        text = re.sub(r"swedish\s+krona", "SEK", text, flags=re.IGNORECASE)
-    elif "swedish kronor" in text.lower():
-        text = re.sub(r"swedish\s+kronor", "SEK", text, flags=re.IGNORECASE)
-    elif "GPB" in text.split():
-        text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
-    elif "sterling" in text.lower().split():
-        text = re.sub(r"sterling", "GBP", text, flags=re.IGNORECASE)
-    elif "euro" in text.lower().split():
-        text = re.sub(r"euro", "EUR", text, flags=re.IGNORECASE)
-    elif "€" in text.lower().split():
-        text = re.sub(r"\€", "EUR", text, flags=re.IGNORECASE)
-    elif "$" in text.lower().split():
-        text = re.sub(r"\$", "USD", text, flags=re.IGNORECASE)
-    elif "£" in text.lower().split():
-        text = re.sub(r"\£", "GBP", text, flags=re.IGNORECASE)
-    elif "RMB" in text.split():
-        text = re.sub(r"RMB", "CNY", text, flags=re.IGNORECASE)
+    if 'swiss franc' in text.lower():
+        text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE)
+    elif 'us dollar' in text.lower():
+        text = re.sub(r'us\s+dollar', 'USD', text, flags=re.IGNORECASE)
+    elif 'singapore dollar' in text.lower():
+        text = re.sub(r'singapore\s+dollar', 'SGD', text, flags=re.IGNORECASE)
+    elif 'hong kong dollar' in text.lower():
+        text = re.sub(r'hong\s+kong\s+dollar', 'HKD', text, flags=re.IGNORECASE)
+    elif 'hongkong dollar' in text.lower():
+        text = re.sub(r'hongkong\s+dollar', 'HKD', text, flags=re.IGNORECASE)
+    elif 'australian dollar' in text.lower():
+        text = re.sub(r'australian\s+dollar', 'AUD', text, flags=re.IGNORECASE)
+    elif 'japanese yen' in text.lower():
+        text = re.sub(r'japanese\s+yen', 'JPY', text, flags=re.IGNORECASE)
+    elif 'south african rand' in text.lower():
+        text = re.sub(r'South\s+African\s+rand', 'ZAR', text, flags=re.IGNORECASE)
+    elif 'canadian dollar' in text.lower():
+        text = re.sub(r'canadian\s+dollar', 'CAD', text, flags=re.IGNORECASE)
+    elif 'new zealand dollar' in text.lower():
+        text = re.sub(r'new\s+zealand\s+dollar', 'NZD', text, flags=re.IGNORECASE)
+    elif 'norwegian krone' in text.lower():
+        text = re.sub(r'norwegian\s+krone', 'NOK', text, flags=re.IGNORECASE)
+    elif 'danish krone' in text.lower():
+        text = re.sub(r'danish\s+krone', 'DKK', text, flags=re.IGNORECASE)
+    elif 'swedish krona' in text.lower():
+        text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
+    elif 'swedish kronor' in text.lower():
+        text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
+    elif 'sterling' in text.lower().split():
+        text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
+    elif 'euro' in text.lower().split():
+        text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
+    elif '€' in text.lower().split():
+        text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE)
+    elif '$' in text.lower().split():
+        text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
+    elif '£' in text.lower().split():
+        text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
+    elif 'RMB' in text.lower().split():
+        text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
    else:
        pass
    
    text_splits = text.split()
    new_text_splits = []
    for split in text_splits:
-        if split.lower() in ["acc", "acc."]:
-            new_text_splits.append("Accumulation")
-        elif split.lower() in ["inc", "inc."]:
-            new_text_splits.append("Income")
-        elif split.lower() in ["dist", "dist."]:
-            new_text_splits.append("Distribution")
-        elif split.lower() in ["inv", "inv."]:
-            new_text_splits.append("Investor")
-        elif split.lower() in ["inst", "inst.", "institution"]:
-            new_text_splits.append("Institutional")
-        elif split.lower() in ["cap", "cap."]:
-            new_text_splits.append("Capitalisation")
-        elif split.lower() in ["adm", "adm."]:
-            new_text_splits.append("Admin")
-        elif split.lower() in ["adv", "adv."]:
-            new_text_splits.append("Advantage")
-        elif split.lower() in ["hdg", "hgd", "hdg.", "hgd.", "(h)"]:
-            new_text_splits.append("Hedged")
-        elif split.lower() in ["cl", "cl."]:
-            new_text_splits.append("Class")
-        elif split.lower() in ["ser", "ser."]:
-            new_text_splits.append("Series")
-        elif split.lower() in ["u.s."]:
-            new_text_splits.append("US")
-        elif split.lower() in ["nc", "nc."]:
-            new_text_splits.append("no trail")
+        if split.lower() in ['acc', 'acc.']:
+            new_text_splits.append('Accumulation')
+        elif split.lower() in ['inc', 'inc.']:
+            new_text_splits.append('Income')
+        elif split.lower() in ['dist', 'dist.']:
+            new_text_splits.append('Distribution')
+        elif split.lower() in ['inv', 'inv.']:
+            new_text_splits.append('Investor')
+        elif split.lower() in ['inst', 'inst.', 'institution']:
+            new_text_splits.append('Institutional')
+        elif split.lower() in ['cap', 'cap.']:
+            new_text_splits.append('Capitalisation')
+        elif split.lower() in ['adm', 'adm.']:
+            new_text_splits.append('Admin')
+        elif split.lower() in ['adv', 'adv.']:
+            new_text_splits.append('Advantage')
+        elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
+            new_text_splits.append('Hedged')
+        elif split.lower() in ['cl', 'cl.']:
+            new_text_splits.append('Class')
+        elif split.lower() in ['ser', 'ser.']:
+            new_text_splits.append('Series')
+        elif split.lower() in ['u.s.']:
+            new_text_splits.append('US')
+        elif split.lower() in ['nc', 'nc.']:
+            new_text_splits.append('no trail')
        else:
            new_text_splits.append(split)
    
-    new_text = " ".join(new_text_splits)
+    new_text = ' '.join(new_text_splits)
    return new_text