# dc-ml-emea-ar/utils/biz_utils.py

import re
from copy import deepcopy
from traceback import print_exc


# Currency codes recognised by the helpers in this module. Tokens are
# upper-cased before being looked up here, so every entry is upper case.
total_currency_list = [
    "USD", "EUR", "AUD", "JPY", "CHF", "GBP", "SEK", "CNY",
    "NZD", "CNH", "NOK", "SGD", "HKD", "ZAR", "PLN", "CAD",
    "CZK", "HUF", "DKK", "BRL", "SKK", "RON", "TRY", "BGN",
    "CUP", "MXN", "TOP", "ILS", "CLF", "XCD", "ISK", "IDR",
    "MNT", "AED", "AFN", "INR", "ESP", "RUB", "CLP", "KRW",
    "ETB", "DZD", "XEU", "XFO",
]


def add_slash_to_text_as_regex(text: str):
    """
    Turn plain text into a regex pattern string.

    Every non-word, non-whitespace character gets a backslash in front of it,
    and every run of whitespace is replaced with the two-character sequence
    backslash + "s+" so the pattern tolerates any amount of whitespace.

    A character is left untouched when its escaped form already appears
    somewhere in the text, so already-escaped input is not escaped twice.
    Backslashes themselves are never doubled.

    Returns the input unchanged when it is None or empty.
    """
    if text is None or len(text) == 0:
        return text
    # Raw string on purpose: "\W" in a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    for special_match in re.finditer(r"\W", text):
        special_char = special_match.group()
        if len(special_char.strip()) == 0:
            # Whitespace is rewritten in a single pass below.
            continue
        escaped = "\\" + special_char
        if escaped not in text:
            # Used as a pattern, `escaped` matches the literal character;
            # used as a replacement template it is emitted verbatim. The one
            # exception is the backslash, where the template collapses back
            # to a single backslash and the text stays as it was.
            text = re.sub(escaped, escaped, text)
    text = re.sub(r"\s+", r"\\s+", text)
    return text


def clean_text(text: str) -> str:
    """
    Normalise spacing in text that contains escaped unicode sequences.

    Any literal backslash-u followed by four alphanumeric characters
    (for example the six characters of an escaped u00a0) becomes one space.
    The result is stripped and runs of two or more spaces are collapsed.
    """
    without_escapes = re.sub(r"\\u[A-Z0-9a-z]{4}", " ", text)
    return re.sub(r"( ){2,}", " ", without_escapes.strip())


def get_most_similar_name(
    text: str, name_list: list, pre_common_word_list: list = None
) -> tuple:
    """
    Pick the entry of name_list that is most similar to text, scored with
    get_jacard_similarity (Jaccard similarity over lower-cased word sets).

    Both text and the candidate names are normalised before scoring:
    abbreviations are expanded, special characters removed, camel-case words
    split and currencies aligned via update_for_currency. The scoring works
    on a deep copy, and the returned name is the original, untouched entry.

    Args:
        text: the name to look up.
        name_list: candidate names.
        pre_common_word_list: optional extra words treated as common words
            in addition to the ones detected across the candidates.

    Returns:
        A tuple (name, similarity):
        - (None, None) when text has no words or name_list is None/empty;
        - (None, similarity) when the best similarity is below 0.35;
        - (best matching original name, similarity) otherwise;
        - (None, 0.0) when any exception is raised (it is printed, not
          propagated).
    """
    try:
        # Work on a copy: the normalisation below (and update_for_currency)
        # rewrites the list entries.
        copy_fund_name_list = deepcopy(name_list)
        if (
            text is None
            or len(text.split()) == 0
            or copy_fund_name_list is None
            or len(copy_fund_name_list) == 0
        ):
            return None, None

        # First pass expands abbreviations while punctuation such as "$" or
        # "acc." is still present ...
        copy_fund_name_list = [
            replace_abbrevation(copy_fund_name)
            for copy_fund_name in copy_fund_name_list
        ]

        # ... second pass strips special characters and expands again.
        copy_fund_name_list = [
            replace_abbrevation(remove_special_characters(copy_fund_name))
            for copy_fund_name in copy_fund_name_list
        ]

        # get common words in fund_name_list (only meaningful with 2+ names)
        common_word_list = []
        if len(name_list) > 1:
            _, common_word_list = remove_common_word(copy_fund_name_list)
        if pre_common_word_list is not None and len(pre_common_word_list) > 0:
            common_word_list.extend(
                [word for word in pre_common_word_list if word not in common_word_list]
            )

        # Apply the same normalisation to the query text.
        text = text.strip()
        text = replace_abbrevation(text)
        text = replace_abbrevation(remove_special_characters(text))
        text_splits = text.split()
        if len(text_splits) == 1:
            # Single token: only split camel-case; the common-word and
            # stop-word filtering below is skipped on this path.
            text = split_words_without_space(text)
        else:
            new_splits = []
            for split in text_splits:
                if len(split) > 1:
                    new_splits.extend(split_words_without_space(split).split())
                else:
                    new_splits.append(split)

            lower_new_splits = [split.lower() for split in new_splits]
            for word in common_word_list:
                if word not in lower_new_splits:
                    # remove word in fund_name_list: a common word the query
                    # does not contain would only lower every score
                    for i in range(len(copy_fund_name_list)):
                        temp_splits = copy_fund_name_list[i].split()
                        copy_fund_name_list[i] = " ".join(
                            [
                                split
                                for split in temp_splits
                                if remove_special_characters(split).lower() != word
                            ]
                        )

            # Drop generic words from every candidate ...
            for i in range(len(copy_fund_name_list)):
                temp_splits = copy_fund_name_list[i].split()
                copy_fund_name_list[i] = " ".join(
                    [
                        split
                        for split in temp_splits
                        if remove_special_characters(split).lower()
                        not in ["fund", "portfolio", "class", "share", "shares"]
                    ]
                )
            # ... and from the query text.
            final_splits = []
            for split in new_splits:
                if split.lower() not in [
                    "fund",
                    "portfolio",
                    "class",
                    "share",
                    "shares",
                ]:
                    final_splits.append(split)

            text = " ".join(final_splits)
        max_similarity = 0
        max_similarity_fund_name = None
        text = remove_special_characters(text)
        text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
        # name_list and copy_fund_name_list are index-aligned, so the original
        # name can be returned for the best-scoring normalised candidate.
        for fund_name, copy_fund_name in zip(name_list, copy_fund_name_list):
            copy_fund_name = remove_special_characters(copy_fund_name)
            copy_fund_name = split_words_without_space(copy_fund_name)
            similarity = get_jacard_similarity(
                text, copy_fund_name, need_remove_numeric_characters=False
            )
            # Strict ">" keeps the first candidate on ties.
            if similarity > max_similarity:
                max_similarity = similarity
                max_similarity_fund_name = fund_name
            if max_similarity == 1:
                # Perfect match, no need to look further.
                break
        if max_similarity < 0.35:
            return None, max_similarity
        return max_similarity_fund_name, max_similarity
    except Exception as e:
        # Best effort: report the failure and signal "no match".
        print(e)
        print_exc()
        return None, 0.0


def _contains_currency_code(words) -> bool:
    """Return True when any word, upper-cased, is in total_currency_list."""
    return any(word.upper() in total_currency_list for word in words)


def update_for_currency(text: str, compare_list: list):
    """
    Align currency information between text and the names in compare_list,
    treating a missing currency as USD.

    Rules:
    - Neither text nor any candidate mentions a currency: nothing changes.
    - text mentions a currency: " USD" is appended to every candidate that
      mentions none.
    - text mentions no currency but some candidate does: " USD" is appended
      to every candidate without a currency, and a currency is appended to
      text. When the last word of text is short (under 4 characters) and
      equal to its own upper-cased form, it is looked up in the candidates:
        * found in a candidate without currency -> text gets USD;
        * otherwise found in candidates with a currency that yield exactly
          one currency token -> text gets that token;
      in every other case text gets USD.
    - text has no words at all: nothing changes.

    Note: compare_list is modified in place and also returned.

    Returns:
        (text, compare_list) after the adjustments.
    """
    text_split = text.split()
    with_currency = _contains_currency_code(text_split)

    # Indexes of the candidates that do / do not mention a currency.
    with_currency_list = []
    without_currency_list = []
    for index, compare in enumerate(compare_list):
        if _contains_currency_code(compare.split()):
            with_currency_list.append(index)
        else:
            without_currency_list.append(index)

    if with_currency:
        # The original condition here was `len(without_currency_list) == 0`,
        # which made this loop a no-op; candidates lacking a currency are the
        # ones that need the USD default.
        for index in without_currency_list:
            compare_list[index] = compare_list[index] + " " + "USD"
        return text, compare_list

    if len(with_currency_list) == 0 or len(text_split) == 0:
        # Nothing to align, or no last word to inspect (avoids IndexError).
        return text, compare_list

    last_split = text_split[-1]
    updated = False
    if len(last_split) < 4 and last_split.upper() == last_split:
        for index in without_currency_list:
            if last_split in compare_list[index].split():
                text = text + " " + "USD"
                updated = True
                break
        if not updated:
            currency_list = []
            for index in with_currency_list:
                compare_split = compare_list[index].split()
                if last_split in compare_split:
                    current_currency_list = [
                        split
                        for split in compare_split
                        if split.upper() in total_currency_list
                    ]
                    if len(current_currency_list) > 0:
                        currency_list.append(current_currency_list[-1])
            # Only trust the lookup when it is unambiguous.
            if len(currency_list) == 1:
                text = text + " " + currency_list[0]
                updated = True

    for index in without_currency_list:
        compare_list[index] = compare_list[index] + " " + "USD"

    if not updated:
        text = text + " " + "USD"
    return text, compare_list


def remove_common_word(text_list: list):
    """
    Find the words shared by every name in text_list and strip them.

    Each name is lower-cased, cleaned with remove_special_characters and has
    the words "fund", "portfolio", "share" and "class" removed. A word is
    "common" only when it occurs in all of the cleaned names (at least two
    names are required). Currency codes are never treated as common words.

    Example: 'Blackrock Global Fund' and 'Blackrock Growth Fund' give the
    common word 'blackrock' and the reduced names ['global', 'growth'].

    Returns:
        (new_text_list, common_word_list): the cleaned names with the first
        occurrence of each common word removed, and the sorted common words.

        NOTE(review): for a None/empty text_list the input itself is
        returned (not a 2-tuple); kept as-is for backward compatibility.
    """
    if text_list is None or len(text_list) == 0:
        return text_list

    ignored_words = {"fund", "portfolio", "share", "class"}
    new_text_list = []
    for text in text_list:
        words = remove_special_characters(text.lower()).split()
        new_text_list.append(
            " ".join(word for word in words if word not in ignored_words)
        )
    new_text_splits_list = [text.split() for text in new_text_list]

    # Intersect across ALL names. The previous pairwise loop re-seeded the
    # running intersection whenever it became empty, so words shared by only
    # some of the names could be reported as common.
    if len(new_text_splits_list) > 1:
        shared_words = set(new_text_splits_list[0]).intersection(
            *new_text_splits_list[1:]
        )
        # Sorted so the result does not depend on set iteration order.
        common_word_list = sorted(shared_words)
    else:
        common_word_list = []

    # if exists the share name and currency name, remove from the list
    remove_list = [
        word for word in common_word_list if word.upper() in total_currency_list
    ]
    for text in new_text_list:
        text_splits = text.split()
        if len(text_splits) < 4:
            continue
        last_three_words = text_splits[-3:]
        for word in common_word_list:
            # NOTE(review): words are lower-cased above, so this upper-case
            # test only passes for tokens without letters (e.g. digits);
            # confirm whether share-class letters were meant to match too.
            if (
                word not in remove_list
                and word.upper() == word
                and word in last_three_words
            ):
                remove_list.append(word)
    common_word_list = [word for word in common_word_list if word not in remove_list]

    for text_splits in new_text_splits_list:
        for common_word in common_word_list:
            if common_word in text_splits:
                # list.remove drops the first occurrence only.
                text_splits.remove(common_word)
    new_text_list = [" ".join(text_splits) for text_splits in new_text_splits_list]

    return new_text_list, common_word_list


def split_words_without_space(text: str):
    """
    Insert spaces between capitalised words that were written together,
    e.g. 'BlackrockGlobalFund' becomes 'Blackrock Global Fund'.

    A "word" is one upper-case ASCII letter followed by one or more
    lower-case ASCII letters; runs of capitals such as 'USD' are left alone.

    Returns:
        The re-spaced, stripped string. None or blank input yields an empty
        string, so callers can always treat the result as a str.
    """
    if text is None or len(text.strip()) == 0:
        return ""
    text = text.strip()
    # Pad each match where it stands. Replacing by value (str.replace) also
    # hit longer words sharing the same prefix: 'Euro European' used to
    # become 'Euro Euro pean'.
    text, padded_count = re.subn(
        r"[A-Z][a-z]+", lambda match: " " + match.group() + " ", text
    )
    if padded_count > 0:
        # Whitespace is only normalised when something was actually split.
        text = re.sub(r"(\s)+", " ", text)
    return text.strip()


def remove_special_characters(text):
    """
    Replace every character that is not an ASCII letter, a digit or
    whitespace with a space, collapse whitespace runs to one space and
    strip the result.
    """
    only_alnum_and_space = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", only_alnum_and_space).strip()


def get_unique_words_text(text):
    """
    Return the distinct words of text, lower-cased, with special characters
    removed, sorted and joined by single spaces.
    """
    distinct_words = set(remove_special_characters(text).lower().split())
    return " ".join(sorted(distinct_words))


def remove_numeric_characters(text):
    """
    Replace each run of digits with a space, collapse whitespace runs to
    one space and strip the result.
    """
    without_digits = re.sub(r"\d+", " ", text)
    return re.sub(r"\s+", " ", without_digits).strip()


def get_jacard_similarity(
    text_left,
    text_right,
    need_remove_special_characters=True,
    need_remove_numeric_characters=True,
):
    """
    Jaccard similarity between the word sets of two texts.

    Both texts are optionally cleaned with remove_special_characters and
    remove_numeric_characters, then lower-cased and split on whitespace.

    Returns:
        len(intersection) / len(union) rounded to 3 decimals, or 0 when
        neither text contains any word.
    """
    texts = [text_left, text_right]
    if need_remove_special_characters:
        texts = [remove_special_characters(item) for item in texts]
    if need_remove_numeric_characters:
        texts = [remove_numeric_characters(item) for item in texts]

    left_words, right_words = (set(item.lower().split()) for item in texts)
    all_words = left_words | right_words
    if not all_words:
        return 0
    return round(len(left_words & right_words) / len(all_words), 3)


def get_beginning_common_words(text_list: list):
    """
    Return the leading words shared by every text in text_list.

    Words are compared position by position (case-sensitive) and the scan
    stops at the first position where any text differs or runs out of words.

    Returns:
        The shared leading words joined by single spaces (an empty string
        when the first words already differ). An empty list is returned
        when text_list is None or holds fewer than two texts.
    """
    if text_list is None or len(text_list) < 2:
        return []

    def has_word_at(candidate, position, expected):
        parts = candidate.split()
        return position < len(parts) and parts[position] == expected

    remaining_texts = text_list[1:]
    shared_prefix = []
    for position, word in enumerate(text_list[0].split()):
        if not all(has_word_at(other, position, word) for other in remaining_texts):
            break
        shared_prefix.append(word)

    return " ".join(shared_prefix).strip()


# Ordered currency rewrite rules: (detection kind, trigger, regex, code).
# Detection kinds:
#   "phrase"      - trigger is a substring of the lower-cased text
#   "token"       - trigger equals a whitespace-separated lower-cased token
#   "exact_token" - trigger equals a whitespace-separated token (case-sensitive)
# Only the FIRST rule that triggers is applied.
_CURRENCY_REWRITE_RULES = (
    ("phrase", "swiss franc", r"swiss\s+franc", "CHF"),
    # Leading \b so that e.g. "Plus Dollar" is not turned into "PlUSD".
    ("phrase", "us dollar", r"\bus\s+dollar", "USD"),
    ("phrase", "singapore dollar", r"singapore\s+dollar", "SGD"),
    ("phrase", "hong kong dollar", r"hong\s+kong\s+dollar", "HKD"),
    ("phrase", "hongkong dollar", r"hongkong\s+dollar", "HKD"),
    ("phrase", "australian dollar", r"australian\s+dollar", "AUD"),
    ("phrase", "japanese yen", r"japanese\s+yen", "JPY"),
    ("phrase", "south african rand", r"South\s+African\s+rand", "ZAR"),
    ("phrase", "canadian dollar", r"canadian\s+dollar", "CAD"),
    ("phrase", "new zealand dollar", r"new\s+zealand\s+dollar", "NZD"),
    ("phrase", "norwegian krone", r"norwegian\s+krone", "NOK"),
    ("phrase", "danish krone", r"danish\s+krone", "DKK"),
    ("phrase", "swedish krona", r"swedish\s+krona", "SEK"),
    ("phrase", "swedish kronor", r"swedish\s+kronor", "SEK"),
    # Word boundaries keep these from rewriting the inside of longer words
    # ("European" used to become "EURpean").
    ("exact_token", "GPB", r"\bGPB\b", "GBP"),
    ("token", "sterling", r"\bsterling\b", "GBP"),
    ("token", "euro", r"\beuro\b", "EUR"),
    ("token", "€", r"\€", "EUR"),
    ("token", "$", r"\$", "USD"),
    ("token", "£", r"\£", "GBP"),
    ("exact_token", "RMB", r"\bRMB\b", "CNY"),
)

# Token-level abbreviations (matched case-insensitively) and their expansion.
_TOKEN_EXPANSIONS = {
    "acc": "Accumulation",
    "acc.": "Accumulation",
    "inc": "Income",
    "inc.": "Income",
    "dist": "Distribution",
    "dist.": "Distribution",
    "inv": "Investor",
    "inv.": "Investor",
    "inst": "Institutional",
    "inst.": "Institutional",
    "institution": "Institutional",
    "cap": "Capitalisation",
    "cap.": "Capitalisation",
    "adm": "Admin",
    "adm.": "Admin",
    "adv": "Advantage",
    "adv.": "Advantage",
    "hdg": "Hedged",
    "hgd": "Hedged",
    "hdg.": "Hedged",
    "hgd.": "Hedged",
    "(h)": "Hedged",
    "cl": "Class",
    "cl.": "Class",
    "ser": "Series",
    "ser.": "Series",
    "u.s.": "US",
    "nc": "no trail",
    "nc.": "no trail",
}


def replace_abbrevation(text: str):
    """
    Normalise currency wording and expand common abbreviations in a name.

    Step 1 - currency: the first matching rule of _CURRENCY_REWRITE_RULES is
    applied (case-insensitively), e.g. 'US Dollar' -> 'USD', 'Euro' -> 'EUR',
    '$' -> 'USD', the misspelling 'GPB' -> 'GBP', 'RMB' -> 'CNY'. Word-based
    rules only replace whole words.

    Step 2 - tokens: each whitespace-separated token found in
    _TOKEN_EXPANSIONS is expanded, e.g. 'Acc' -> 'Accumulation',
    '(H)' -> 'Hedged'. Tokens are re-joined with single spaces.

    Returns:
        The normalised text; None or blank input is returned unchanged.
    """
    if text is None or len(text.strip()) == 0:
        return text
    text = text.strip()

    lowered_text = text.lower()
    exact_tokens = text.split()
    lowered_tokens = lowered_text.split()
    for kind, trigger, pattern, currency_code in _CURRENCY_REWRITE_RULES:
        if kind == "phrase":
            triggered = trigger in lowered_text
        elif kind == "token":
            triggered = trigger in lowered_tokens
        else:
            triggered = trigger in exact_tokens
        if triggered:
            text = re.sub(pattern, currency_code, text, flags=re.IGNORECASE)
            break

    return " ".join(
        _TOKEN_EXPANSIONS.get(token.lower(), token) for token in text.split()
    )