dc-ml-emea-ar/utils/biz_utils.py

import re
from copy import deepcopy

def add_slash_to_text_as_regex(text: str):
    r"""
    Turn plain text into a regex pattern that matches it literally.

    Every non-word, non-whitespace character (``.``, ``(``, ``-`` ...) is
    prefixed with a backslash, and every run of whitespace is replaced by
    the pattern ``\s+`` so the result tolerates different spacing.

    A character is only escaped when its escaped form does not already
    appear somewhere in the text; this avoids double-escaping text that was
    escaped before. Backslashes themselves are never escaped: they are
    treated as escapes the caller wrote on purpose.

    Args:
        text: the text to convert; ``None`` and ``""`` are returned as is.

    Returns:
        The escaped pattern string (or the input itself when it is empty).
    """
    if text is None or len(text) == 0:
        return text
    # Raw string: "\W" in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    for special_match in re.finditer(r"\W", text):
        special_char = special_match.group()
        # Whitespace is handled in one go at the end.
        if len(special_char.strip()) == 0:
            continue
        # Existing backslashes are left untouched.
        if special_char == "\\":
            continue
        escaped = "\\" + special_char
        if escaped not in text:
            text = text.replace(special_char, escaped)
    text = re.sub(r"\s+", r"\\s+", text)
    return text


def clean_text(text: str) -> str:
    """
    Replace literal unicode escape sequences with spaces and tidy spacing.

    Returns the text trimmed at both ends, with every run of two or more
    spaces collapsed to a single space.
    """
    # A literal backslash + "u" + four alphanumeric characters (the text
    # form of e.g. \u2004 or \u00a0) becomes a single space.
    without_escapes = re.sub(r"\\u[A-Z0-9a-z]{4}", ' ', text)
    trimmed = without_escapes.strip()
    return re.sub(r"( ){2,}", ' ', trimmed)


def get_most_similar_name(text: str, name_list: list, min_similarity: float = 0.35):
    """
    Get the most similar fund name from name_list by Jaccard similarity.

    The query text and every candidate name are normalised before scoring:
    abbreviations are expanded, special characters are removed and words
    written without spaces are split. When the query has more than one
    token, generic words ('fund', 'portfolio', 'class', 'share', 'shares')
    are dropped from both sides, and words shared by every candidate are
    dropped from the candidates unless the query contains them too. A
    single-token query is compared against the candidates without this
    word filtering.

    Args:
        text: the raw name to look up.
        name_list: candidate names; the original (un-normalised) entry of
            the best match is returned.
        min_similarity: lowest score accepted as a match.

    Returns:
        A ``(name, similarity)`` tuple:
        ``(None, None)`` when text or name_list is empty;
        ``(None, similarity)`` when the best score is below min_similarity;
        ``(None, 0.0)`` when an exception is raised while matching (the
        exception is printed, not propagated).
    """
    generic_words = {'fund', 'portfolio', 'class', 'share', 'shares'}
    try:
        if text is None or len(text.split()) == 0 or \
                name_list is None or len(name_list) == 0:
            return None, None

        # Work on a separate list so name_list keeps the original spellings.
        candidate_names = [replace_abbrevation(name) for name in name_list]

        # get common words in fund_name_list
        common_word_list = []
        if len(name_list) > 1:
            _, common_word_list = remove_common_word(candidate_names)

        # Expand abbreviations before AND after stripping punctuation:
        # '(H)' and 'U.S.' are only recognisable with their punctuation,
        # while 'Acc.' is only recognisable without it. The candidates are
        # expanded with punctuation present, so the query must be as well.
        text = replace_abbrevation(text.strip())
        text = remove_special_characters(text)
        text = replace_abbrevation(text)
        text_splits = text.split()
        if len(text_splits) == 1:
            text = split_words_without_space(text)
        else:
            new_splits = []
            for split in text_splits:
                new_splits.extend(split_words_without_space(split).split())

            lower_new_splits = {split.lower() for split in new_splits}
            # Words to drop from the candidates: the generic words plus the
            # words every candidate shares that the query does not mention.
            unwanted_words = set(generic_words)
            unwanted_words.update(word for word in common_word_list
                                  if word not in lower_new_splits)
            candidate_names = [_drop_name_tokens(name, unwanted_words)
                               for name in candidate_names]

            text = ' '.join(split for split in new_splits
                            if split.lower() not in generic_words)

        max_similarity = 0
        max_similarity_fund_name = None
        for fund_name, candidate_name in zip(name_list, candidate_names):
            candidate_name = remove_special_characters(candidate_name)
            # `or ''`: a candidate that is empty after normalisation must
            # score 0 instead of aborting the whole search.
            candidate_name = split_words_without_space(candidate_name) or ''
            similarity = get_jacard_similarity(text,
                                               candidate_name,
                                               need_remove_numeric_characters=False)
            if similarity > max_similarity:
                max_similarity = similarity
                max_similarity_fund_name = fund_name
            if max_similarity == 1:
                break
        if max_similarity < min_similarity:
            return None, max_similarity
        return max_similarity_fund_name, max_similarity
    except Exception as e:
        # Best-effort lookup: report the problem and signal "no match".
        print(e)
        return None, 0.0


def _drop_name_tokens(name: str, unwanted_words) -> str:
    """
    Return name with special characters removed and without the tokens
    whose lower-case form is in unwanted_words.

    Whole tokens are filtered (instead of replacing substrings) so that
    dropping 'Fund' cannot damage a longer word such as 'Fundsmith'.
    """
    tokens = remove_special_characters(name).split()
    return ' '.join(token for token in tokens
                    if token.lower() not in unwanted_words)


def remove_common_word(text_list: list):
    """
    Strip the words shared by every text in text_list.

    Each text is lower-cased, has its special characters removed and loses
    the generic words 'fund', 'portfolio', 'share' and 'class'. The words
    that then occur in every text are the common words; the first
    occurrence of each is removed from every text.

    Example: 'Blackrock Global Fund' and 'Blackrock Growth Fund' give
    (['global', 'growth'], ['blackrock']).

    Args:
        text_list: the texts to compare.

    Returns:
        A ``(new_text_list, common_word_list)`` tuple. common_word_list is
        sorted alphabetically. For a None or empty text_list the input is
        returned unchanged together with an empty common word list, so the
        result can always be unpacked into two values.
    """
    if text_list is None or len(text_list) == 0:
        return text_list, []

    generic_words = {'fund', 'portfolio', 'share', 'class'}
    new_text_splits_list = []
    for text in text_list:
        text_splits = remove_special_characters(text.lower()).split()
        new_text_splits_list.append(
            [word for word in text_splits if word not in generic_words])

    # A word is common only when every text contains it. With a single
    # text there is nothing to compare against, so nothing is common.
    if len(new_text_splits_list) > 1:
        common_words = set(new_text_splits_list[0]).intersection(
            *new_text_splits_list[1:])
    else:
        common_words = set()
    common_word_list = sorted(common_words)

    for text_splits in new_text_splits_list:
        for common_word in common_word_list:
            if common_word in text_splits:
                text_splits.remove(common_word)
    new_text_list = [' '.join(text_splits)
                     for text_splits in new_text_splits_list]
    return new_text_list, common_word_list


def split_words_without_space(text: str):
    """
    Separate capitalised words that were written without spaces.

    Every run of one upper-case letter followed by lower-case letters is
    surrounded by spaces, e.g. 'BlackrockGlobalFund' becomes
    'Blackrock Global Fund' and 'USDHedged' becomes 'USD Hedged'.

    Args:
        text: the text to split.

    Returns:
        The text as a single string with the words separated by one space.
        Text without any capitalised word is returned stripped but
        otherwise unchanged. None or blank input gives an empty string, so
        callers can always treat the result as a string.
    """
    if text is None or len(text.strip()) == 0:
        return ''
    text = text.strip()
    # find all words with capital letter + lower letter
    capitalised_word = r'[A-Z][a-z]+'
    if re.search(capitalised_word, text) is None:
        return text
    # Pad each match in a single pass. Replacing word by word with
    # str.replace would also hit longer words sharing the same prefix
    # ('Euro' inside 'European').
    text = re.sub(capitalised_word, r' \g<0> ', text)
    text = re.sub(r'(\s)+', ' ', text)
    return text.strip()


def remove_special_characters(text):
    """
    Replace every character that is not an ASCII letter, a digit or
    whitespace with a space, then collapse whitespace runs to one space
    and trim both ends.
    """
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', alphanumeric_only)
    return single_spaced.strip()


def remove_numeric_characters(text):
    """
    Replace every run of digits with a space, then collapse whitespace
    runs to one space and trim both ends.
    """
    without_digits = re.sub(r'\d+', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_digits)
    return single_spaced.strip()


def get_jacard_similarity(text_left,
                          text_right,
                          need_remove_special_characters=True,
                          need_remove_numeric_characters=True):
    """
    Jaccard similarity of the lower-cased word sets of two texts.

    Args:
        text_left: first text.
        text_right: second text.
        need_remove_special_characters: strip special characters from both
            texts before comparing.
        need_remove_numeric_characters: strip digits from both texts before
            comparing.

    Returns:
        Shared words divided by all distinct words, rounded to 3 decimals;
        0 when neither text contains a word.
    """
    def _word_set(raw_text):
        # Optional clean-up first, then case-insensitive tokenisation.
        cleaned = raw_text
        if need_remove_special_characters:
            cleaned = remove_special_characters(cleaned)
        if need_remove_numeric_characters:
            cleaned = remove_numeric_characters(cleaned)
        return set(cleaned.lower().split())

    left_words = _word_set(text_left)
    right_words = _word_set(text_right)
    all_words = left_words | right_words
    if len(all_words) == 0:
        return 0
    shared_words = left_words & right_words
    return round(len(shared_words) / len(all_words), 3)


def replace_abbrevation(text: str):
    """
    Expand known fund/share-class abbreviations in a text.

    The text is split on whitespace; every token that matches a known
    abbreviation (case-insensitively) is replaced by its long form and the
    tokens are joined back with single spaces. None or blank text is
    returned unchanged.
    """
    if text is None or len(text.strip()) == 0:
        return text

    # lower-case abbreviation -> long form
    long_forms = {
        'acc': 'Accumulation',
        'inc': 'Income',
        'dist': 'Distribution',
        'inv': 'Investor',
        'inst': 'Institutional',
        'institution': 'Institutional',
        'adm': 'Admin',
        'adv': 'Advantage',
        'hdg': 'Hedged',
        'hgd': 'Hedged',
        '(h)': 'Hedged',
        'cl': 'Class',
        'ser': 'Series',
        'u.s.': 'US',
        'nc': 'no trail',
    }
    expanded_tokens = [long_forms.get(token.lower(), token)
                       for token in text.strip().split()]
    return ' '.join(expanded_tokens)
support filter pages by data point keywords 2024-08-23 21:38:11 +00:00			`import re`
support auto-mapping fund/ share by raw names. 2024-09-09 22:34:53 +00:00			`from copy import deepcopy`
support filter pages by data point keywords 2024-08-23 21:38:11 +00:00
			`def add_slash_to_text_as_regex(text: str):`
			`if text is None or len(text) == 0:`
			`return text`
			`special_char_iter = re.finditer("\W", text)`
			`for special_iter in special_char_iter:`
			`if len(special_iter.group().strip()) == 0:`
			`continue`
			`replace = r"\{0}".format(special_iter.group())`
			`if replace not in text:`
			`text = re.sub(replace, replace, text)`
			`text = re.sub(r"\s+", r"\\s+", text)`
optimize prompts 2024-08-28 15:21:26 +00:00			`return text`


			`def clean_text(text: str) -> str:`
support calculate page filter metrics. 2024-09-03 22:07:53 +00:00			`# text = text.lower()`
optimize prompts 2024-08-28 15:21:26 +00:00			`# update the specical character which begin with \u, e.g \u2004 or \u00a0 to be space`
support calculate page filter metrics. 2024-09-03 22:07:53 +00:00			`text = re.sub(r"\\u[A-Z0-9a-z]{4}", ' ', text)`
optimize prompts 2024-08-28 15:21:26 +00:00			`text = re.sub(r"( ){2,}", ' ', text.strip())`
support auto-mapping fund/ share by raw names. 2024-09-09 22:34:53 +00:00			`return text`


			`def get_most_similar_name(text: str, name_list: list):`
			`"""`
			`Get the most similar fund name from fund_name_list by jacard similarity`
			`"""`
			`try:`
			`copy_fund_name_list = deepcopy(name_list)`
			`if text is None or len(text.split()) == 0 or \`
			`copy_fund_name_list is None or len(copy_fund_name_list) == 0:`
			`return None, None`

			`copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name`
			`in copy_fund_name_list]`

			`# get common words in fund_name_list`
			`common_word_list = []`
			`if len(name_list) > 1:`
			`_, common_word_list = remove_common_word(copy_fund_name_list)`

			`text = text.strip()`
			`text = remove_special_characters(text)`
			`text = replace_abbrevation(text)`
			`text_splits = text.split()`
			`if len(text_splits) == 1:`
			`text = split_words_without_space(text)`
			`else:`
			`new_splits = []`
			`for split in text_splits:`
			`if len(split) > 1:`
			`new_splits.extend(split_words_without_space(split).split())`
			`else:`
			`new_splits.append(split)`

			`lower_new_splits = [split.lower() for split in new_splits]`
			`for word in common_word_list:`
			`if word not in lower_new_splits:`
			`# remove word in fund_name_list`
			`for i in range(len(copy_fund_name_list)):`
			`temp_splits = copy_fund_name_list[i].split()`
			`for temp in temp_splits:`
			`if remove_special_characters(temp).lower() == word:`
			`copy_fund_name_list[i] = re.sub(r'\s+', ' ',`
			`copy_fund_name_list[i].replace(temp, ' '))`

			`for i in range(len(copy_fund_name_list)):`
			`temp_splits = copy_fund_name_list[i].split()`
			`for temp in temp_splits:`
			`if remove_special_characters(temp).lower() in ['fund', 'portfolio', 'class', 'share', 'shares']:`
			`copy_fund_name_list[i] = \`
			`re.sub(r'\s+', ' ', copy_fund_name_list[i].replace(temp, ' '))`
			`final_splits = []`
			`for split in new_splits:`
			`if split.lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']:`
			`final_splits.append(split)`

			`text = ' '.join(final_splits)`
			`max_similarity = 0`
			`max_similarity_fund_name = None`
			`for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):`
			`copy_fund_name = remove_special_characters(copy_fund_name)`
			`copy_fund_name = split_words_without_space(copy_fund_name)`
			`similarity = get_jacard_similarity(text,`
			`copy_fund_name,`
			`need_remove_numeric_characters=False)`
			`if similarity > max_similarity:`
			`max_similarity = similarity`
			`max_similarity_fund_name = fund_name`
			`if max_similarity == 1:`
			`break`
			`if max_similarity < 0.35:`
			`return None, max_similarity`
			`return max_similarity_fund_name, max_similarity`
			`except Exception as e:`
			`print(e)`
			`return None, 0.0`


			`def remove_common_word(text_list: list):`
			`if text_list is None or len(text_list) == 0:`
			`return text_list`
			`new_text_list = []`
			`for text in text_list:`
			`text = text.lower()`
			`text = remove_special_characters(text)`
			`text_splits = text.split()`
			`while 'fund' in text_splits:`
			`text_splits.remove('fund')`
			`while 'portfolio' in text_splits:`
			`text_splits.remove('portfolio')`
			`while 'share' in text_splits:`
			`text_splits.remove('share')`
			`while 'class' in text_splits:`
			`text_splits.remove('class')`
			`text = ' '.join(text_splits)`
			`new_text_list.append(text)`
			`# remove common word in new_text_list, such as 'Blackrock Global Fund' and 'Blackrock Growth Fund', then 'Blackrock', 'Fund' are common words`
			`# the result is ['Global', 'Growth']`
			`common_word_list = []`
			`new_text_splits_list = [text.split() for text in new_text_list]`
			`for i in range(len(new_text_splits_list)):`
			`for j in range(i+1, len(new_text_splits_list)):`
			`if common_word_list is None or len(common_word_list) == 0:`
			`common_word_list = list(`
			`set(new_text_splits_list[i]).intersection(set(new_text_splits_list[j])))`
			`else:`
			`common_word_list = list(`
			`set(common_word_list).intersection(set(new_text_splits_list[j])))`
			`common_word_list = list(set(common_word_list))`
			`for i in range(len(new_text_splits_list)):`
			`for common_word in common_word_list:`
			`if common_word in new_text_splits_list[i]:`
			`new_text_splits_list[i].remove(common_word)`
			`new_text_list = [' '.join(text_splits)`
			`for text_splits in new_text_splits_list]`
			`return new_text_list, common_word_list`


			`def split_words_without_space(text: str):`
			`"""`
			`Split words without space, such as 'BlackrockGlobalFund' will be split to 'Blackrock', 'Global', 'Fund'`
			`"""`
			`if text is None or len(text.strip()) == 0:`
			`return []`
			`text = text.strip()`
			`# splits = text.split()`
			`# if len(splits) > 1:`
			`# return text`
			`# find all words with capital letter + lower letter`
			`regex = r'[A-Z][a-z]+'`
			`word_list = re.findall(regex, text)`
			`if len(word_list) > 0:`
			`for word in word_list:`
			`text = text.replace(word, ' ' + word + ' ')`
			`text = re.sub(r'(\s)+', ' ', text)`
			`return text.strip()`


			`def remove_special_characters(text):`
			`text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)`
			`text = re.sub(r'\s+', ' ', text)`
			`text = text.strip()`
			`return text`


			`def remove_numeric_characters(text):`
			`# remove numeric characters`
			`text = re.sub(r'\d+', ' ', text)`
			`text = re.sub(r'\s+', ' ', text)`
			`text = text.strip()`
			`return text`


			`def get_jacard_similarity(text_left,`
			`text_right,`
			`need_remove_special_characters=True,`
			`need_remove_numeric_characters=True):`
			`if need_remove_special_characters:`
			`text_left = remove_special_characters(text_left)`
			`text_right = remove_special_characters(text_right)`
			`if need_remove_numeric_characters:`
			`text_left = remove_numeric_characters(text_left)`
			`text_right = remove_numeric_characters(text_right)`
			`text_left = text_left.lower()`
			`text_right = text_right.lower()`
			`text_left = text_left.split()`
			`text_right = text_right.split()`
			`intersection = set(text_left).intersection(set(text_right))`
			`union = set(text_left).union(set(text_right))`
			`if len(union) > 0:`
			`return round(len(intersection) / len(union), 3)`
			`else:`
			`return 0`


			`def replace_abbrevation(text: str):`
			`if text is None or len(text.strip()) == 0:`
			`return text`
			`text = text.strip()`
			`text_splits = text.split()`
			`new_text_splits = []`
			`for split in text_splits:`
			`if split.lower() in ['acc']:`
			`new_text_splits.append('Accumulation')`
			`elif split.lower() in ['inc']:`
			`new_text_splits.append('Income')`
			`elif split.lower() in ['dist']:`
			`new_text_splits.append('Distribution')`
			`elif split.lower() in ['inv']:`
			`new_text_splits.append('Investor')`
			`elif split.lower() in ['inst', 'inst', 'institution']:`
			`new_text_splits.append('Institutional')`
			`elif split.lower() in ['adm']:`
			`new_text_splits.append('Admin')`
			`elif split.lower() in ['adv']:`
			`new_text_splits.append('Advantage')`
			`elif split.lower() in ['hdg', 'hgd', '(h)']:`
			`new_text_splits.append('Hedged')`
			`elif split.lower() in ['cl']:`
			`new_text_splits.append('Class')`
			`elif split.lower() in ['ser']:`
			`new_text_splits.append('Series')`
			`elif split.lower() in ['u.s.']:`
			`new_text_splits.append('US')`
			`elif split.lower() in ['nc']:`
			`new_text_splits.append('no trail')`
			`else:`
			`new_text_splits.append(split)`
			`new_text = ' '.join(new_text_splits)`
			`return new_text`