From d25bae936c522a79594b756abc9470f167bb2282 Mon Sep 17 00:00:00 2001 From: Blade He Date: Thu, 26 Sep 2024 12:18:37 -0500 Subject: [PATCH] Optimize investment mapping algorithm. --- main.py | 43 +++-- prepare_data.py | 20 +-- utils/biz_utils.py | 379 +++++++++++++++++++++++++++++---------------- 3 files changed, 281 insertions(+), 161 deletions(-) diff --git a/main.py b/main.py index 0323f90..74b5098 100644 --- a/main.py +++ b/main.py @@ -564,8 +564,8 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "292989214" - raw_name = "ENBD Saudi Arabia Equity Fund Class A USD Accumulation" + doc_id = "391456740" + raw_name = "Robeco Multi Asset Sustainable D EUR" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -575,7 +575,7 @@ def test_mapping_raw_name(): output_data_folder=output_folder, ) mapping_info = data_mapping.matching_with_database( - raw_name=raw_name, parent_id="FS0000B4A7", matching_type="share" + raw_name=raw_name, parent_id=None, matching_type="share" ) print(mapping_info) @@ -622,30 +622,41 @@ if __name__ == "__main__": # special_doc_id_list = ["505174428", "510326848", "349679479"] check_mapping_doc_id_list = [ - "458359181", - "486383912", - "529925114", + "327956364", "391456740", "391736837", + "458359181", + "486383912", "497497599", - "327956364", - "479793787", - "334718372", + "529925114", "321733631", - "507967525", - "478585901", - "366179419", - "509845549", - "323390570", + "334718372", "344636875", + "362246081", "445256897", + "449623976", + "458291624", + "478585901", + "492121213", + "502821436", + "507967525", + "481475385", "508854243", "520879048", + "402181770", "463081566", - "389171486" + "502693599", + "509845549", + "389171486", + "323390570", + "366179419", + "486378555", + "506559375", + "479793787", + "333207452" ] special_doc_id_list = check_mapping_doc_id_list - # special_doc_id_list = ["445256897"] + # special_doc_id_list = ["333207452"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/prepare_data.py b/prepare_data.py index 72d69b3..9453f2e 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -848,7 +848,7 @@ def compare_records_count_by_document_id(): # get the count of records by DocumentId document_records_count = data_from_document_df.groupby("DocumentId").size().reset_index(name="records_count") - data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx" + data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document_from_DocumentAcquisition.xlsx" sheet_name = "random_small_document_all_data" data_from_database_df = pd.read_excel(data_from_database, sheet_name=sheet_name) database_records_count = data_from_database_df.groupby("DocumentId").size().reset_index(name="records_count") @@ -870,7 +870,7 @@ def compare_records_count_by_document_id(): records_count_compare.reset_index(drop=True, inplace=True) records_count_compare_file = ( - r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database.xlsx" + r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database_from_DocumentAcquisition.xlsx" ) with pd.ExcelWriter(records_count_compare_file) as writer: records_count_compare.to_excel( @@ -886,7 +886,7 @@ def get_document_extracted_share_diff_by_db(): db_data = pd.read_excel(db_data_file, sheet_name="Sheet1") extract_data = pd.read_excel(extract_data_file, sheet_name="mapping_data") # only get data which investment_type is 1 - extract_data = extract_data[extract_data["investment_type"] == 1] + # extract_data = extract_data[extract_data["investment_type"] == 1] extract_data.reset_index(drop=True, inplace=True) unique_doc_id = extract_data["doc_id"].unique().tolist() @@ -1012,7 +1012,7 @@ if __name__ == "__main__": # sheet_name="latest_doc_ar_data", # output_folder=output_data_folder, # output_file="latest_doc_ar_mapping_statistics.xlsx") - # get_document_extracted_share_diff_by_db() + get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file, # output_folder=basic_info_folder, @@ -1021,11 +1021,11 @@ if __name__ == "__main__": # pickup_document_from_top_100_providers() # compare_records_count_by_document_id() - document_mapping_folder = r"/data/emea_ar/output/mapping/document/" - all_data_file = r"/data/emea_ar/output/mapping/all_document_mapping.xlsx" - concat_mapping(document_mapping_folder, all_data_file) + # document_mapping_folder = r"/data/emea_ar/output/mapping/document/" + # all_data_file = r"/data/emea_ar/output/mapping/all_document_mapping.xlsx" + # concat_mapping(document_mapping_folder, all_data_file) - provider_mapping_folder = r"/data/emea_ar/output/mapping/provider/" - all_data_file = r"/data/emea_ar/output/mapping/all_provider_mapping.xlsx" - concat_mapping(provider_mapping_folder, all_data_file) + # provider_mapping_folder = r"/data/emea_ar/output/mapping/provider/" + # all_data_file = r"/data/emea_ar/output/mapping/all_provider_mapping.xlsx" + # concat_mapping(provider_mapping_folder, all_data_file) diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 1ca1697..31bf322 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -2,6 +2,55 @@ import re from copy import deepcopy from traceback import print_exc + +total_currency_list = [ + "USD", + "EUR", + "AUD", + "JPY", + "CHF", + "GBP", + "SEK", + "CNY", + "NZD", + "CNH", + "NOK", + "SGD", + "HKD", + "ZAR", + "PLN", + "CAD", + "CZK", + "HUF", + "DKK", + "BRL", + "SKK", + "RON", + "TRY", + "BGN", + "CUP", + "MXN", + "TOP", + "ILS", + "CLF", + "XCD", + "ISK", + "IDR", + "MNT", + "AED", + "AFN", + "INR", + "ESP", + "RUB", + "CLP", + "KRW", + "ETB", + "DZD", + "XEU", + "XFO", +] + + def add_slash_to_text_as_regex(text: str): if text is None or len(text) == 0: return text @@ -19,35 +68,49 @@ def add_slash_to_text_as_regex(text: str): def clean_text(text: str) -> str: # text = text.lower() # update the specical character which begin with \u, e.g \u2004 or \u00a0 to be space - text = re.sub(r"\\u[A-Z0-9a-z]{4}", ' ', text) - text = re.sub(r"( ){2,}", ' ', text.strip()) + text = re.sub(r"\\u[A-Z0-9a-z]{4}", " ", text) + text = re.sub(r"( ){2,}", " ", text.strip()) return text -def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list = None) -> str: +def get_most_similar_name( + text: str, name_list: list, pre_common_word_list: list = None +) -> str: """ Get the most similar fund name from fund_name_list by jacard similarity """ try: copy_fund_name_list = deepcopy(name_list) - if text is None or len(text.split()) == 0 or \ - copy_fund_name_list is None or len(copy_fund_name_list) == 0: + if ( + text is None + or len(text.split()) == 0 + or copy_fund_name_list is None + or len(copy_fund_name_list) == 0 + ): return None, None - copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name - in copy_fund_name_list] + copy_fund_name_list = [ + replace_abbrevation(copy_fund_name) + for copy_fund_name in copy_fund_name_list + ] + + copy_fund_name_list = [ + replace_abbrevation(remove_special_characters(copy_fund_name)) + for copy_fund_name in copy_fund_name_list + ] # get common words in fund_name_list common_word_list = [] if len(name_list) > 1: _, common_word_list = remove_common_word(copy_fund_name_list) if pre_common_word_list is not None and len(pre_common_word_list) > 0: - common_word_list.extend([word for word in pre_common_word_list - if word not in common_word_list]) + common_word_list.extend( + [word for word in pre_common_word_list if word not in common_word_list] + ) text = text.strip() - text = remove_special_characters(text) text = replace_abbrevation(text) + text = replace_abbrevation(remove_special_characters(text)) text_splits = text.split() if len(text_splits) == 1: text = split_words_without_space(text) @@ -65,29 +128,46 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list # remove word in fund_name_list for i in range(len(copy_fund_name_list)): temp_splits = copy_fund_name_list[i].split() - copy_fund_name_list[i] = ' '.join([split for split in temp_splits - if remove_special_characters(split).lower() != word]) + copy_fund_name_list[i] = " ".join( + [ + split + for split in temp_splits + if remove_special_characters(split).lower() != word + ] + ) for i in range(len(copy_fund_name_list)): temp_splits = copy_fund_name_list[i].split() - copy_fund_name_list[i] = ' '.join([split for split in temp_splits - if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']]) + copy_fund_name_list[i] = " ".join( + [ + split + for split in temp_splits + if remove_special_characters(split).lower() + not in ["fund", "portfolio", "class", "share", "shares"] + ] + ) final_splits = [] for split in new_splits: - if split.lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']: + if split.lower() not in [ + "fund", + "portfolio", + "class", + "share", + "shares", + ]: final_splits.append(split) - text = ' '.join(final_splits) + text = " ".join(final_splits) max_similarity = 0 max_similarity_fund_name = None text = remove_special_characters(text) text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list) - for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list): + for fund_name, copy_fund_name in zip(name_list, copy_fund_name_list): copy_fund_name = remove_special_characters(copy_fund_name) copy_fund_name = split_words_without_space(copy_fund_name) - similarity = get_jacard_similarity(text, - copy_fund_name, - need_remove_numeric_characters=False) + similarity = get_jacard_similarity( + text, copy_fund_name, need_remove_numeric_characters=False + ) if similarity > max_similarity: max_similarity = similarity max_similarity_fund_name = fund_name @@ -105,17 +185,11 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list def update_for_currency(text: str, compare_list: list): text_split = text.split() with_currency = False - total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY', - 'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD', - 'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN', - 'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR', - 'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW', - 'ETB', 'DZD', 'XEU', 'XFO'] for split in text_split: if split.upper() in total_currency_list: with_currency = True break - + with_currency_list = [] without_currency_list = [] for index, compare in enumerate(compare_list): @@ -138,7 +212,7 @@ def update_for_currency(text: str, compare_list: list): if len(without_currency_list) > 0: for index in without_currency_list: if last_split in compare_list[index].split(): - text = text + ' ' + 'USD' + text = text + " " + "USD" updated = True break if not updated: @@ -146,23 +220,26 @@ def update_for_currency(text: str, compare_list: list): for index in with_currency_list: compare_split = compare_list[index].split() if last_split in compare_split: - current_currency_list = [split for split in compare_split - if split.upper() in total_currency_list] + current_currency_list = [ + split + for split in compare_split + if split.upper() in total_currency_list + ] if len(current_currency_list) > 0: currency_list.append(current_currency_list[-1]) if len(currency_list) == 1: - text = text + ' ' + currency_list[0] - updated = True - + text = text + " " + currency_list[0] + updated = True + for index in without_currency_list: - compare_list[index] = compare_list[index] + ' ' + 'USD' - + compare_list[index] = compare_list[index] + " " + "USD" + if not updated: - text = text + ' ' + 'USD' + text = text + " " + "USD" return text, compare_list elif with_currency and len(without_currency_list) == 0: for index in without_currency_list: - compare_list[index] = compare_list[index] + ' ' + 'USD' + compare_list[index] = compare_list[index] + " " + "USD" return text, compare_list else: return text, compare_list @@ -176,35 +253,60 @@ def remove_common_word(text_list: list): text = text.lower() text = remove_special_characters(text) text_splits = text.split() - while 'fund' in text_splits: - text_splits.remove('fund') - while 'portfolio' in text_splits: - text_splits.remove('portfolio') - while 'share' in text_splits: - text_splits.remove('share') - while 'class' in text_splits: - text_splits.remove('class') - text = ' '.join(text_splits) + while "fund" in text_splits: + text_splits.remove("fund") + while "portfolio" in text_splits: + text_splits.remove("portfolio") + while "share" in text_splits: + text_splits.remove("share") + while "class" in text_splits: + text_splits.remove("class") + text = " ".join(text_splits) new_text_list.append(text) # remove common word in new_text_list, such as 'Blackrock Global Fund' and 'Blackrock Growth Fund', then 'Blackrock', 'Fund' are common words # the result is ['Global', 'Growth'] common_word_list = [] new_text_splits_list = [text.split() for text in new_text_list] for i in range(len(new_text_splits_list)): - for j in range(i+1, len(new_text_splits_list)): + for j in range(i + 1, len(new_text_splits_list)): if common_word_list is None or len(common_word_list) == 0: common_word_list = list( - set(new_text_splits_list[i]).intersection(set(new_text_splits_list[j]))) + set(new_text_splits_list[i]).intersection( + set(new_text_splits_list[j]) + ) + ) else: common_word_list = list( - set(common_word_list).intersection(set(new_text_splits_list[j]))) + set(common_word_list).intersection(set(new_text_splits_list[j])) + ) common_word_list = list(set(common_word_list)) + + remove_list = [] + # if exists the share name and currency name, remove from the list + for word in common_word_list: + if word.upper() in total_currency_list: + remove_list.append(word) + for text in new_text_list: + text_splits = text.split() + if len(text_splits) < 4: + continue + # get last 3 words from text_splits + last_three_words = text_splits[-3:] + for word in common_word_list: + if word not in remove_list and \ + word.upper() == word and \ + word in last_three_words: + remove_list.append(word) + for remove in remove_list: + if remove in common_word_list: + common_word_list.remove(remove) + for i in range(len(new_text_splits_list)): for common_word in common_word_list: if common_word in new_text_splits_list[i]: new_text_splits_list[i].remove(common_word) - new_text_list = [' '.join(text_splits) - for text_splits in new_text_splits_list] + new_text_list = [" ".join(text_splits) for text_splits in new_text_splits_list] + return new_text_list, common_word_list @@ -219,21 +321,22 @@ def split_words_without_space(text: str): # if len(splits) > 1: # return text # find all words with capital letter + lower letter - regex = r'[A-Z][a-z]+' + regex = r"[A-Z][a-z]+" word_list = re.findall(regex, text) if len(word_list) > 0: for word in word_list: - text = text.replace(word, ' ' + word + ' ') - text = re.sub(r'(\s)+', ' ', text) + text = text.replace(word, " " + word + " ") + text = re.sub(r"(\s)+", " ", text) return text.strip() def remove_special_characters(text): - text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text) - text = re.sub(r'\s+', ' ', text) + text = re.sub(r"[^a-zA-Z0-9\s]", " ", text) + text = re.sub(r"\s+", " ", text) text = text.strip() return text + def get_unique_words_text(text): text = remove_special_characters(text) text = text.lower() @@ -241,22 +344,24 @@ def get_unique_words_text(text): text_split = list(set(text_split)) # sort the list text_split.sort() - return_text = ' '.join(text_split) + return_text = " ".join(text_split) return return_text def remove_numeric_characters(text): # remove numeric characters - text = re.sub(r'\d+', ' ', text) - text = re.sub(r'\s+', ' ', text) + text = re.sub(r"\d+", " ", text) + text = re.sub(r"\s+", " ", text) text = text.strip() return text -def get_jacard_similarity(text_left, - text_right, - need_remove_special_characters=True, - need_remove_numeric_characters=True): +def get_jacard_similarity( + text_left, + text_right, + need_remove_special_characters=True, + need_remove_numeric_characters=True, +): if need_remove_special_characters: text_left = remove_special_characters(text_left) text_right = remove_special_characters(text_right) @@ -274,6 +379,7 @@ def get_jacard_similarity(text_left, else: return 0 + def get_beginning_common_words(text_list: list): """ Get the beginning common words in text_list @@ -297,87 +403,90 @@ def get_beginning_common_words(text_list: list): common_words_list.append(word) else: break - - return ' '.join(common_words_list).strip() + + return " ".join(common_words_list).strip() + def replace_abbrevation(text: str): if text is None or len(text.strip()) == 0: return text text = text.strip() - if 'swiss franc' in text.lower(): - text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE) - elif 'us dollar' in text.lower(): - text = re.sub(r'us\s+dollar', 'USD', text, flags=re.IGNORECASE) - elif 'singapore dollar' in text.lower(): - text = re.sub(r'singapore\s+dollar', 'SGD', text, flags=re.IGNORECASE) - elif 'hong kong dollar' in text.lower(): - text = re.sub(r'hong\s+kong\s+dollar', 'HKD', text, flags=re.IGNORECASE) - elif 'hongkong dollar' in text.lower(): - text = re.sub(r'hongkong\s+dollar', 'HKD', text, flags=re.IGNORECASE) - elif 'australian dollar' in text.lower(): - text = re.sub(r'australian\s+dollar', 'AUD', text, flags=re.IGNORECASE) - elif 'japanese yen' in text.lower(): - text = re.sub(r'japanese\s+yen', 'JPY', text, flags=re.IGNORECASE) - elif 'south african rand' in text.lower(): - text = re.sub(r'South\s+African\s+rand', 'ZAR', text, flags=re.IGNORECASE) - elif 'canadian dollar' in text.lower(): - text = re.sub(r'canadian\s+dollar', 'CAD', text, flags=re.IGNORECASE) - elif 'new zealand dollar' in text.lower(): - text = re.sub(r'new\s+zealand\s+dollar', 'NZD', text, flags=re.IGNORECASE) - elif 'norwegian krone' in text.lower(): - text = re.sub(r'norwegian\s+krone', 'NOK', text, flags=re.IGNORECASE) - elif 'danish krone' in text.lower(): - text = re.sub(r'danish\s+krone', 'DKK', text, flags=re.IGNORECASE) - elif 'swedish krona' in text.lower(): - text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE) - elif 'swedish kronor' in text.lower(): - text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE) - elif 'sterling' in text.lower().split(): - text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE) - elif 'euro' in text.lower().split(): - text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE) - elif '€' in text.lower().split(): - text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE) - elif '$' in text.lower().split(): - text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE) - elif '£' in text.lower().split(): - text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE) - elif 'RMB' in text.lower().split(): - text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE) + if "swiss franc" in text.lower(): + text = re.sub(r"swiss\s+franc", "CHF", text, flags=re.IGNORECASE) + elif "us dollar" in text.lower(): + text = re.sub(r"us\s+dollar", "USD", text, flags=re.IGNORECASE) + elif "singapore dollar" in text.lower(): + text = re.sub(r"singapore\s+dollar", "SGD", text, flags=re.IGNORECASE) + elif "hong kong dollar" in text.lower(): + text = re.sub(r"hong\s+kong\s+dollar", "HKD", text, flags=re.IGNORECASE) + elif "hongkong dollar" in text.lower(): + text = re.sub(r"hongkong\s+dollar", "HKD", text, flags=re.IGNORECASE) + elif "australian dollar" in text.lower(): + text = re.sub(r"australian\s+dollar", "AUD", text, flags=re.IGNORECASE) + elif "japanese yen" in text.lower(): + text = re.sub(r"japanese\s+yen", "JPY", text, flags=re.IGNORECASE) + elif "south african rand" in text.lower(): + text = re.sub(r"South\s+African\s+rand", "ZAR", text, flags=re.IGNORECASE) + elif "canadian dollar" in text.lower(): + text = re.sub(r"canadian\s+dollar", "CAD", text, flags=re.IGNORECASE) + elif "new zealand dollar" in text.lower(): + text = re.sub(r"new\s+zealand\s+dollar", "NZD", text, flags=re.IGNORECASE) + elif "norwegian krone" in text.lower(): + text = re.sub(r"norwegian\s+krone", "NOK", text, flags=re.IGNORECASE) + elif "danish krone" in text.lower(): + text = re.sub(r"danish\s+krone", "DKK", text, flags=re.IGNORECASE) + elif "swedish krona" in text.lower(): + text = re.sub(r"swedish\s+krona", "SEK", text, flags=re.IGNORECASE) + elif "swedish kronor" in text.lower(): + text = re.sub(r"swedish\s+kronor", "SEK", text, flags=re.IGNORECASE) + elif "GPB" in text.split(): + text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE) + elif "sterling" in text.lower().split(): + text = re.sub(r"sterling", "GBP", text, flags=re.IGNORECASE) + elif "euro" in text.lower().split(): + text = re.sub(r"euro", "EUR", text, flags=re.IGNORECASE) + elif "€" in text.lower().split(): + text = re.sub(r"\€", "EUR", text, flags=re.IGNORECASE) + elif "$" in text.lower().split(): + text = re.sub(r"\$", "USD", text, flags=re.IGNORECASE) + elif "£" in text.lower().split(): + text = re.sub(r"\£", "GBP", text, flags=re.IGNORECASE) + elif "RMB" in text.split(): + text = re.sub(r"RMB", "CNY", text, flags=re.IGNORECASE) else: pass - + text_splits = text.split() new_text_splits = [] for split in text_splits: - if split.lower() in ['acc', 'acc.']: - new_text_splits.append('Accumulation') - elif split.lower() in ['inc', 'inc.']: - new_text_splits.append('Income') - elif split.lower() in ['dist', 'dist.']: - new_text_splits.append('Distribution') - elif split.lower() in ['inv', 'inv.']: - new_text_splits.append('Investor') - elif split.lower() in ['inst', 'inst.', 'institution']: - new_text_splits.append('Institutional') - elif split.lower() in ['cap', 'cap.']: - new_text_splits.append('Capitalisation') - elif split.lower() in ['adm', 'adm.']: - new_text_splits.append('Admin') - elif split.lower() in ['adv', 'adv.']: - new_text_splits.append('Advantage') - elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']: - new_text_splits.append('Hedged') - elif split.lower() in ['cl', 'cl.']: - new_text_splits.append('Class') - elif split.lower() in ['ser', 'ser.']: - new_text_splits.append('Series') - elif split.lower() in ['u.s.']: - new_text_splits.append('US') - elif split.lower() in ['nc', 'nc.']: - new_text_splits.append('no trail') + if split.lower() in ["acc", "acc."]: + new_text_splits.append("Accumulation") + elif split.lower() in ["inc", "inc."]: + new_text_splits.append("Income") + elif split.lower() in ["dist", "dist."]: + new_text_splits.append("Distribution") + elif split.lower() in ["inv", "inv."]: + new_text_splits.append("Investor") + elif split.lower() in ["inst", "inst.", "institution"]: + new_text_splits.append("Institutional") + elif split.lower() in ["cap", "cap."]: + new_text_splits.append("Capitalisation") + elif split.lower() in ["adm", "adm."]: + new_text_splits.append("Admin") + elif split.lower() in ["adv", "adv."]: + new_text_splits.append("Advantage") + elif split.lower() in ["hdg", "hgd", "hdg.", "hgd.", "(h)"]: + new_text_splits.append("Hedged") + elif split.lower() in ["cl", "cl."]: + new_text_splits.append("Class") + elif split.lower() in ["ser", "ser."]: + new_text_splits.append("Series") + elif split.lower() in ["u.s."]: + new_text_splits.append("US") + elif split.lower() in ["nc", "nc."]: + new_text_splits.append("no trail") else: new_text_splits.append(split) - - new_text = ' '.join(new_text_splits) - return new_text \ No newline at end of file + + new_text = " ".join(new_text_splits) + return new_text