Optimize investment mapping algorithm.

This commit is contained in:
Blade He 2024-09-26 12:18:37 -05:00
parent 598e2ab820
commit d25bae936c
3 changed files with 281 additions and 161 deletions

43
main.py
View File

@ -564,8 +564,8 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "292989214" doc_id = "391456740"
raw_name = "ENBD Saudi Arabia Equity Fund Class A USD Accumulation" raw_name = "Robeco Multi Asset Sustainable D EUR"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -575,7 +575,7 @@ def test_mapping_raw_name():
output_data_folder=output_folder, output_data_folder=output_folder,
) )
mapping_info = data_mapping.matching_with_database( mapping_info = data_mapping.matching_with_database(
raw_name=raw_name, parent_id="FS0000B4A7", matching_type="share" raw_name=raw_name, parent_id=None, matching_type="share"
) )
print(mapping_info) print(mapping_info)
@ -622,30 +622,41 @@ if __name__ == "__main__":
# special_doc_id_list = ["505174428", "510326848", "349679479"] # special_doc_id_list = ["505174428", "510326848", "349679479"]
check_mapping_doc_id_list = [ check_mapping_doc_id_list = [
"458359181", "327956364",
"486383912",
"529925114",
"391456740", "391456740",
"391736837", "391736837",
"458359181",
"486383912",
"497497599", "497497599",
"327956364", "529925114",
"479793787",
"334718372",
"321733631", "321733631",
"507967525", "334718372",
"478585901",
"366179419",
"509845549",
"323390570",
"344636875", "344636875",
"362246081",
"445256897", "445256897",
"449623976",
"458291624",
"478585901",
"492121213",
"502821436",
"507967525",
"481475385",
"508854243", "508854243",
"520879048", "520879048",
"402181770",
"463081566", "463081566",
"389171486" "502693599",
"509845549",
"389171486",
"323390570",
"366179419",
"486378555",
"506559375",
"479793787",
"333207452"
] ]
special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_mapping_doc_id_list
# special_doc_id_list = ["445256897"] # special_doc_id_list = ["333207452"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False

View File

@ -848,7 +848,7 @@ def compare_records_count_by_document_id():
# get the count of records by DocumentId # get the count of records by DocumentId
document_records_count = data_from_document_df.groupby("DocumentId").size().reset_index(name="records_count") document_records_count = data_from_document_df.groupby("DocumentId").size().reset_index(name="records_count")
data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx" data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document_from_DocumentAcquisition.xlsx"
sheet_name = "random_small_document_all_data" sheet_name = "random_small_document_all_data"
data_from_database_df = pd.read_excel(data_from_database, sheet_name=sheet_name) data_from_database_df = pd.read_excel(data_from_database, sheet_name=sheet_name)
database_records_count = data_from_database_df.groupby("DocumentId").size().reset_index(name="records_count") database_records_count = data_from_database_df.groupby("DocumentId").size().reset_index(name="records_count")
@ -870,7 +870,7 @@ def compare_records_count_by_document_id():
records_count_compare.reset_index(drop=True, inplace=True) records_count_compare.reset_index(drop=True, inplace=True)
records_count_compare_file = ( records_count_compare_file = (
r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database.xlsx" r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database_from_DocumentAcquisition.xlsx"
) )
with pd.ExcelWriter(records_count_compare_file) as writer: with pd.ExcelWriter(records_count_compare_file) as writer:
records_count_compare.to_excel( records_count_compare.to_excel(
@ -886,7 +886,7 @@ def get_document_extracted_share_diff_by_db():
db_data = pd.read_excel(db_data_file, sheet_name="Sheet1") db_data = pd.read_excel(db_data_file, sheet_name="Sheet1")
extract_data = pd.read_excel(extract_data_file, sheet_name="mapping_data") extract_data = pd.read_excel(extract_data_file, sheet_name="mapping_data")
# only get data which investment_type is 1 # only get data which investment_type is 1
extract_data = extract_data[extract_data["investment_type"] == 1] # extract_data = extract_data[extract_data["investment_type"] == 1]
extract_data.reset_index(drop=True, inplace=True) extract_data.reset_index(drop=True, inplace=True)
unique_doc_id = extract_data["doc_id"].unique().tolist() unique_doc_id = extract_data["doc_id"].unique().tolist()
@ -1012,7 +1012,7 @@ if __name__ == "__main__":
# sheet_name="latest_doc_ar_data", # sheet_name="latest_doc_ar_data",
# output_folder=output_data_folder, # output_folder=output_data_folder,
# output_file="latest_doc_ar_mapping_statistics.xlsx") # output_file="latest_doc_ar_mapping_statistics.xlsx")
# get_document_extracted_share_diff_by_db() get_document_extracted_share_diff_by_db()
# statistics_provider_mapping( # statistics_provider_mapping(
# provider_mapping_data_file=provider_mapping_data_file, # provider_mapping_data_file=provider_mapping_data_file,
# output_folder=basic_info_folder, # output_folder=basic_info_folder,
@ -1021,11 +1021,11 @@ if __name__ == "__main__":
# pickup_document_from_top_100_providers() # pickup_document_from_top_100_providers()
# compare_records_count_by_document_id() # compare_records_count_by_document_id()
document_mapping_folder = r"/data/emea_ar/output/mapping/document/" # document_mapping_folder = r"/data/emea_ar/output/mapping/document/"
all_data_file = r"/data/emea_ar/output/mapping/all_document_mapping.xlsx" # all_data_file = r"/data/emea_ar/output/mapping/all_document_mapping.xlsx"
concat_mapping(document_mapping_folder, all_data_file) # concat_mapping(document_mapping_folder, all_data_file)
provider_mapping_folder = r"/data/emea_ar/output/mapping/provider/" # provider_mapping_folder = r"/data/emea_ar/output/mapping/provider/"
all_data_file = r"/data/emea_ar/output/mapping/all_provider_mapping.xlsx" # all_data_file = r"/data/emea_ar/output/mapping/all_provider_mapping.xlsx"
concat_mapping(provider_mapping_folder, all_data_file) # concat_mapping(provider_mapping_folder, all_data_file)

View File

@ -2,6 +2,55 @@ import re
from copy import deepcopy from copy import deepcopy
from traceback import print_exc from traceback import print_exc
total_currency_list = [
"USD",
"EUR",
"AUD",
"JPY",
"CHF",
"GBP",
"SEK",
"CNY",
"NZD",
"CNH",
"NOK",
"SGD",
"HKD",
"ZAR",
"PLN",
"CAD",
"CZK",
"HUF",
"DKK",
"BRL",
"SKK",
"RON",
"TRY",
"BGN",
"CUP",
"MXN",
"TOP",
"ILS",
"CLF",
"XCD",
"ISK",
"IDR",
"MNT",
"AED",
"AFN",
"INR",
"ESP",
"RUB",
"CLP",
"KRW",
"ETB",
"DZD",
"XEU",
"XFO",
]
def add_slash_to_text_as_regex(text: str): def add_slash_to_text_as_regex(text: str):
if text is None or len(text) == 0: if text is None or len(text) == 0:
return text return text
@ -19,35 +68,49 @@ def add_slash_to_text_as_regex(text: str):
def clean_text(text: str) -> str: def clean_text(text: str) -> str:
# text = text.lower() # text = text.lower()
# replace escaped special characters that begin with \u, e.g. \u2004 or \u00a0, with a space # replace escaped special characters that begin with \u, e.g. \u2004 or \u00a0, with a space
text = re.sub(r"\\u[A-Z0-9a-z]{4}", ' ', text) text = re.sub(r"\\u[A-Z0-9a-z]{4}", " ", text)
text = re.sub(r"( ){2,}", ' ', text.strip()) text = re.sub(r"( ){2,}", " ", text.strip())
return text return text
def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list = None) -> str: def get_most_similar_name(
text: str, name_list: list, pre_common_word_list: list = None
) -> str:
""" """
Get the most similar fund name from name_list by Jaccard similarity Get the most similar fund name from name_list by Jaccard similarity
""" """
try: try:
copy_fund_name_list = deepcopy(name_list) copy_fund_name_list = deepcopy(name_list)
if text is None or len(text.split()) == 0 or \ if (
copy_fund_name_list is None or len(copy_fund_name_list) == 0: text is None
or len(text.split()) == 0
or copy_fund_name_list is None
or len(copy_fund_name_list) == 0
):
return None, None return None, None
copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name copy_fund_name_list = [
in copy_fund_name_list] replace_abbrevation(copy_fund_name)
for copy_fund_name in copy_fund_name_list
]
copy_fund_name_list = [
replace_abbrevation(remove_special_characters(copy_fund_name))
for copy_fund_name in copy_fund_name_list
]
# get common words in fund_name_list # get common words in fund_name_list
common_word_list = [] common_word_list = []
if len(name_list) > 1: if len(name_list) > 1:
_, common_word_list = remove_common_word(copy_fund_name_list) _, common_word_list = remove_common_word(copy_fund_name_list)
if pre_common_word_list is not None and len(pre_common_word_list) > 0: if pre_common_word_list is not None and len(pre_common_word_list) > 0:
common_word_list.extend([word for word in pre_common_word_list common_word_list.extend(
if word not in common_word_list]) [word for word in pre_common_word_list if word not in common_word_list]
)
text = text.strip() text = text.strip()
text = remove_special_characters(text)
text = replace_abbrevation(text) text = replace_abbrevation(text)
text = replace_abbrevation(remove_special_characters(text))
text_splits = text.split() text_splits = text.split()
if len(text_splits) == 1: if len(text_splits) == 1:
text = split_words_without_space(text) text = split_words_without_space(text)
@ -65,29 +128,46 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
# remove word in fund_name_list # remove word in fund_name_list
for i in range(len(copy_fund_name_list)): for i in range(len(copy_fund_name_list)):
temp_splits = copy_fund_name_list[i].split() temp_splits = copy_fund_name_list[i].split()
copy_fund_name_list[i] = ' '.join([split for split in temp_splits copy_fund_name_list[i] = " ".join(
if remove_special_characters(split).lower() != word]) [
split
for split in temp_splits
if remove_special_characters(split).lower() != word
]
)
for i in range(len(copy_fund_name_list)): for i in range(len(copy_fund_name_list)):
temp_splits = copy_fund_name_list[i].split() temp_splits = copy_fund_name_list[i].split()
copy_fund_name_list[i] = ' '.join([split for split in temp_splits copy_fund_name_list[i] = " ".join(
if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']]) [
split
for split in temp_splits
if remove_special_characters(split).lower()
not in ["fund", "portfolio", "class", "share", "shares"]
]
)
final_splits = [] final_splits = []
for split in new_splits: for split in new_splits:
if split.lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']: if split.lower() not in [
"fund",
"portfolio",
"class",
"share",
"shares",
]:
final_splits.append(split) final_splits.append(split)
text = ' '.join(final_splits) text = " ".join(final_splits)
max_similarity = 0 max_similarity = 0
max_similarity_fund_name = None max_similarity_fund_name = None
text = remove_special_characters(text) text = remove_special_characters(text)
text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list) text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list): for fund_name, copy_fund_name in zip(name_list, copy_fund_name_list):
copy_fund_name = remove_special_characters(copy_fund_name) copy_fund_name = remove_special_characters(copy_fund_name)
copy_fund_name = split_words_without_space(copy_fund_name) copy_fund_name = split_words_without_space(copy_fund_name)
similarity = get_jacard_similarity(text, similarity = get_jacard_similarity(
copy_fund_name, text, copy_fund_name, need_remove_numeric_characters=False
need_remove_numeric_characters=False) )
if similarity > max_similarity: if similarity > max_similarity:
max_similarity = similarity max_similarity = similarity
max_similarity_fund_name = fund_name max_similarity_fund_name = fund_name
@ -105,12 +185,6 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
def update_for_currency(text: str, compare_list: list): def update_for_currency(text: str, compare_list: list):
text_split = text.split() text_split = text.split()
with_currency = False with_currency = False
total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
'ETB', 'DZD', 'XEU', 'XFO']
for split in text_split: for split in text_split:
if split.upper() in total_currency_list: if split.upper() in total_currency_list:
with_currency = True with_currency = True
@ -138,7 +212,7 @@ def update_for_currency(text: str, compare_list: list):
if len(without_currency_list) > 0: if len(without_currency_list) > 0:
for index in without_currency_list: for index in without_currency_list:
if last_split in compare_list[index].split(): if last_split in compare_list[index].split():
text = text + ' ' + 'USD' text = text + " " + "USD"
updated = True updated = True
break break
if not updated: if not updated:
@ -146,23 +220,26 @@ def update_for_currency(text: str, compare_list: list):
for index in with_currency_list: for index in with_currency_list:
compare_split = compare_list[index].split() compare_split = compare_list[index].split()
if last_split in compare_split: if last_split in compare_split:
current_currency_list = [split for split in compare_split current_currency_list = [
if split.upper() in total_currency_list] split
for split in compare_split
if split.upper() in total_currency_list
]
if len(current_currency_list) > 0: if len(current_currency_list) > 0:
currency_list.append(current_currency_list[-1]) currency_list.append(current_currency_list[-1])
if len(currency_list) == 1: if len(currency_list) == 1:
text = text + ' ' + currency_list[0] text = text + " " + currency_list[0]
updated = True updated = True
for index in without_currency_list: for index in without_currency_list:
compare_list[index] = compare_list[index] + ' ' + 'USD' compare_list[index] = compare_list[index] + " " + "USD"
if not updated: if not updated:
text = text + ' ' + 'USD' text = text + " " + "USD"
return text, compare_list return text, compare_list
elif with_currency and len(without_currency_list) == 0: elif with_currency and len(without_currency_list) == 0:
for index in without_currency_list: for index in without_currency_list:
compare_list[index] = compare_list[index] + ' ' + 'USD' compare_list[index] = compare_list[index] + " " + "USD"
return text, compare_list return text, compare_list
else: else:
return text, compare_list return text, compare_list
@ -176,35 +253,60 @@ def remove_common_word(text_list: list):
text = text.lower() text = text.lower()
text = remove_special_characters(text) text = remove_special_characters(text)
text_splits = text.split() text_splits = text.split()
while 'fund' in text_splits: while "fund" in text_splits:
text_splits.remove('fund') text_splits.remove("fund")
while 'portfolio' in text_splits: while "portfolio" in text_splits:
text_splits.remove('portfolio') text_splits.remove("portfolio")
while 'share' in text_splits: while "share" in text_splits:
text_splits.remove('share') text_splits.remove("share")
while 'class' in text_splits: while "class" in text_splits:
text_splits.remove('class') text_splits.remove("class")
text = ' '.join(text_splits) text = " ".join(text_splits)
new_text_list.append(text) new_text_list.append(text)
# remove the words common to the names in new_text_list, e.g. for 'Blackrock Global Fund' and 'Blackrock Growth Fund' the common words are 'Blackrock' and 'Fund', # remove the words common to the names in new_text_list, e.g. for 'Blackrock Global Fund' and 'Blackrock Growth Fund' the common words are 'Blackrock' and 'Fund',
# so the result is ['Global', 'Growth'] # so the result is ['Global', 'Growth']
common_word_list = [] common_word_list = []
new_text_splits_list = [text.split() for text in new_text_list] new_text_splits_list = [text.split() for text in new_text_list]
for i in range(len(new_text_splits_list)): for i in range(len(new_text_splits_list)):
for j in range(i+1, len(new_text_splits_list)): for j in range(i + 1, len(new_text_splits_list)):
if common_word_list is None or len(common_word_list) == 0: if common_word_list is None or len(common_word_list) == 0:
common_word_list = list( common_word_list = list(
set(new_text_splits_list[i]).intersection(set(new_text_splits_list[j]))) set(new_text_splits_list[i]).intersection(
set(new_text_splits_list[j])
)
)
else: else:
common_word_list = list( common_word_list = list(
set(common_word_list).intersection(set(new_text_splits_list[j]))) set(common_word_list).intersection(set(new_text_splits_list[j]))
)
common_word_list = list(set(common_word_list)) common_word_list = list(set(common_word_list))
remove_list = []
# if a common word is a currency code, or an upper-case word among the last three words of a name (likely a share class name), remove it from the common word list
for word in common_word_list:
if word.upper() in total_currency_list:
remove_list.append(word)
for text in new_text_list:
text_splits = text.split()
if len(text_splits) < 4:
continue
# get last 3 words from text_splits
last_three_words = text_splits[-3:]
for word in common_word_list:
if word not in remove_list and \
word.upper() == word and \
word in last_three_words:
remove_list.append(word)
for remove in remove_list:
if remove in common_word_list:
common_word_list.remove(remove)
for i in range(len(new_text_splits_list)): for i in range(len(new_text_splits_list)):
for common_word in common_word_list: for common_word in common_word_list:
if common_word in new_text_splits_list[i]: if common_word in new_text_splits_list[i]:
new_text_splits_list[i].remove(common_word) new_text_splits_list[i].remove(common_word)
new_text_list = [' '.join(text_splits) new_text_list = [" ".join(text_splits) for text_splits in new_text_splits_list]
for text_splits in new_text_splits_list]
return new_text_list, common_word_list return new_text_list, common_word_list
@ -219,21 +321,22 @@ def split_words_without_space(text: str):
# if len(splits) > 1: # if len(splits) > 1:
# return text # return text
# find all words with capital letter + lower letter # find all words with capital letter + lower letter
regex = r'[A-Z][a-z]+' regex = r"[A-Z][a-z]+"
word_list = re.findall(regex, text) word_list = re.findall(regex, text)
if len(word_list) > 0: if len(word_list) > 0:
for word in word_list: for word in word_list:
text = text.replace(word, ' ' + word + ' ') text = text.replace(word, " " + word + " ")
text = re.sub(r'(\s)+', ' ', text) text = re.sub(r"(\s)+", " ", text)
return text.strip() return text.strip()
def remove_special_characters(text): def remove_special_characters(text):
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text) text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
text = re.sub(r'\s+', ' ', text) text = re.sub(r"\s+", " ", text)
text = text.strip() text = text.strip()
return text return text
def get_unique_words_text(text): def get_unique_words_text(text):
text = remove_special_characters(text) text = remove_special_characters(text)
text = text.lower() text = text.lower()
@ -241,22 +344,24 @@ def get_unique_words_text(text):
text_split = list(set(text_split)) text_split = list(set(text_split))
# sort the list # sort the list
text_split.sort() text_split.sort()
return_text = ' '.join(text_split) return_text = " ".join(text_split)
return return_text return return_text
def remove_numeric_characters(text): def remove_numeric_characters(text):
# remove numeric characters # remove numeric characters
text = re.sub(r'\d+', ' ', text) text = re.sub(r"\d+", " ", text)
text = re.sub(r'\s+', ' ', text) text = re.sub(r"\s+", " ", text)
text = text.strip() text = text.strip()
return text return text
def get_jacard_similarity(text_left, def get_jacard_similarity(
text_left,
text_right, text_right,
need_remove_special_characters=True, need_remove_special_characters=True,
need_remove_numeric_characters=True): need_remove_numeric_characters=True,
):
if need_remove_special_characters: if need_remove_special_characters:
text_left = remove_special_characters(text_left) text_left = remove_special_characters(text_left)
text_right = remove_special_characters(text_right) text_right = remove_special_characters(text_right)
@ -274,6 +379,7 @@ def get_jacard_similarity(text_left,
else: else:
return 0 return 0
def get_beginning_common_words(text_list: list): def get_beginning_common_words(text_list: list):
""" """
Get the beginning common words in text_list Get the beginning common words in text_list
@ -298,86 +404,89 @@ def get_beginning_common_words(text_list: list):
else: else:
break break
return ' '.join(common_words_list).strip() return " ".join(common_words_list).strip()
def replace_abbrevation(text: str): def replace_abbrevation(text: str):
if text is None or len(text.strip()) == 0: if text is None or len(text.strip()) == 0:
return text return text
text = text.strip() text = text.strip()
if 'swiss franc' in text.lower(): if "swiss franc" in text.lower():
text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE) text = re.sub(r"swiss\s+franc", "CHF", text, flags=re.IGNORECASE)
elif 'us dollar' in text.lower(): elif "us dollar" in text.lower():
text = re.sub(r'us\s+dollar', 'USD', text, flags=re.IGNORECASE) text = re.sub(r"us\s+dollar", "USD", text, flags=re.IGNORECASE)
elif 'singapore dollar' in text.lower(): elif "singapore dollar" in text.lower():
text = re.sub(r'singapore\s+dollar', 'SGD', text, flags=re.IGNORECASE) text = re.sub(r"singapore\s+dollar", "SGD", text, flags=re.IGNORECASE)
elif 'hong kong dollar' in text.lower(): elif "hong kong dollar" in text.lower():
text = re.sub(r'hong\s+kong\s+dollar', 'HKD', text, flags=re.IGNORECASE) text = re.sub(r"hong\s+kong\s+dollar", "HKD", text, flags=re.IGNORECASE)
elif 'hongkong dollar' in text.lower(): elif "hongkong dollar" in text.lower():
text = re.sub(r'hongkong\s+dollar', 'HKD', text, flags=re.IGNORECASE) text = re.sub(r"hongkong\s+dollar", "HKD", text, flags=re.IGNORECASE)
elif 'australian dollar' in text.lower(): elif "australian dollar" in text.lower():
text = re.sub(r'australian\s+dollar', 'AUD', text, flags=re.IGNORECASE) text = re.sub(r"australian\s+dollar", "AUD", text, flags=re.IGNORECASE)
elif 'japanese yen' in text.lower(): elif "japanese yen" in text.lower():
text = re.sub(r'japanese\s+yen', 'JPY', text, flags=re.IGNORECASE) text = re.sub(r"japanese\s+yen", "JPY", text, flags=re.IGNORECASE)
elif 'south african rand' in text.lower(): elif "south african rand" in text.lower():
text = re.sub(r'South\s+African\s+rand', 'ZAR', text, flags=re.IGNORECASE) text = re.sub(r"South\s+African\s+rand", "ZAR", text, flags=re.IGNORECASE)
elif 'canadian dollar' in text.lower(): elif "canadian dollar" in text.lower():
text = re.sub(r'canadian\s+dollar', 'CAD', text, flags=re.IGNORECASE) text = re.sub(r"canadian\s+dollar", "CAD", text, flags=re.IGNORECASE)
elif 'new zealand dollar' in text.lower(): elif "new zealand dollar" in text.lower():
text = re.sub(r'new\s+zealand\s+dollar', 'NZD', text, flags=re.IGNORECASE) text = re.sub(r"new\s+zealand\s+dollar", "NZD", text, flags=re.IGNORECASE)
elif 'norwegian krone' in text.lower(): elif "norwegian krone" in text.lower():
text = re.sub(r'norwegian\s+krone', 'NOK', text, flags=re.IGNORECASE) text = re.sub(r"norwegian\s+krone", "NOK", text, flags=re.IGNORECASE)
elif 'danish krone' in text.lower(): elif "danish krone" in text.lower():
text = re.sub(r'danish\s+krone', 'DKK', text, flags=re.IGNORECASE) text = re.sub(r"danish\s+krone", "DKK", text, flags=re.IGNORECASE)
elif 'swedish krona' in text.lower(): elif "swedish krona" in text.lower():
text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE) text = re.sub(r"swedish\s+krona", "SEK", text, flags=re.IGNORECASE)
elif 'swedish kronor' in text.lower(): elif "swedish kronor" in text.lower():
text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE) text = re.sub(r"swedish\s+kronor", "SEK", text, flags=re.IGNORECASE)
elif 'sterling' in text.lower().split(): elif "GPB" in text.split():
text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE) text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
elif 'euro' in text.lower().split(): elif "sterling" in text.lower().split():
text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE) text = re.sub(r"sterling", "GBP", text, flags=re.IGNORECASE)
elif '€' in text.lower().split(): elif "euro" in text.lower().split():
text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE) text = re.sub(r"euro", "EUR", text, flags=re.IGNORECASE)
elif '$' in text.lower().split(): elif "€" in text.lower().split():
text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE) text = re.sub(r"\€", "EUR", text, flags=re.IGNORECASE)
elif '£' in text.lower().split(): elif "$" in text.lower().split():
text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE) text = re.sub(r"\$", "USD", text, flags=re.IGNORECASE)
elif 'RMB' in text.lower().split(): elif "£" in text.lower().split():
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE) text = re.sub(r"\£", "GBP", text, flags=re.IGNORECASE)
elif "RMB" in text.split():
text = re.sub(r"RMB", "CNY", text, flags=re.IGNORECASE)
else: else:
pass pass
text_splits = text.split() text_splits = text.split()
new_text_splits = [] new_text_splits = []
for split in text_splits: for split in text_splits:
if split.lower() in ['acc', 'acc.']: if split.lower() in ["acc", "acc."]:
new_text_splits.append('Accumulation') new_text_splits.append("Accumulation")
elif split.lower() in ['inc', 'inc.']: elif split.lower() in ["inc", "inc."]:
new_text_splits.append('Income') new_text_splits.append("Income")
elif split.lower() in ['dist', 'dist.']: elif split.lower() in ["dist", "dist."]:
new_text_splits.append('Distribution') new_text_splits.append("Distribution")
elif split.lower() in ['inv', 'inv.']: elif split.lower() in ["inv", "inv."]:
new_text_splits.append('Investor') new_text_splits.append("Investor")
elif split.lower() in ['inst', 'inst.', 'institution']: elif split.lower() in ["inst", "inst.", "institution"]:
new_text_splits.append('Institutional') new_text_splits.append("Institutional")
elif split.lower() in ['cap', 'cap.']: elif split.lower() in ["cap", "cap."]:
new_text_splits.append('Capitalisation') new_text_splits.append("Capitalisation")
elif split.lower() in ['adm', 'adm.']: elif split.lower() in ["adm", "adm."]:
new_text_splits.append('Admin') new_text_splits.append("Admin")
elif split.lower() in ['adv', 'adv.']: elif split.lower() in ["adv", "adv."]:
new_text_splits.append('Advantage') new_text_splits.append("Advantage")
elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']: elif split.lower() in ["hdg", "hgd", "hdg.", "hgd.", "(h)"]:
new_text_splits.append('Hedged') new_text_splits.append("Hedged")
elif split.lower() in ['cl', 'cl.']: elif split.lower() in ["cl", "cl."]:
new_text_splits.append('Class') new_text_splits.append("Class")
elif split.lower() in ['ser', 'ser.']: elif split.lower() in ["ser", "ser."]:
new_text_splits.append('Series') new_text_splits.append("Series")
elif split.lower() in ['u.s.']: elif split.lower() in ["u.s."]:
new_text_splits.append('US') new_text_splits.append("US")
elif split.lower() in ['nc', 'nc.']: elif split.lower() in ["nc", "nc."]:
new_text_splits.append('no trail') new_text_splits.append("no trail")
else: else:
new_text_splits.append(split) new_text_splits.append(split)
new_text = ' '.join(new_text_splits) new_text = " ".join(new_text_splits)
return new_text return new_text