493 lines
17 KiB
Python
493 lines
17 KiB
Python
import re
|
|
from copy import deepcopy
|
|
from traceback import print_exc
|
|
|
|
|
|
# Three-letter currency codes that the helpers in this module treat as
# currency tokens. Lookups upper-case the candidate token before testing
# membership, so every entry here is upper case.
total_currency_list = (
    "USD EUR AUD JPY CHF GBP SEK CNY NZD CNH NOK "
    "SGD HKD ZAR PLN CAD CZK HUF DKK BRL SKK RON "
    "TRY BGN CUP MXN TOP ILS CLF XCD ISK IDR MNT "
    "AED AFN INR ESP RUB CLP KRW ETB DZD XEU XFO"
).split()
|
|
|
|
|
|
def add_slash_to_text_as_regex(text: str):
    """
    Turn literal text into a regex pattern that matches it.

    Every character that is neither a word character nor whitespace is
    prefixed with a backslash, and every run of whitespace is replaced by
    the pattern ``\\s+`` so the result tolerates varying spacing.

    The input is treated as literal text: a backslash already present in
    it is escaped like any other special character.

    :param text: literal text, may be None or empty
    :return: the regex pattern, or ``text`` unchanged when it is None or empty
    """
    if text is None or len(text) == 0:
        return text
    # One pass with a callable replacement: each special character is
    # escaped exactly once, including the backslash itself, which a
    # template-based re.sub cannot express safely.
    escaped = re.sub(r"[^\w\s]", lambda match: "\\" + match.group(), text)
    return re.sub(r"\s+", r"\\s+", escaped)
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """
    Blank out textual unicode escapes and tidy the spacing.

    A literal backslash followed by ``u`` and four letters or digits
    (for example the six characters ``\\u00a0``) is replaced by a space.
    The result is stripped and runs of two or more spaces are collapsed
    to a single space.
    """
    without_escapes = re.sub(r"\\u[A-Z0-9a-z]{4}", " ", text)
    trimmed = without_escapes.strip()
    return re.sub(r"( ){2,}", " ", trimmed)
|
|
|
|
|
|
def get_most_similar_name(
    text: str, name_list: list, pre_common_word_list: list = None
) -> tuple:
    """
    Get the most similar fund name from name_list by jacard similarity

    Both the text and the candidate names are normalised first:
    abbreviations are expanded, special characters are removed and words
    that were run together are split. When the text has more than one
    token, words shared by all candidates that do not occur in the text,
    and the generic words fund / portfolio / class / share / shares, are
    dropped before comparing.

    :param text: the name to look up
    :param name_list: candidate names; never modified
    :param pre_common_word_list: extra lower-case words to treat as common
    :return: ``(name, similarity)``. ``(None, None)`` when text or
        name_list is empty, ``(None, similarity)`` when the best
        similarity is below 0.35, ``(None, 0.0)`` when an exception was
        caught (the traceback is printed).
    """
    generic_words = ("fund", "portfolio", "class", "share", "shares")
    try:
        if (
            text is None
            or len(text.split()) == 0
            or name_list is None
            or len(name_list) == 0
        ):
            return None, None

        # Abbreviations are expanded once on the raw name and once more
        # after punctuation has been stripped.
        candidates = [replace_abbrevation(name) for name in name_list]
        candidates = [
            replace_abbrevation(remove_special_characters(name))
            for name in candidates
        ]

        # get common words in name_list
        common_word_list = []
        if len(name_list) > 1:
            _, common_word_list = remove_common_word(candidates)
        if pre_common_word_list is not None and len(pre_common_word_list) > 0:
            common_word_list.extend(
                [word for word in pre_common_word_list if word not in common_word_list]
            )

        text = replace_abbrevation(text.strip())
        text = replace_abbrevation(remove_special_characters(text))
        text_splits = text.split()
        if len(text_splits) == 1:
            # NOTE(review): a single-token text skips the common-word
            # pruning below; confirm this matches the intended nesting.
            text = split_words_without_space(text)
        else:
            new_splits = []
            for token in text_splits:
                if len(token) > 1:
                    new_splits.extend(split_words_without_space(token).split())
                else:
                    new_splits.append(token)

            # A common word is only dropped from the candidates when the
            # text itself does not contain it.
            text_words = [token.lower() for token in new_splits]
            ignorable = set(generic_words)
            ignorable.update(
                word for word in common_word_list if word not in text_words
            )
            candidates = [
                " ".join(
                    token
                    for token in name.split()
                    if remove_special_characters(token).lower() not in ignorable
                )
                for name in candidates
            ]
            text = " ".join(
                token for token in new_splits if token.lower() not in generic_words
            )

        text = remove_special_characters(text)
        text, candidates = update_for_currency(text, candidates)

        max_similarity = 0
        max_similarity_fund_name = None
        for fund_name, candidate in zip(name_list, candidates):
            candidate = remove_special_characters(candidate)
            # A candidate can be empty once common and generic words are
            # gone; split_words_without_space is skipped for it so the
            # similarity helper always receives a string.
            if candidate:
                candidate = split_words_without_space(candidate)
            similarity = get_jacard_similarity(
                text, candidate, need_remove_numeric_characters=False
            )
            if similarity > max_similarity:
                max_similarity = similarity
                max_similarity_fund_name = fund_name
            if max_similarity == 1:
                break
        if max_similarity < 0.35:
            return None, max_similarity
        return max_similarity_fund_name, max_similarity
    except Exception as e:
        # Best-effort lookup: report the problem and signal "no match".
        print(e)
        print_exc()
        return None, 0.0
|
|
|
|
|
|
def update_for_currency(text: str, compare_list: list):
    """
    Give text and candidates a currency token so both sides are comparable.

    A token counts as a currency when its upper-case form is in
    total_currency_list.

    * Text without any token, or text that already has a currency:
      nothing is changed.
    * Text without a currency and no candidate with a currency: nothing
      is changed.
    * Text without a currency and at least one candidate with a currency:
      every candidate lacking a currency gets " USD" appended, and the
      text gets a currency appended. That currency is normally "USD";
      when the last token of the text is short (fewer than 4 characters)
      and has no lower-case letters, it is instead taken from the single
      currency-carrying candidate containing that token, provided no
      currency-less candidate contains the token and exactly one such
      currency is found.

    :param text: the normalised text being matched
    :param compare_list: normalised candidate names; modified in place
    :return: ``(text, compare_list)``
    """
    known_currencies = set(total_currency_list)

    def has_currency(tokens):
        return any(token.upper() in known_currencies for token in tokens)

    text_split = text.split()
    if not text_split:
        # Nothing to align; this also avoids indexing the last token of
        # an empty token list further down.
        return text, compare_list

    with_currency_list = []
    without_currency_list = []
    for index, compare in enumerate(compare_list):
        if has_currency(compare.split()):
            with_currency_list.append(index)
        else:
            without_currency_list.append(index)

    if has_currency(text_split):
        # NOTE(review): the original looped over without_currency_list
        # here only when that list was empty, so candidates were never
        # changed for a text that carries a currency. That behaviour is
        # kept; confirm whether defaulting them to USD was intended.
        return text, compare_list
    if len(with_currency_list) == 0:
        return text, compare_list

    last_split = text_split[-1]
    updated = False
    if len(last_split) < 4 and last_split.upper() == last_split:
        # The trailing token appears on a candidate without a currency:
        # fall back to the default currency.
        if any(
            last_split in compare_list[index].split()
            for index in without_currency_list
        ):
            text = text + " " + "USD"
            updated = True
        else:
            currency_list = []
            for index in with_currency_list:
                compare_split = compare_list[index].split()
                if last_split in compare_split:
                    current_currency_list = [
                        token
                        for token in compare_split
                        if token.upper() in known_currencies
                    ]
                    if len(current_currency_list) > 0:
                        currency_list.append(current_currency_list[-1])
            # Only adopt a candidate's currency when it is unambiguous.
            if len(currency_list) == 1:
                text = text + " " + currency_list[0]
                updated = True

    for index in without_currency_list:
        compare_list[index] = compare_list[index] + " " + "USD"

    if not updated:
        text = text + " " + "USD"
    return text, compare_list
|
|
|
|
|
|
def remove_common_word(text_list: list):
    """
    Strip the words shared by every name in text_list.

    Each name is lower-cased, cleaned with remove_special_characters and
    stripped of the generic words fund / portfolio / share / class. The
    words occurring in all of the cleaned names are the common words,
    e.g. 'Blackrock Global Fund' and 'Blackrock Growth Fund' share
    'blackrock'. Two kinds of shared word are kept in the names and left
    out of the common words:

    * currency codes listed in total_currency_list;
    * words equal to their own upper-case form that sit among the last
      three words of a name with at least four words.

    :param text_list: names to compare
    :return: ``(cleaned_names, common_word_list)``. The cleaned names are
        lower case with every occurrence of a common word removed; the
        common words are sorted. For None or empty input the result is
        ``(text_list, [])``; a single name has no common words.
    """
    if text_list is None or len(text_list) == 0:
        return text_list, []

    generic_words = ("fund", "portfolio", "share", "class")
    token_lists = [
        [
            token
            for token in remove_special_characters(text.lower()).split()
            if token not in generic_words
        ]
        for text in text_list
    ]

    # A word is common only when every name contains it. Intersecting
    # all names at once keeps the result independent of their order.
    if len(token_lists) > 1:
        shared_words = set(token_lists[0]).intersection(*token_lists[1:])
    else:
        shared_words = set()

    # if exists the share name and currency name, remove from the list
    known_currencies = set(total_currency_list)
    protected_words = {
        word for word in shared_words if word.upper() in known_currencies
    }
    for tokens in token_lists:
        if len(tokens) < 4:
            continue
        last_three_words = tokens[-3:]
        # NOTE(review): tokens are lower-cased above, so this upper-case
        # test only holds for tokens without letters (e.g. digits);
        # confirm whether upper-case share-class names were meant.
        protected_words.update(
            word
            for word in shared_words
            if word.upper() == word and word in last_three_words
        )

    common_words = shared_words - protected_words
    new_text_list = [
        " ".join(token for token in tokens if token not in common_words)
        for tokens in token_lists
    ]
    return new_text_list, sorted(common_words)
|
|
|
|
|
|
def split_words_without_space(text: str):
    """
    Split words without space, such as 'BlackrockGlobalFund' will be
    turned into 'Blackrock Global Fund'.

    Every run of one upper-case ASCII letter followed by one or more
    lower-case ASCII letters is treated as a word and separated from its
    neighbours by a single space. Other characters are left where they
    are ('USDHedged' becomes 'USD Hedged').

    :param text: text to split, may be None
    :return: the re-spaced, stripped text as a str; "" when text is None
        or blank
    """
    if text is None or len(text.strip()) == 0:
        return ""
    text = text.strip()
    # A single regex pass pads each word where it was matched. Replacing
    # the found words one by one would also hit them inside longer words
    # ('Man' inside 'Management').
    spaced, word_count = re.subn(r"[A-Z][a-z]+", r" \g<0> ", text)
    if word_count > 0:
        text = re.sub(r"(\s)+", " ", spaced)
    return text.strip()
|
|
|
|
|
|
def remove_special_characters(text):
    """
    Keep only ASCII letters, digits and whitespace in text.

    Every other character is replaced by a space; whitespace runs are
    then collapsed to a single space and the result is stripped.
    """
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", alnum_only).strip()
|
|
|
|
|
|
def get_unique_words_text(text):
    """
    Return the distinct words of text as one sorted, space-joined string.

    The text is cleaned with remove_special_characters and lower-cased
    before it is split into words.
    """
    normalised = remove_special_characters(text).lower()
    distinct_words = sorted(set(normalised.split()))
    return " ".join(distinct_words)
|
|
|
|
|
|
def remove_numeric_characters(text):
    """
    Replace every run of digits in text with a space.

    Whitespace runs are collapsed to a single space afterwards and the
    result is stripped.
    """
    without_digits = re.sub(r"\d+", " ", text)
    return re.sub(r"\s+", " ", without_digits).strip()
|
|
|
|
|
|
def get_jacard_similarity(
    text_left,
    text_right,
    need_remove_special_characters=True,
    need_remove_numeric_characters=True,
):
    """
    Jaccard similarity of the lower-cased word sets of two texts.

    Depending on the flags, both texts are first passed through
    remove_special_characters and/or remove_numeric_characters.

    :return: size of the intersection divided by size of the union of the
        two word sets, rounded to 3 decimals; 0 when both sets are empty
    """
    cleaners = []
    if need_remove_special_characters:
        cleaners.append(remove_special_characters)
    if need_remove_numeric_characters:
        cleaners.append(remove_numeric_characters)
    for clean in cleaners:
        text_left = clean(text_left)
        text_right = clean(text_right)

    left_words = set(text_left.lower().split())
    right_words = set(text_right.lower().split())
    all_words = left_words | right_words
    if not all_words:
        return 0
    return round(len(left_words & right_words) / len(all_words), 3)
|
|
|
|
|
|
def get_beginning_common_words(text_list: list):
    """
    Get the beginning common words in text_list

    Texts are compared word by word, case-sensitively, from the left;
    the result stops at the first position where any text differs or has
    run out of words.

    :param text_list: texts to compare
    :return: the shared leading words joined by single spaces; "" when
        text_list is None, has fewer than two entries, or the texts do
        not start with the same word
    """
    if text_list is None or len(text_list) < 2:
        return ""

    # Split every text once; zip then walks the word positions and stops
    # at the shortest text.
    token_lists = [text.split() for text in text_list]
    common_words_list = []
    for words_at_position in zip(*token_lists):
        first_word = words_at_position[0]
        if any(word != first_word for word in words_at_position[1:]):
            break
        common_words_list.append(first_word)
    return " ".join(common_words_list)
|
|
|
|
|
|
def _currency_rule(trigger, code, target=None, flags=re.IGNORECASE):
    """Build one (trigger regex, target regex, currency code) rewrite rule."""
    trigger_re = re.compile(trigger, flags)
    target_re = trigger_re if target is None else re.compile(target, flags)
    return trigger_re, target_re, code


# Ordered currency rewrite rules; only the first rule whose trigger matches
# is applied. Word-based rules are anchored on word boundaries so they never
# rewrite the inside of a longer word. Symbol rules fire only when the symbol
# stands alone as a token and then replace every occurrence of the symbol.
_CURRENCY_RULES = (
    _currency_rule(r"\bswiss\s+franc", "CHF"),
    _currency_rule(r"\bus\s+dollar", "USD"),
    _currency_rule(r"\bsingapore\s+dollar", "SGD"),
    _currency_rule(r"\bhong\s+kong\s+dollar", "HKD"),
    _currency_rule(r"\bhongkong\s+dollar", "HKD"),
    _currency_rule(r"\baustralian\s+dollar", "AUD"),
    _currency_rule(r"\bjapanese\s+yen", "JPY"),
    _currency_rule(r"\bSouth\s+African\s+rand", "ZAR"),
    _currency_rule(r"\bcanadian\s+dollar", "CAD"),
    _currency_rule(r"\bnew\s+zealand\s+dollar", "NZD"),
    _currency_rule(r"\bnorwegian\s+krone", "NOK"),
    _currency_rule(r"\bdanish\s+krone", "DKK"),
    _currency_rule(r"\bswedish\s+krona", "SEK"),
    _currency_rule(r"\bswedish\s+kronor", "SEK"),
    _currency_rule(r"\bGPB\b", "GBP", flags=0),
    _currency_rule(r"\bsterling\b", "GBP"),
    _currency_rule(r"\beuro\b", "EUR"),
    _currency_rule(r"(?<!\S)€(?!\S)", "EUR", target=r"\€"),
    _currency_rule(r"(?<!\S)\$(?!\S)", "USD", target=r"\$"),
    _currency_rule(r"(?<!\S)£(?!\S)", "GBP", target=r"\£"),
    _currency_rule(r"\bRMB\b", "CNY", flags=0),
)

# Lower-cased share-class abbreviation -> expansion, matched on whole tokens.
_TOKEN_EXPANSIONS = {
    "acc": "Accumulation",
    "acc.": "Accumulation",
    "inc": "Income",
    "inc.": "Income",
    "dist": "Distribution",
    "dist.": "Distribution",
    "inv": "Investor",
    "inv.": "Investor",
    "inst": "Institutional",
    "inst.": "Institutional",
    "institution": "Institutional",
    "cap": "Capitalisation",
    "cap.": "Capitalisation",
    "adm": "Admin",
    "adm.": "Admin",
    "adv": "Advantage",
    "adv.": "Advantage",
    "hdg": "Hedged",
    "hgd": "Hedged",
    "hdg.": "Hedged",
    "hgd.": "Hedged",
    "(h)": "Hedged",
    "cl": "Class",
    "cl.": "Class",
    "ser": "Series",
    "ser.": "Series",
    "u.s.": "US",
    "nc": "no trail",
    "nc.": "no trail",
}


def replace_abbrevation(text: str):
    """
    Expand currency names and share-class abbreviations in a fund name.

    First, at most one currency rule is applied: the first rule in
    _CURRENCY_RULES that matches rewrites the currency name, misspelling
    or symbol to its three-letter code (e.g. 'US Dollar' -> 'USD',
    'Euro' -> 'EUR', 'GPB' -> 'GBP', 'RMB' -> 'CNY'). Call the function
    again to rewrite a second currency in the same text.

    Then every whitespace-separated token that is a known abbreviation
    (compared case-insensitively, e.g. 'Acc' -> 'Accumulation',
    'Hdg' -> 'Hedged') is replaced by its expansion, and the tokens are
    joined with single spaces.

    :param text: the name to expand, may be None
    :return: the expanded name; ``text`` unchanged when it is None or blank
    """
    if text is None or len(text.strip()) == 0:
        return text
    text = text.strip()

    for trigger_re, target_re, code in _CURRENCY_RULES:
        if trigger_re.search(text):
            text = target_re.sub(code, text)
            break

    return " ".join(
        _TOKEN_EXPANSIONS.get(token.lower(), token) for token in text.split()
    )
|