2024-08-23 21:38:11 +00:00
|
|
|
import re
|
2024-11-26 17:24:29 +00:00
|
|
|
import os
|
|
|
|
|
import time
|
2024-09-27 21:39:56 +00:00
|
|
|
from utils.logger import logger
|
2024-09-09 22:34:53 +00:00
|
|
|
from copy import deepcopy
|
2024-09-25 22:28:22 +00:00
|
|
|
from traceback import print_exc
|
2024-08-23 21:38:11 +00:00
|
|
|
|
2024-09-27 14:25:11 +00:00
|
|
|
|
|
|
|
|
# Three-letter currency codes recognised when scanning fund/share names.
# Membership tests elsewhere in this module compare against the upper-cased token.
total_currency_list = [
    "USD", "EUR", "AUD", "JPY", "CHF", "GBP", "SEK", "CNY", "NZD", "CNH",
    "NOK", "SGD", "HKD", "ZAR", "PLN", "CAD", "CZK", "HUF", "DKK", "BRL",
    "SKK", "RON", "TRY", "BGN", "CUP", "MXN", "CLF", "XCD", "ISK", "IDR",
    "MNT", "AED", "AFN", "INR", "ESP", "RUB", "CLP", "KRW", "ETB", "DZD",
    "XEU", "XFO",
]

# Full-word share-class features; compared case-insensitively by the helpers below.
share_features_full_name = [
    'Accumulation',
    'Income',
    'Distribution',
    'Investor',
    'Institutional',
    'Admin',
    'Advantage',
]

# Abbreviated spellings of share-class features.
share_features_abbrevation = [
    'Acc',
    'Inc',
    'Dist',
    'Div',
    'Inv',
    'Inst',
    'Adm',
    'Adv',
]

# Lower-case generic words that are dropped from names before comparison.
lower_pre_fix_fund_share = [
    'fund', "funds", 'portfolio',
    'bond', 'bonds', 'class',
    'classes', 'share', 'shares',
]
|
2024-09-27 21:39:56 +00:00
|
|
|
|
2024-09-27 14:25:11 +00:00
|
|
|
|
2024-08-23 21:38:11 +00:00
|
|
|
def add_slash_to_text_as_regex(text: str):
    """Turn literal text into a loose regular-expression pattern.

    Every non-word, non-whitespace character found in ``text`` is replaced by
    the two-character sequence ``\\W``; runs of two or more spaces are
    collapsed to one, and each remaining space becomes ``\\s*``.

    Args:
        text: The literal text to convert. ``None`` and ``""`` are returned
            unchanged.

    Returns:
        The pattern string, or the input itself when it is ``None``/empty.
    """
    if text is None or len(text) == 0:
        return text

    # Raw string: a plain "\W" is an invalid escape sequence and triggers a
    # SyntaxWarning on recent Python versions.
    for special_match in re.finditer(r"\W", text):
        special_char = special_match.group()
        if len(special_char.strip()) == 0:
            # Whitespace is handled separately at the end.
            continue
        escaped = "\\" + special_char
        # Skip characters that already appear backslash-escaped in the text.
        if escaped not in text:
            # r"\\W" is a replacement template that yields the literal "\W".
            text = re.sub(escaped, r"\\W", text)

    text = re.sub(r"( ){2,}", " ", text)
    text = text.replace(" ", r"\s*")
    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Normalise escaped-unicode residue and repeated spaces in ``text``.

    A literal backslash followed by ``u`` and four ASCII letters/digits
    (e.g. the six characters ``\\u2004`` or ``\\u00a0``) is replaced by a
    space. The result is stripped and runs of two or more spaces are
    collapsed into a single space.
    """
    without_escapes = re.sub(r"\\u[A-Z0-9a-z]{4}", ' ', text)
    return re.sub(r"( ){2,}", ' ', without_escapes.strip())
|
|
|
|
|
|
|
|
|
|
|
2024-09-27 21:39:56 +00:00
|
|
|
def get_most_similar_name(text: str,
                          name_list: list,
                          share_name: str = None,
                          fund_name: str = None,
                          matching_type="share",
                          pre_common_word_list: list = None,
                          process_cache: dict = None) -> tuple:
    """
    Pick the entry of ``name_list`` that is most similar to ``text``,
    scored with ``get_jacard_similarity``.

    NOTE(review): the block nesting in this function was reconstructed from a
    listing that had lost its indentation -- confirm against version control.

    Args:
        text: Raw name to match.
        name_list: Candidate full names. A deep copy is normalised for
            comparison; the returned name is always the untouched entry.
        share_name: Optional share-class portion of ``text``. When given, the
            share short names / feature / currency are derived from it
            instead of from ``text``.
        fund_name: Optional fund name. Its upper-cased tokens are tolerated
            as extra short names on the candidate side.
        matching_type: ``"share"`` enables the currency / feature /
            short-name filters; ``"fund"`` enables the tie handling.
        pre_common_word_list: Extra words to treat as common words.
        process_cache: Optional dict used to memoise the short-name, feature
            and currency extraction, keyed by the normalised text. It is
            mutated in place.

    Returns:
        ``(name, similarity)``. ``(None, None)`` when ``text`` or
        ``name_list`` is empty; ``(full_name, 1)`` as soon as a candidate
        scores exactly 1; ``(None, 0.0)`` when several "fund" candidates tie
        or when any exception is raised; ``(None, max_similarity)`` when the
        best score is below 0.35; otherwise ``(best_full_name, max_similarity)``.
    """
    try:
        copy_name_list = deepcopy(name_list)
        if text is None or len(text.split()) == 0 or \
            copy_name_list is None or len(copy_name_list) == 0:
            # NOTE(review): every other failure path returns (None, 0.0).
            return None, None

        # Normalise each candidate: multilingual "share class" wording,
        # hyphens inside the share part, and abbreviations.
        for i in range(len(copy_name_list)):
            copy_name = copy_name_list[i]
            if matching_type == "share":
                copy_name, _ = replace_share_name_for_multilingual(copy_name, None)
                share_part = get_share_part_list([copy_name])[0]
                if '-' in share_part:
                    copy_name = copy_name.replace('-', ' ')
            copy_name = replace_abbrevation(copy_name)
            copy_name_list[i] = copy_name

        # get common words in fund_name_list
        common_word_list = []
        if len(name_list) > 1:
            _, common_word_list = remove_common_word(copy_name_list)
        if pre_common_word_list is not None and len(pre_common_word_list) > 0:
            common_word_list.extend([word for word in pre_common_word_list
                                     if word not in common_word_list])

        # Single characters and currency codes are never treated as common words.
        if len(common_word_list) > 0:
            common_word_list = [word for word in common_word_list
                                if len(word) > 1 and word.upper() not in total_currency_list]

        text = text.strip()
        text = remove_special_characters(text)
        text = replace_abbrevation(text)
        raw_fund_name_split = []
        if fund_name is not None and len(fund_name.strip()) > 0:
            fund_name = fund_name.strip()
            fund_name = remove_special_characters(fund_name)
            raw_fund_name_split = fund_name.upper().split()

        if share_name is not None:
            share_name = remove_special_characters(share_name)
            share_name = replace_abbrevation(share_name)

            text, share_name = replace_share_name_for_multilingual(text, share_name)

        if matching_type == "share" and share_name is None:
            text, share_name = replace_share_name_for_multilingual(text, None)

        text_splits = text.split()
        if len(text_splits) == 1:
            text = split_words_without_space(text)
        else:
            new_splits = []
            for split in text_splits:
                if len(split) > 1:
                    new_splits.extend(split_words_without_space(split).split())
                else:
                    new_splits.append(split)
            text = ' '.join(new_splits)
            lower_new_splits = [split.lower() for split in new_splits]
            # A common word that the query text does not contain carries no
            # signal, so strip it from every candidate.
            for word in common_word_list:
                if word not in lower_new_splits:
                    # remove word in fund_name_list
                    for i in range(len(copy_name_list)):
                        temp_splits = copy_name_list[i].split()
                        copy_name_list[i] = ' '.join([split for split in temp_splits
                                                      if remove_special_characters(split).lower() != word])

        max_similarity = 0
        max_similarity_full_name = None
        text = remove_special_characters(text)

        if matching_type == "share":
            text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)

        # Drop generic words ("fund", "class", "share", ...) from the query.
        text = ' '.join([split for split in text.split()
                         if split.lower() not in lower_pre_fix_fund_share])
        if share_name is not None:
            share_name = ' '.join([split for split in share_name.split()
                                   if split.lower() not in lower_pre_fix_fund_share])

        # The share parts are taken before the generic words are removed from
        # the candidates, because get_share_part_list splits on them.
        copy_share_name_list = get_share_part_list(copy_name_list)
        for i in range(len(copy_name_list)):
            temp_splits = copy_name_list[i].split()
            copy_name_list[i] = ' '.join([split for split in temp_splits
                                          if remove_special_characters(split).lower()
                                          not in lower_pre_fix_fund_share])
        text_currency = None
        text_feature = None
        text_share_short_name_list = None
        # Extract short names / feature / currency for the query side,
        # going through process_cache when one was supplied.
        if matching_type == "share" and text is not None and len(text.strip()) > 0:
            if process_cache is not None and isinstance(process_cache, dict):
                if process_cache.get(text, None) is not None:
                    cache = process_cache.get(text)
                    text_share_short_name_list = cache.get("share_short_name")
                    text_feature = cache.get("share_feature")
                    text_currency = cache.get("share_currency")
                else:
                    if share_name is not None and len(share_name.strip()) > 0:
                        text_share_short_name_list = get_share_short_name_from_text(share_name,
                                                                                    confirm_text_share=True)
                        text_feature = get_share_feature_from_text(share_name)
                        text_currency = get_currency_from_text(share_name)
                    else:
                        text_share_short_name_list = get_share_short_name_from_text(text,
                                                                                    confirm_text_share=True)
                        text_feature = get_share_feature_from_text(text)
                        text_currency = get_currency_from_text(text)
                    # sort text_share_short_name_list
                    text_share_short_name_list.sort()
                    process_cache[text] = {
                        "share_short_name": text_share_short_name_list,
                        "share_feature": text_feature,
                        "share_currency": text_currency
                    }
            else:
                if share_name is not None and len(share_name.strip()) > 0:
                    text_share_short_name_list = get_share_short_name_from_text(share_name,
                                                                                confirm_text_share=True)
                    text_share_short_name_list.sort()
                    text_feature = get_share_feature_from_text(share_name)
                    text_currency = get_currency_from_text(share_name)
                else:
                    # NOTE(review): unlike the sibling branches, the list is
                    # not sorted here.
                    text_share_short_name_list = get_share_short_name_from_text(text,
                                                                                confirm_text_share=True)
                    text_feature = get_share_feature_from_text(text)
                    text_currency = get_currency_from_text(text)

        # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
        same_max_similarity_name_list = []
        for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list):
            if not isinstance(copy_name, str) or len(copy_name.strip()) == 0:
                continue
            copy_name = remove_special_characters(copy_name)
            copy_name = split_words_without_space(copy_name)
            copy_name_short_name_list = None
            copy_name_feature = None
            copy_name_currency = None
            if matching_type == "share":
                if process_cache is not None and isinstance(process_cache, dict):
                    if process_cache.get(copy_name, None) is not None:
                        cache = process_cache.get(copy_name)
                        copy_name_short_name_list = cache.get("share_short_name")
                        copy_name_feature = cache.get("share_feature")
                        copy_name_currency = cache.get("share_currency")
                    else:
                        copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
                        if copy_name_short_name_list is not None:
                            copy_name_short_name_list.sort()
                        copy_name_feature = get_share_feature_from_text(copy_share_name)
                        copy_name_currency = get_currency_from_text(copy_share_name)
                        process_cache[copy_name] = {
                            "share_short_name": copy_name_short_name_list,
                            "share_feature": copy_name_feature,
                            "share_currency": copy_name_currency
                        }
                else:
                    copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
                    # NOTE(review): get_share_short_name_from_text returns None
                    # for blank input; this unguarded sort() would then raise
                    # and the outer handler would return (None, 0.0). The
                    # cached branch above guards against None.
                    copy_name_short_name_list.sort()
                    copy_name_feature = get_share_feature_from_text(copy_share_name)
                    copy_name_currency = get_currency_from_text(copy_share_name)
                try:
                    # Reconcile single-letter short names on one side with a
                    # combined short name on the other (e.g. "A", "B" vs "AB").
                    if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
                        copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
                        updated_text_share_short_name_list, updated_copy_name_short_name_list = \
                            compare_both_short_name(text_share_short_name_list, copy_name_short_name_list)

                        # NOTE: text and text_share_short_name_list are
                        # reassigned here and keep their new value for the
                        # remaining candidates.
                        if updated_text_share_short_name_list != text_share_short_name_list:
                            text = ' '.join([split for split in text.split()
                                             if split not in text_share_short_name_list])
                            text += ' ' + ' '.join(updated_text_share_short_name_list)
                            text_share_short_name_list = updated_text_share_short_name_list

                        if updated_copy_name_short_name_list != copy_name_short_name_list:
                            copy_name = ' '.join([split for split in copy_name.split()
                                                  if split not in copy_name_short_name_list])
                            copy_name += ' ' + ' '.join(updated_copy_name_short_name_list)
                            copy_name_short_name_list = updated_copy_name_short_name_list
                except Exception as e:
                    print(e)

            compare_text = text
            try:
                text_split = text.split()
                text_split_lower = text.lower().split()
                copy_name_split_lower = copy_name.lower().split()
                # Align the query's feature wording with the candidate's
                # before scoring.
                if copy_name_feature == "accumulation" and \
                    (text_feature is None or len(text_feature) == 0 or
                     text_feature in ["capitalisation", "institutional"]
                     or "capitalisation" in text_split_lower or "institutional" in text_split_lower):
                    if "capitalisation" not in copy_name_split_lower:
                        compare_text = " ".join([split for split in text_split
                                                 if split.lower() not in ["cap", "cap.", "capitalisation"]])
                        text_split = compare_text.split()
                    if "institutional" not in copy_name_split_lower:
                        compare_text = " ".join([split for split in text_split
                                                 if split.lower() not in ["inst", "inst.", "institutional"]])
                        text_split = compare_text.split()
                    if text_feature is not None and len(text_feature) > 0:
                        compare_text = " ".join([split for split in text_split
                                                 if split.lower() != text_feature])
                    compare_text += " accumulation"
                    # NOTE: text_feature stays overridden for later candidates.
                    text_feature = "accumulation"
                elif copy_name_feature == "income" and \
                    (text_feature is None or len(text_feature) == 0 or text_feature == "distribution"):
                    if "dist" in text_split_lower or "dist." in text_split_lower or "distribution" in text_split_lower:
                        compare_text = " ".join([split for split in text_split
                                                 if split.lower() not in ["dist", "dist.", "distribution"]])
                    compare_text += " income"
                    text_feature = "income"
                else:
                    pass
                similarity = get_jacard_similarity(compare_text,
                                                   copy_name,
                                                   need_remove_numeric_characters=False)
            except Exception as e:
                print(e)
                print_exc()
                similarity = 0
            # A perfect score short-circuits all the filters below.
            if similarity == 1:
                return full_name, similarity
            # Retry with abbreviations expanded on the candidate and keep the
            # better of the two scores.
            copy_name_2 = replace_abbrevation(copy_name)
            if copy_name != copy_name_2:
                similarity_2 = get_jacard_similarity(compare_text,
                                                     copy_name_2,
                                                     need_remove_numeric_characters=False)
                if similarity_2 > similarity:
                    similarity = similarity_2
            if similarity > max_similarity:
                if matching_type == "share":
                    # Reject candidates whose currency differs from the query's.
                    if text_currency is not None and len(text_currency) > 0 and \
                        copy_name_currency is not None and len(copy_name_currency) > 0:
                        if text_currency != copy_name_currency:
                            continue
                    if text_feature is not None and len(text_feature) > 0 and \
                        copy_name_feature is not None and len(copy_name_feature) > 0:
                        if text_feature != copy_name_feature:
                            # NOTE(review): "accmulation" looks like a typo for
                            # "accumulation"; as written this comparison is
                            # always true -- confirm the intent before changing.
                            if text_feature.lower() not in copy_name.lower().split() and \
                                copy_name_feature.lower() != "accmulation" and \
                                copy_name_feature.lower() not in compare_text.lower().split():
                                continue
                if matching_type == "share":
                    if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
                        copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
                        short_name_invalid = False
                        # Every short name of the query must exist on the candidate.
                        for short in text_share_short_name_list:
                            if short not in copy_name_short_name_list:
                                short_name_invalid = True
                                break
                        for compare_short in copy_name_short_name_list:
                            if compare_short not in text_share_short_name_list:
                                # some short word is in fund name, but not belong to share name

                                if compare_short.upper() not in raw_fund_name_split:
                                    short_name_invalid = True
                                    break
                        if short_name_invalid:
                            continue
                max_similarity = similarity
                max_similarity_full_name = full_name
                same_max_similarity_name_list = []
            elif matching_type == "fund" and max_similarity > 0 and max_similarity == similarity:
                # Tie in "fund" mode: prefer the name with more words,
                # otherwise record the tie.
                if full_name is not None and max_similarity_full_name is not None and \
                    len(full_name.split()) > len(max_similarity_full_name.split()):
                    max_similarity_full_name = full_name
                    same_max_similarity_name_list = []
                else:
                    if full_name is not None:
                        same_max_similarity_name_list.append(full_name)
            if max_similarity == 1:
                break
        # if there are multiple names with the same similarity, return None
        if len(same_max_similarity_name_list) > 0:
            return None, 0.0
        # Scores below 0.35 are reported but not accepted as a match.
        if max_similarity < 0.35:
            return None, max_similarity
        return max_similarity_full_name, max_similarity
    except Exception as e:
        print(e)
        print_exc()
        return None, 0.0
|
|
|
|
|
|
2024-10-01 20:31:15 +00:00
|
|
|
|
2024-11-21 17:37:58 +00:00
|
|
|
def replace_share_name_for_multilingual(text: str, share_name: str):
    """Replace a non-English "share class" term with the word ``"Class"``.

    Only the first term of the list that occurs in ``text`` is replaced
    (all of its occurrences); the same term is then replaced in
    ``share_name`` when that is non-blank. Matching is case-sensitive.

    Args:
        text: Name to normalise. ``None`` or blank input is returned as is.
        share_name: Optional share-class portion of the name.

    Returns:
        ``(text, share_name)`` after replacement.
    """
    if text is None or len(text.strip()) == 0:
        return text, share_name

    # Longer terms must precede their own prefixes: with "Aktien" tried
    # before "Aktienklasse", "Aktienklasse A" would become "Classklasse A".
    multilingual_share_list = ["Catégorie de parts", "Classe di quote",
                               "Kategorie Anteile", "Kategorie anteile",
                               "Clase de participaciones", "Aandelenklasse",
                               "aandelenklasse", "Anteilklasse", "anteilklasse",
                               "Aktienklasse", "aktienklasse", "Aktien", "aktien"]
    for multilingual_share in multilingual_share_list:
        if multilingual_share in text:
            text = text.replace(multilingual_share, "Class")
            if share_name is not None and len(share_name.strip()) > 0:
                share_name = share_name.replace(multilingual_share, "Class")
            break
    return text, share_name
|
|
|
|
|
|
|
|
|
|
|
2024-10-09 04:53:55 +00:00
|
|
|
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
    """Reconcile two short-name lists without touching the caller's lists.

    Works on deep copies. The text side is first checked against the
    compare side, then the compare side is checked against the
    already-updated text side.

    Returns:
        ``(updated_text_short_names, updated_compare_short_names)``.
    """
    text_side = deepcopy(text_short_name_list)
    compare_side = deepcopy(compare_short_name_list)

    # Order matters: the second call sees the result of the first.
    text_side = verify_short_name_container(text_side, compare_side)
    compare_side = verify_short_name_container(compare_side, text_side)

    return text_side, compare_side
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def verify_short_name_container(left_short_name_list: list, right_short_name_list: list):
    """Merge single-character short names that the other side holds combined.

    When ``left_short_name_list`` contains more than one single-character
    entry, each entry of ``right_short_name_list`` whose length equals the
    number of those entries and which contains all of them replaces them:
    the single characters are removed from the left list and the combined
    entry is appended.

    The left list is modified in place and also returned.
    """
    single_chars = [name for name in left_short_name_list if len(name) == 1]
    if len(single_chars) <= 1:
        return left_short_name_list

    for candidate in right_short_name_list:
        if len(candidate) != len(single_chars):
            continue
        if not all(char in candidate for char in single_chars):
            continue
        for char in single_chars:
            if char in left_short_name_list:
                left_short_name_list.remove(char)
        left_short_name_list.append(candidate)

    return left_short_name_list
|
|
|
|
|
|
|
|
|
|
|
2024-10-01 20:31:15 +00:00
|
|
|
def get_share_part_list(text_list: list):
    """Return, for each name, the part that follows the fund keyword.

    Each text is split on the first of "Funds", "Fund", "Portfolio",
    "Bonds", "Bond" (tried in that order, case-sensitive) that occurs in it,
    and the piece after the last occurrence is kept. If none occurs, the
    whole text is kept. Generic words listed in ``lower_pre_fix_fund_share``
    are then dropped from the result.
    """
    fund_keywords = ("Funds", "Fund", "Portfolio", "Bonds", "Bond")
    share_parts = []
    for text in text_list:
        pieces = [text]
        for keyword in fund_keywords:
            pieces = text.split(keyword)
            if len(pieces) > 1:
                break

        if len(pieces) > 1:
            tail = pieces[-1].strip()
        else:
            tail = text.strip()

        kept_words = []
        for word in tail.split():
            if remove_special_characters(word).lower() not in lower_pre_fix_fund_share:
                kept_words.append(word)
        share_parts.append(' '.join(kept_words))
    return share_parts
|
|
|
|
|
|
|
|
|
|
|
2024-10-09 04:53:55 +00:00
|
|
|
def get_share_short_name_from_text(text: str, confirm_text_share: bool = False):
    """Collect upper-cased short tokens (3 characters or fewer) from the end of ``text``.

    The last 6 tokens are inspected when ``confirm_text_share`` is true,
    otherwise the last 4. Tokens that are share-feature words or currency
    codes are ignored.

    NOTE(review): the inspection counter is reconstructed as advancing once
    per inspected token (not once per accepted token) -- confirm.

    Returns:
        ``None`` for ``None``/blank input, else the list of short names in
        reverse text order (last token first).
    """
    if text is None or len(text.strip()) == 0:
        return None

    tokens = remove_special_characters(text.strip()).split()
    feature_words = [feature.lower() for feature in share_features_full_name]
    window = 6 if confirm_text_share else 4

    short_names = []
    for token in tokens[::-1][:window]:
        if token.lower() in feature_words:
            continue
        if token.upper() in total_currency_list:
            continue
        if len(token) <= 3:
            short_names.append(token.upper())

    if len(short_names) > 1:
        # Gather the run of purely numeric entries at the tail of the list
        # (i.e. the earliest inspected tokens in the text).
        trailing_numbers = []
        for name in reversed(short_names):
            if not name.isdigit():
                break
            trailing_numbers.append(name)
        # list.remove drops the first equal entry, exactly as the removal
        # has always been done here.
        for number in trailing_numbers:
            if number in short_names:
                short_names.remove(number)

    return short_names
|
2024-09-27 21:39:56 +00:00
|
|
|
|
2024-09-27 14:25:11 +00:00
|
|
|
def get_share_feature_from_text(text: str):
    """Return the last share-feature word among the final four tokens of ``text``.

    The text is lower-cased first, so the returned word is lower case.
    Returns ``None`` for ``None``/blank input or when none of the last four
    tokens is in ``share_features_full_name``.
    """
    if text is None or len(text.strip()) == 0:
        return None

    tokens = text.strip().lower().split()
    feature_words = [feature.lower() for feature in share_features_full_name]

    # Walk backwards over at most the four trailing tokens.
    for token in reversed(tokens[-4:]):
        if token.lower() in feature_words:
            return token
    return None
|
|
|
|
|
|
2024-11-21 22:36:58 +00:00
|
|
|
|
2024-09-27 14:25:11 +00:00
|
|
|
def get_currency_from_text(text: str):
    """Return the currency code found among the final four tokens of ``text``.

    Tokens are scanned from the end. When more than one currency code is
    found, the first 'USD' or 'EUR' hit (in scan order) is discarded and the
    first remaining code is returned.

    Returns:
        The upper-cased currency code, or ``None`` for ``None``/blank input
        or when no code is found.
    """
    if text is None or len(text.strip()) == 0:
        return None

    found = []
    for token in text.strip().split()[::-1][:4]:
        code = token.upper()
        if code in total_currency_list:
            found.append(code)

    if len(found) == 0:
        return None

    if len(found) > 1:
        # Drop the earliest-scanned USD/EUR entry, if there is one.
        for position, code in enumerate(found):
            if code in ['USD', 'EUR']:
                del found[position]
                break

    return found[0]
|
2024-09-27 14:25:11 +00:00
|
|
|
|
2024-09-09 22:34:53 +00:00
|
|
|
|
2024-10-01 20:31:15 +00:00
|
|
|
def update_for_currency(text: str, share_name: str, compare_list: list) -> tuple:
    """
    Make the query text and the candidate names consistent about currency.

    When the query has no currency but some candidates do, a currency is
    appended to ``text`` / ``share_name`` ('USD' unless exactly one
    short-name-matching candidate currency is found) and 'USD' is appended
    to the candidates that lack one. When the share name carries several
    currency codes, only the one chosen by ``get_currency_from_text`` is
    kept, and a matching secondary code is removed from the candidates'
    share parts.

    NOTE(review): the block nesting in this function was reconstructed from a
    listing that had lost its indentation -- confirm against version control.

    Args:
        text: Normalised query text.
        share_name: Share-class portion of the query, or ``None``.
        compare_list: Candidate names; modified in place.

    Returns:
        ``(text, share_name, compare_list)``. On any exception the error is
        logged and whatever was computed so far is returned.
    """
    try:
        currency_in_text = get_currency_from_text(text)
        with_currency = False
        if currency_in_text is not None:
            with_currency = True

        # Partition the candidate indexes by whether a currency is present.
        with_currency_list = []
        without_currency_list = []
        for index, compare in enumerate(compare_list):
            # compare_split = compare.split()
            with_currency_compare = False
            currecy_in_compare = get_currency_from_text(compare)
            if currecy_in_compare is not None:
                with_currency_compare = True

            if with_currency_compare:
                with_currency_list.append(index)
            else:
                without_currency_list.append(index)
        if not with_currency and len(with_currency_list) == 0:
            # Nobody mentions a currency: nothing to align.
            pass
        elif not with_currency and len(with_currency_list) > 0:
            share_short_name_list = []
            if share_name is not None and len(share_name.strip()) > 0:
                share_short_name_list = get_share_short_name_from_text(share_name)
            updated = False
            if len(share_short_name_list) > 0:
                if len(without_currency_list) > 0:
                    # A currency-less candidate carrying all the query's short
                    # names means the query is treated as USD.
                    for index in without_currency_list:
                        all_in_list = True
                        compare_split = [split.upper() for split in compare_list[index].split()]
                        for share_shot_name in share_short_name_list:
                            if share_shot_name not in compare_split:
                                all_in_list = False
                                break
                        if all_in_list:
                            text = text + ' ' + 'USD'
                            if share_name is not None:
                                share_name = share_name + ' ' + 'USD'
                            updated = True
                            break
                if not updated:
                    # Otherwise take the currency of the candidates with a
                    # currency that carry all the short names, but only when
                    # exactly one currency was collected.
                    currency_list = []
                    for index in with_currency_list:
                        all_in_list = True
                        compare_split = [split.upper() for split in compare_list[index].split()]
                        for share_shot_name in share_short_name_list:
                            if share_shot_name not in compare_split:
                                all_in_list = False
                                break
                        if all_in_list:
                            current_currency_list = [split for split in compare_split
                                                     if split.upper() in total_currency_list]
                            if len(current_currency_list) > 0:
                                currency_list.append(current_currency_list[-1])
                    if len(currency_list) == 1:
                        text = text + ' ' + currency_list[0]
                        if share_name is not None:
                            share_name = share_name + ' ' + currency_list[0]
                        updated = True

            # Candidates without a currency default to USD.
            for index in without_currency_list:
                compare_list[index] = compare_list[index] + ' ' + 'USD'

            if not updated:
                text = text + ' ' + 'USD'
                if share_name is not None:
                    share_name = share_name + ' ' + 'USD'
            # return text, share_name, compare_list
        elif with_currency and len(without_currency_list) == 0:
            # NOTE(review): this branch requires without_currency_list to be
            # empty, so the loop below never runs -- confirm whether `> 0`
            # was intended.
            for index in without_currency_list:
                compare_list[index] = compare_list[index] + ' ' + 'USD'
            # return text, share_name, compare_list
        else:
            # return text, share_name, compare_list
            pass
        default_currency = 'USD'
        if with_currency and share_name is not None:
            share_name_split = share_name.split()
            share_name_currency = get_currency_from_text(share_name)
            if share_name_currency is not None and share_name_currency in total_currency_list:
                # The first other currency code in the share name becomes the
                # "default" currency that is stripped from candidates below.
                for split in share_name_split:
                    if split in total_currency_list and split != share_name_currency:
                        default_currency = split
                        break
                # Keep only the selected currency code in the share name / text.
                new_share_name = ' '.join([split for split in share_name_split
                                           if split not in total_currency_list
                                           or (split == share_name_currency)])
                if share_name in text:
                    text = text.replace(share_name, new_share_name)
                else:
                    text = ' '.join([split for split in text.split()
                                     if split not in total_currency_list
                                     or (split == share_name_currency)])
                share_name = new_share_name

        for c_i in range(len(compare_list)):
            compare = compare_list[c_i]
            compare_share_part = get_share_part_list([compare])[0]
            compare_share_part_split = compare_share_part.split()
            compare_share_part_currency_list = []
            for split in compare_share_part_split:
                # NOTE(review): the duplicate check uses split.upper() but the
                # original-case token is stored, so lower-case codes are not
                # de-duplicated and never equal default_currency -- confirm.
                if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list:
                    compare_share_part_currency_list.append(split)
            # With several currencies in the share part, drop the default one.
            if len(compare_share_part_currency_list) > 1 and default_currency in compare_share_part_currency_list:
                compare_share_part_split = [split for split in compare_share_part_split if split.upper() != default_currency]
                new_compare_share_part = ' '.join(compare_share_part_split)
                compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part)
    except Exception as e:
        logger.error(f"Error in update_for_currency: {e}")
    return text, share_name, compare_list
|
2024-09-25 22:28:22 +00:00
|
|
|
|
|
|
|
|
|
2024-09-09 22:34:53 +00:00
|
|
|
def remove_common_word(text_list: list):
    """
    Strip the words that every text in ``text_list`` has in common.

    Each text is lower-cased, cleaned with ``remove_special_characters`` and
    stripped of the generic tokens listed in ``lower_pre_fix_fund_share``.
    Words present in *all* of the normalised texts are then removed from each
    of them, e.g. 'Blackrock Global Fund' and 'Blackrock Growth Fund' share
    'blackrock' (and the generic 'fund'), leaving ['global', 'growth'].

    Currency codes from ``total_currency_list`` are never treated as common
    words, so they stay in the texts.

    :param text_list: list of names to compare.
    :return: ``(new_text_list, common_word_list)``; ``common_word_list`` is
        sorted.  A ``None`` or empty ``text_list`` is returned unchanged
        (not wrapped in a tuple).
    """
    if text_list is None or len(text_list) == 0:
        return text_list

    new_text_splits_list = []
    for text in text_list:
        text = remove_special_characters(text.lower())
        new_text_splits_list.append(
            [split for split in text.split()
             if split.lower() not in lower_pre_fix_fund_share])

    # A word is common only when every text contains it.  The former pairwise
    # scan restarted while its running intersection was still empty, so words
    # shared by just some of the texts could be reported as common.
    # A single text has nothing to be compared with, hence no common words.
    if len(new_text_splits_list) > 1:
        common_words = set.intersection(
            *(set(splits) for splits in new_text_splits_list))
    else:
        common_words = set()

    # currency codes are kept in the texts even when all of them share one
    common_word_list = sorted(word for word in common_words
                              if word.upper() not in total_currency_list)

    for splits in new_text_splits_list:
        for common_word in common_word_list:
            if common_word in splits:
                # list.remove only drops the first occurrence of the word
                splits.remove(common_word)
    new_text_list = [' '.join(splits) for splits in new_text_splits_list]
    return new_text_list, common_word_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_words_without_space(text: str):
    """
    Put spaces around capitalised words that are glued together.

    For example 'BlackrockGlobalFund' becomes 'Blackrock Global Fund'.
    A capitalised word that directly follows another upper-case letter
    (such as 'Hedged' in 'USDHedged') stays attached to that prefix.

    :param text: text to split.
    :return: the text with the detected words separated by single spaces.
        Text without any capitalised word is returned stripped but otherwise
        untouched.  ``None`` or blank input yields an empty list (kept as is
        for backward compatibility).
    """
    if text is None or len(text.strip()) == 0:
        return []
    text = text.strip()
    if re.search(r"[A-Z][a-z]+", text) is None:
        return text
    # Pad every match at the position where it occurs.  Replacing by value
    # (str.replace) also rewrote the same letters inside longer words, e.g.
    # 'IncIncome' ended up as 'Inc Inc ome'.
    text = re.sub(r"(?<![A-Z])[A-Z][a-z]+", r" \g<0> ", text)
    text = re.sub(r"(\s)+", " ", text)
    return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove_special_characters(text):
    """
    Replace every non-word character of ``text`` with a space.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is dropped.  Letters, digits and '_' are kept.
    """
    only_word_chars = re.sub(r'\W', ' ', text)
    # every separator is a plain space by now, so split/join normalises them
    return ' '.join(only_word_chars.split())
|
|
|
|
|
|
2024-09-18 22:10:54 +00:00
|
|
|
def get_unique_words_text(text):
    """
    Build a canonical form of ``text`` from its distinct words.

    Special characters are stripped with ``remove_special_characters``, the
    result is lower-cased, and the unique words are joined by single spaces
    in ascending order.
    """
    cleaned = remove_special_characters(text).lower()
    unique_words = sorted(set(cleaned.split()))
    return ' '.join(unique_words)
|
|
|
|
|
|
2024-09-09 22:34:53 +00:00
|
|
|
|
|
|
|
|
def remove_numeric_characters(text):
    """
    Replace every run of digits in ``text`` with a space.

    Whitespace is then collapsed to single spaces and trimmed at both ends.
    """
    without_digits = re.sub(r'\d+', ' ', text)
    return ' '.join(without_digits.split())
|
|
|
|
|
|
|
|
|
|
|
2024-09-27 00:25:17 +00:00
|
|
|
def get_jacard_similarity(text_left,
                          text_right,
                          need_remove_special_characters=True,
                          need_remove_numeric_characters=True):
    """
    Word-level Jaccard similarity of two texts, rounded to 3 decimals.

    Both texts are optionally cleaned with ``remove_special_characters`` and
    ``remove_numeric_characters``, lower-cased and split into word sets.
    When the words found on one side only concatenate (in sorted order) to
    the same string as the words found on the other side only, e.g.
    'hongkong' against 'hong' + 'kong', those words are counted as shared.

    :return: shared word count divided by the size of the union, or 0 when
        neither text contains a word.
    """
    if need_remove_special_characters:
        text_left = remove_special_characters(text_left)
        text_right = remove_special_characters(text_right)
    if need_remove_numeric_characters:
        text_left = remove_numeric_characters(text_left)
        text_right = remove_numeric_characters(text_right)

    left_words = set(text_left.lower().split())
    right_words = set(text_right.lower().split())

    union_count = len(left_words | right_words)
    if union_count == 0:
        return 0

    shared_count = len(left_words & right_words)
    only_left = sorted(left_words - right_words)
    only_right = sorted(right_words - left_words)
    # same letters, different word boundaries: treat the leftovers as matched
    if ''.join(only_left) == ''.join(only_right):
        shared_count += len(only_left) + len(only_right)
    return round(shared_count / union_count, 3)
|
|
|
|
|
|
2024-10-01 17:19:45 +00:00
|
|
|
|
|
|
|
|
def simple_most_similarity_name(text: str, name_list: list):
    """
    Pick the entry of ``name_list`` most similar to ``text``.

    Similarity is measured with ``get_jacard_similarity``; the first entry
    reaching the highest score wins and the scan stops at a perfect score.

    :return: ``(name, similarity)``; ``(None, 0.0)`` when ``text`` is blank
        or ``name_list`` is empty, and ``name`` is ``None`` when no entry
        scores above 0.
    """
    no_text = text is None or len(text.strip()) == 0
    if no_text or name_list is None or len(name_list) == 0:
        return None, 0.0

    best_name, best_score = None, 0
    for candidate in name_list:
        score = get_jacard_similarity(text, candidate)
        if score > best_score:
            best_name, best_score = candidate, score
            if best_score == 1:
                # nothing can beat an exact match
                break
    return best_name, best_score
|
|
|
|
|
|
|
|
|
|
|
2024-09-20 03:45:08 +00:00
|
|
|
def get_beginning_common_words(text_list: list):
    """
    Get the leading words shared by every text in ``text_list``.

    The texts are compared word by word from the start; the comparison stops
    at the first position where they differ or where one of them ends.

    :param text_list: list of whitespace-separated texts.
    :return: the common leading words joined by single spaces ('' when the
        first words already differ).  An empty list is returned when
        ``text_list`` is None or holds fewer than two texts.
    """
    if text_list is None or len(text_list) < 2:
        return []

    # split every text once instead of once per compared word position
    split_texts = [text.split() for text in text_list]

    common_words_list = []
    # zip stops at the shortest text, i.e. where one of the texts runs out
    for words_at_position in zip(*split_texts):
        first_word = words_at_position[0]
        if any(word != first_word for word in words_at_position[1:]):
            break
        common_words_list.append(first_word)
    return ' '.join(common_words_list)
|
2024-09-09 22:34:53 +00:00
|
|
|
|
|
|
|
|
# Spelled-out currency names, in the order they are tried.  The leading \b
# keeps a phrase from matching inside a longer word ('Focus Dollar' is not
# 'US Dollar').
_CURRENCY_PHRASE_RULES = tuple(
    (re.compile(r'\b' + phrase, re.IGNORECASE), code)
    for phrase, code in (
        (r'swiss\s+franc', 'CHF'),
        (r'us\s+dollar', 'USD'),
        (r'singapore\s+dollar', 'SGD'),
        (r'hong\s+kong\s+dollar', 'HKD'),
        (r'hongkong\s+dollar', 'HKD'),
        (r'australian\s+dollar', 'AUD'),
        (r'japanese\s+yen', 'JPY'),
        (r'south\s+african\s+rand', 'ZAR'),
        (r'canadian\s+dollar', 'CAD'),
        (r'new\s+zealand\s+dollar', 'NZD'),
        (r'norwegian\s+krone', 'NOK'),
        (r'danish\s+krone', 'DKK'),
        (r'swedish\s+krona', 'SEK'),
        (r'swedish\s+kronor', 'SEK'),
    )
)

# lower-cased token -> replacement word
_SHARE_FEATURE_WORDS = {
    'acc': 'Accumulation', 'acc.': 'Accumulation', 'accumulating': 'Accumulation',
    'inc': 'Income', 'inc.': 'Income',
    'dist': 'Distribution', 'dist.': 'Distribution', 'dis': 'Distribution',
    'dis.': 'Distribution', 'distributing': 'Distribution',
    'inv': 'Investor', 'inv.': 'Investor',
    'inst': 'Institutional', 'inst.': 'Institutional', 'institution': 'Institutional',
    'cap': 'Capitalisation', 'cap.': 'Capitalisation',
    'div': 'Dividend', 'div.': 'Dividend',
    'adm': 'Admin', 'adm.': 'Admin',
    'adv': 'Advantage', 'adv.': 'Advantage',
    'hdg': 'Hedged', 'hgd': 'Hedged', 'hdg.': 'Hedged', 'hgd.': 'Hedged',
    'cl': 'Class', 'cl.': 'Class',
    'ser': 'Series', 'ser.': 'Series',
    'u.s.': 'US',
    'nc': 'no trail', 'nc.': 'no trail',
    'non': 'Not',
}

# lower-cased tokens that are removed from the text altogether
_DROPPED_WORDS = frozenset(('net', 'unhgd'))


def _replace_whole_token(text: str, word: str, code: str) -> str:
    """Replace ``word`` (any letter case) by ``code`` where it is a whole token."""
    # Whitespace-delimited match only: a substring replacement turned
    # 'Euro European' into 'EUR EURpean'.
    return re.sub(r'(?<!\S)' + word + r'(?!\S)', code, text,
                  flags=re.IGNORECASE)


def _replace_currency_name(text: str) -> str:
    """Apply the first matching currency rule to ``text`` (at most one rule)."""
    for pattern, code in _CURRENCY_PHRASE_RULES:
        text, replaced = pattern.subn(code, text)
        if replaced:
            return text

    tokens = text.split()
    lowered_tokens = text.lower().split()
    if 'GPB' in tokens:
        # 'GPB' is mapped to 'GBP', i.e. handled as a misspelling
        return _replace_whole_token(text, 'GPB', 'GBP')
    if 'sterling' in lowered_tokens:
        return _replace_whole_token(text, 'sterling', 'GBP')
    if 'euro' in lowered_tokens:
        return _replace_whole_token(text, 'euro', 'EUR')
    # a symbol must stand alone as a token to trigger, but is then replaced
    # wherever it occurs
    for symbol, code in (('€', 'EUR'), ('$', 'USD'), ('£', 'GBP')):
        if symbol in lowered_tokens:
            return text.replace(symbol, code)
    if 'RMB' in tokens:
        return _replace_whole_token(text, 'RMB', 'CNY')
    return text


def replace_abbrevation(text: str):
    """
    Expand currency names and share-feature abbreviations in ``text``.

    Steps:
    1. '(', ')' and '-' become spaces and whitespace is collapsed.
    2. The first matching currency rule is applied (only one per call):
       spelled-out names such as 'Swiss Franc' or 'US Dollar', the tokens
       'GPB', 'sterling', 'euro', 'RMB', or a stand-alone '€', '$', '£'
       are replaced by the ISO code.
    3. Each token is looked up case-insensitively: abbreviations such as
       'Acc', 'Inc.', 'Dist', 'Hdg', 'Cl' are expanded, 'net', 'unhgd' and
       a lower-case 'fl' are dropped, and any other token is passed through
       ``split_short_name_with_share_features``.

    :param text: fund or share class name.
    :return: the expanded text with single spaces; ``None`` or blank input
        is returned unchanged.
    """
    if text is None or len(text.strip()) == 0:
        return text
    text = text.replace('(', ' ').replace(')', ' ').replace('-', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    text = _replace_currency_name(text)

    new_text_splits = []
    for split in text.split():
        lowered = split.lower()
        if lowered in _SHARE_FEATURE_WORDS:
            new_text_splits.append(_SHARE_FEATURE_WORDS[lowered])
        elif lowered in _DROPPED_WORDS or split == "fl":
            # 'fl' is only dropped in this exact lower-case spelling
            continue
        else:
            new_text_splits.append(split_short_name_with_share_features(split))

    new_text = ' '.join(new_text_splits)
    return re.sub(r'\s+', ' ', new_text).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_short_name_with_share_features(text: str):
    """
    Split a glued share-class token into currency, remainder and feature.

    A trailing 'Acc', 'Inc', 'Dist' or 'Div' is expanded to its full name,
    the first code of ``total_currency_list`` contained in the token is moved
    to the front, and the parts are joined by single spaces.
    For examples (document mapping for 532422720):
        CHFHInc to be CHF H Income
        USDHAcc to be USD H Accumulation
        GBPHInc to be GBP H Income
        HAcc to be H Accumulation
        GBPHedgedAcc to be GBP Hedged Accumulation
        HGBPInc to be GBP H Income
        HNOKAcc to be NOK H Accumulation

    :param text: a single token; ``None``, blank text and text made of
        several words are returned unchanged.
    :return: the split text.
    """
    if text is None or len(text.strip()) == 0:
        return text
    if len(text.split()) > 1:
        return text
    text = text.strip()

    share_features = {'Acc': 'Accumulation',
                      'Inc': 'Income',
                      'Dist': 'Distribution',
                      'Div': 'Dividend'}
    feature_name = ""
    for key, value in share_features.items():
        if len(text) > len(key) and text.endswith(key):
            feature_name = value
            # cut the suffix only; str.replace would also delete the same
            # letters elsewhere in the token
            text = text[:-len(key)]
            break

    currency_text = ""
    for currency in total_currency_list:
        # the code may sit anywhere in the token, not only at the start
        if len(text) > len(currency) and currency in text:
            currency_text = currency
            text = text.replace(currency, '')
            break

    new_text = currency_text + ' ' + text + ' ' + feature_name
    return re.sub(r'\s+', ' ', new_text).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_folder(folder_path: str, expired_days: int = 5):
    """
    Delete the files below ``folder_path`` that are older than ``expired_days``.

    The folder is walked recursively and a file is removed when its last
    modification lies more than ``expired_days`` days in the past.
    Directories themselves are never removed.  The clean-up is best effort:
    a file that cannot be inspected or deleted is skipped.

    :param folder_path: root folder to clean; nothing happens if it is missing.
    :param expired_days: maximum age of a file in days.
    """
    if not os.path.exists(folder_path):
        return
    seconds_per_day = 60 * 60 * 24
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                age_in_days = (time.time() - os.path.getmtime(file_path)) / seconds_per_day
                if age_in_days > expired_days:
                    os.remove(file_path)
            except OSError:
                # The file vanished, is locked or may not be deleted: skip it.
                # Only OSError is swallowed so that KeyboardInterrupt and
                # programming errors are no longer hidden.
                continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove_abundant_data(data_list: list):
    """
    Remove repeated data points across all entries of ``data_list``.

    The detail rows under ``entry["extract_data"]["data"]`` are passed, entry
    by entry, through ``remove_abundant_data_detail`` with one shared record
    of what has been seen, so only the first value found per fund/share and
    data point is kept.  The entries are modified in place.

    :param data_list: list of dicts that may carry an "extract_data" dict.
    :return: the same ``data_list``.
    """
    exist_data_list = []
    for data in data_list:
        extract_data = data.get("extract_data")
        if extract_data is None:
            # nothing was extracted for this entry; writing the result back
            # used to fail with a KeyError here
            continue
        extract_data["data"] = remove_abundant_data_detail(
            extract_data.get("data", []), exist_data_list)
    return data_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove_abundant_data_detail(data_detail_list: list,
                                exist_data_list: list):
    """
    Drop data points that were already recorded for the same fund and share.

    ``exist_data_list`` holds one record per (fund_name, share_name) pair
    together with the first non-None value seen for each data point; it is
    updated in place so that it can be shared between calls.  A data point
    whose value is already recorded is removed from the row, and a row left
    with nothing but "fund_name"/"share_name" is removed from the list.

    :param data_detail_list: list of row dicts; modified in place.
    :param exist_data_list: records of the values seen so far.
    :return: the same ``data_detail_list``.
    """
    regular_attributes = ("fund_name", "share_name")
    for data_detail in data_detail_list:
        fund_name = data_detail.get("fund_name", "")
        share_name = data_detail.get("share_name", "")

        # look the record up once per row instead of once per data point
        exist_data = next(
            (record for record in exist_data_list
             if fund_name == record["fund_name"]
             and share_name == record["share_name"]),
            None)
        if exist_data is None:
            exist_data = {"fund_name": fund_name, "share_name": share_name}
            exist_data_list.append(exist_data)

        # iterate over a copy because keys are popped from the row
        for data_key, data_value in list(data_detail.items()):
            if data_key in regular_attributes:
                continue
            if exist_data.get(data_key) is not None:
                data_detail.pop(data_key)
            else:
                # Not recorded yet, or recorded without a value: remember this
                # one.  A None placeholder used to be kept forever, so later
                # duplicates were never detected.
                exist_data[data_key] = data_value

    # keep only the rows that still carry at least one data point
    data_detail_list[:] = [
        data_detail for data_detail in data_detail_list
        if any(key not in regular_attributes for key in data_detail)]
    return data_detail_list
|