Optimize the mapping algorithm; this is the fixed version used to confirm the mapping metrics.
This commit is contained in:
parent
7eba9a52ae
commit
0c4c541319
13
main.py
13
main.py
|
|
@ -564,10 +564,8 @@ def test_data_extraction_metrics():
|
|||
|
||||
|
||||
def test_mapping_raw_name():
|
||||
doc_id = "344636875"
|
||||
raw_fund_name = ""
|
||||
raw_share_name = ""
|
||||
raw_name = "Aberdeen Standard Alpha Global Loans I QInc USD"
|
||||
doc_id = "481475385"
|
||||
raw_name = "Emerging Markets Fund Y-DIST Shares (USD)"
|
||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||
data_mapping = DataMapping(
|
||||
doc_id,
|
||||
|
|
@ -578,7 +576,7 @@ def test_mapping_raw_name():
|
|||
)
|
||||
mapping_info = data_mapping.matching_with_database(
|
||||
raw_name=raw_name,
|
||||
parent_id="FS0000DA0E",
|
||||
parent_id=None,
|
||||
matching_type="share"
|
||||
)
|
||||
print(mapping_info)
|
||||
|
|
@ -657,16 +655,15 @@ if __name__ == "__main__":
|
|||
"486378555",
|
||||
"506559375",
|
||||
"479793787",
|
||||
"333207452",
|
||||
"471641628",
|
||||
]
|
||||
special_doc_id_list = check_mapping_doc_id_list
|
||||
# special_doc_id_list = ["402181770"]
|
||||
special_doc_id_list = ["402113224"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = True
|
||||
force_save_total_data = False
|
||||
|
||||
extract_ways = ["text"]
|
||||
for extract_way in extract_ways:
|
||||
|
|
|
|||
|
|
@ -2,6 +2,54 @@ import re
|
|||
from copy import deepcopy
|
||||
from traceback import print_exc
|
||||
|
||||
|
||||
# Three-letter currency codes recognised when scanning fund / share names.
# Lookups in this module upper-case a token and test membership in this list.
total_currency_list = [
    "USD", "EUR", "AUD", "JPY", "CHF", "GBP", "SEK", "CNY",
    "NZD", "CNH", "NOK", "SGD", "HKD", "ZAR", "PLN", "CAD",
    "CZK", "HUF", "DKK", "BRL", "SKK", "RON", "TRY", "BGN",
    "CUP", "MXN", "CLF", "XCD", "ISK", "IDR", "MNT", "AED",
    "AFN", "INR", "ESP", "RUB", "CLP", "KRW", "ETB", "DZD",
    "XEU", "XFO",
]

# Share-class feature keywords; get_share_feature_from_text compares them
# case-insensitively against the words of a name.
share_features = [
    "Accumulation",
    "Income",
    "Distribution",
    "Investor",
    "Institutional",
    "Capitalisation",
    "Admin",
    "Advantage",
]
|
||||
|
||||
def add_slash_to_text_as_regex(text: str):
|
||||
if text is None or len(text) == 0:
|
||||
return text
|
||||
|
|
@ -29,18 +77,18 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
|||
Get the most similar fund name from fund_name_list by jacard similarity
|
||||
"""
|
||||
try:
|
||||
copy_fund_name_list = deepcopy(name_list)
|
||||
copy_name_list = deepcopy(name_list)
|
||||
if text is None or len(text.split()) == 0 or \
|
||||
copy_fund_name_list is None or len(copy_fund_name_list) == 0:
|
||||
copy_name_list is None or len(copy_name_list) == 0:
|
||||
return None, None
|
||||
|
||||
copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name
|
||||
in copy_fund_name_list]
|
||||
copy_name_list = [replace_abbrevation(copy_name) for copy_name
|
||||
in copy_name_list]
|
||||
|
||||
# get common words in fund_name_list
|
||||
common_word_list = []
|
||||
if len(name_list) > 1:
|
||||
_, common_word_list = remove_common_word(copy_fund_name_list)
|
||||
_, common_word_list = remove_common_word(copy_name_list)
|
||||
if pre_common_word_list is not None and len(pre_common_word_list) > 0:
|
||||
common_word_list.extend([word for word in pre_common_word_list
|
||||
if word not in common_word_list])
|
||||
|
|
@ -63,14 +111,14 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
|||
for word in common_word_list:
|
||||
if word not in lower_new_splits:
|
||||
# remove word in fund_name_list
|
||||
for i in range(len(copy_fund_name_list)):
|
||||
temp_splits = copy_fund_name_list[i].split()
|
||||
copy_fund_name_list[i] = ' '.join([split for split in temp_splits
|
||||
for i in range(len(copy_name_list)):
|
||||
temp_splits = copy_name_list[i].split()
|
||||
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||
if remove_special_characters(split).lower() != word])
|
||||
|
||||
for i in range(len(copy_fund_name_list)):
|
||||
temp_splits = copy_fund_name_list[i].split()
|
||||
copy_fund_name_list[i] = ' '.join([split for split in temp_splits
|
||||
for i in range(len(copy_name_list)):
|
||||
temp_splits = copy_name_list[i].split()
|
||||
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||
if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
|
||||
final_splits = []
|
||||
for split in new_splits:
|
||||
|
|
@ -79,38 +127,72 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
|||
|
||||
text = ' '.join(final_splits)
|
||||
max_similarity = 0
|
||||
max_similarity_fund_name = None
|
||||
max_similarity_full_name = None
|
||||
text = remove_special_characters(text)
|
||||
text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
|
||||
for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
|
||||
copy_fund_name = remove_special_characters(copy_fund_name)
|
||||
copy_fund_name = split_words_without_space(copy_fund_name)
|
||||
text, copy_name_list = update_for_currency(text, copy_name_list)
|
||||
text_currencty = get_currency_from_text(text)
|
||||
text_feature = get_share_feature_from_text(text)
|
||||
for full_name, copy_name in zip(name_list , copy_name_list):
|
||||
copy_name = remove_special_characters(copy_name)
|
||||
copy_name = split_words_without_space(copy_name)
|
||||
similarity = get_jacard_similarity(text,
|
||||
copy_fund_name,
|
||||
copy_name,
|
||||
need_remove_numeric_characters=False)
|
||||
copy_name_2 = replace_abbrevation(copy_name)
|
||||
if copy_name != copy_name_2:
|
||||
similarity_2 = get_jacard_similarity(text,
|
||||
copy_name_2,
|
||||
need_remove_numeric_characters=False)
|
||||
if similarity_2 > similarity:
|
||||
similarity = similarity_2
|
||||
if similarity > max_similarity:
|
||||
copy_name_currency = get_currency_from_text(copy_name)
|
||||
if text_currencty is not None and copy_name_currency is not None:
|
||||
if text_currencty != copy_name_currency:
|
||||
continue
|
||||
copy_name_feature = get_share_feature_from_text(copy_name)
|
||||
if text_feature is not None and copy_name_feature is not None:
|
||||
if text_feature != copy_name_feature:
|
||||
continue
|
||||
max_similarity = similarity
|
||||
max_similarity_fund_name = fund_name
|
||||
max_similarity_full_name = full_name
|
||||
if max_similarity == 1:
|
||||
break
|
||||
if max_similarity < 0.35:
|
||||
return None, max_similarity
|
||||
return max_similarity_fund_name, max_similarity
|
||||
return max_similarity_full_name, max_similarity
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print_exc()
|
||||
return None, 0.0
|
||||
|
||||
def get_share_feature_from_text(text: str):
    """Return the last share-feature keyword that appears in ``text``.

    The text is stripped, lower-cased and split on whitespace; the words are
    then scanned from right to left and the first one that equals a
    lower-cased entry of the module-level ``share_features`` list is returned.

    Args:
        text: Name text to inspect; may be ``None`` or blank.

    Returns:
        The matching word in lower case, or ``None`` when ``text`` is
        ``None``/blank or contains no known feature keyword.
    """
    if text is None:
        return None

    words = text.strip().lower().split()
    if not words:
        return None

    # Compare in lower case because the input words were lower-cased above.
    known_features = [item.lower() for item in share_features]
    return next(
        (word for word in reversed(words) if word in known_features),
        None,
    )
|
||||
|
||||
def get_currency_from_text(text: str):
    """Return the last currency code that appears in ``text``.

    The text is stripped, lower-cased and split on whitespace; the words are
    scanned from right to left and the first one whose upper-cased form is in
    the module-level ``total_currency_list`` is returned.

    Args:
        text: Name text to inspect; may be ``None`` or blank.

    Returns:
        The matching word in lower case (the input is lower-cased before the
        scan), or ``None`` when ``text`` is ``None``/blank or holds no
        known currency code.
    """
    if text is None:
        return None

    # A blank string yields no tokens, so the loop below is skipped.
    tokens = text.strip().lower().split()
    for token in reversed(tokens):
        if token.upper() in total_currency_list:
            return token
    return None
|
||||
|
||||
|
||||
def update_for_currency(text: str, compare_list: list):
|
||||
text_split = text.split()
|
||||
with_currency = False
|
||||
total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
|
||||
'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
|
||||
'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
|
||||
'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
|
||||
'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
|
||||
'ETB', 'DZD', 'XEU', 'XFO']
|
||||
for split in text_split:
|
||||
if split.upper() in total_currency_list:
|
||||
with_currency = True
|
||||
|
|
@ -198,6 +280,16 @@ def remove_common_word(text_list: list):
|
|||
else:
|
||||
common_word_list = list(
|
||||
set(common_word_list).intersection(set(new_text_splits_list[j])))
|
||||
|
||||
remove_list = []
|
||||
# if exists the share name and currency name, remove from the list
|
||||
for word in common_word_list:
|
||||
if word.upper() in total_currency_list:
|
||||
remove_list.append(word)
|
||||
for remove in remove_list:
|
||||
if remove in common_word_list:
|
||||
common_word_list.remove(remove)
|
||||
|
||||
common_word_list = list(set(common_word_list))
|
||||
for i in range(len(new_text_splits_list)):
|
||||
for common_word in common_word_list:
|
||||
|
|
@ -219,12 +311,22 @@ def split_words_without_space(text: str):
|
|||
# if len(splits) > 1:
|
||||
# return text
|
||||
# find all words with capital letter + lower letter
|
||||
regex = r'[A-Z][a-z]+'
|
||||
regex = r"[A-Z][a-z]+"
|
||||
regex2 = r"[A-Z]{2,}[a-z]+"
|
||||
word_list = re.findall(regex, text)
|
||||
word_list2 = re.findall(regex2, text)
|
||||
if len(word_list) > 0:
|
||||
for word in word_list:
|
||||
text = text.replace(word, ' ' + word + ' ')
|
||||
text = re.sub(r'(\s)+', ' ', text)
|
||||
if len(word_list2) > 0:
|
||||
word_exists_in_word2 = False
|
||||
for word2 in word_list2:
|
||||
if word in word2:
|
||||
word_exists_in_word2 = True
|
||||
break
|
||||
if word_exists_in_word2:
|
||||
continue
|
||||
text = text.replace(word, " " + word + " ")
|
||||
text = re.sub(r"(\s)+", " ", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
|
|
@ -332,6 +434,8 @@ def replace_abbrevation(text: str):
|
|||
text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
|
||||
elif 'swedish kronor' in text.lower():
|
||||
text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
|
||||
elif "GPB" in text.split():
|
||||
text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
|
||||
elif 'sterling' in text.lower().split():
|
||||
text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
|
||||
elif 'euro' in text.lower().split():
|
||||
|
|
@ -342,7 +446,7 @@ def replace_abbrevation(text: str):
|
|||
text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
|
||||
elif '£' in text.lower().split():
|
||||
text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
|
||||
elif 'RMB' in text.lower().split():
|
||||
elif 'RMB' in text.split():
|
||||
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
|
||||
else:
|
||||
pass
|
||||
|
|
|
|||
Loading…
Reference in New Issue