optimize mapping algorithm, this is the fixed version to confirm mapping metrics
This commit is contained in:
parent
7eba9a52ae
commit
0c4c541319
13
main.py
13
main.py
|
|
@ -564,10 +564,8 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "344636875"
|
doc_id = "481475385"
|
||||||
raw_fund_name = ""
|
raw_name = "Emerging Markets Fund Y-DIST Shares (USD)"
|
||||||
raw_share_name = ""
|
|
||||||
raw_name = "Aberdeen Standard Alpha Global Loans I QInc USD"
|
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -578,7 +576,7 @@ def test_mapping_raw_name():
|
||||||
)
|
)
|
||||||
mapping_info = data_mapping.matching_with_database(
|
mapping_info = data_mapping.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
parent_id="FS0000DA0E",
|
parent_id=None,
|
||||||
matching_type="share"
|
matching_type="share"
|
||||||
)
|
)
|
||||||
print(mapping_info)
|
print(mapping_info)
|
||||||
|
|
@ -657,16 +655,15 @@ if __name__ == "__main__":
|
||||||
"486378555",
|
"486378555",
|
||||||
"506559375",
|
"506559375",
|
||||||
"479793787",
|
"479793787",
|
||||||
"333207452",
|
|
||||||
"471641628",
|
"471641628",
|
||||||
]
|
]
|
||||||
special_doc_id_list = check_mapping_doc_id_list
|
special_doc_id_list = check_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["402181770"]
|
special_doc_id_list = ["402113224"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,54 @@ import re
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from traceback import print_exc
|
from traceback import print_exc
|
||||||
|
|
||||||
|
|
||||||
|
# Upper-case currency codes recognised by the name-matching helpers.
# Callers test membership with `token.upper() in total_currency_list`.
total_currency_list = [
    "USD", "EUR", "AUD", "JPY", "CHF", "GBP", "SEK", "CNY",
    "NZD", "CNH", "NOK", "SGD", "HKD", "ZAR", "PLN", "CAD",
    "CZK", "HUF", "DKK", "BRL", "SKK", "RON", "TRY", "BGN",
    "CUP", "MXN", "CLF", "XCD", "ISK", "IDR", "MNT", "AED",
    "AFN", "INR", "ESP", "RUB", "CLP", "KRW", "ETB", "DZD",
    "XEU", "XFO",
]

# Share-class feature keywords; compared case-insensitively against the
# whitespace-separated tokens of a name.
share_features = [
    'Accumulation',
    'Income',
    'Distribution',
    'Investor',
    'Institutional',
    'Capitalisation',
    'Admin',
    'Advantage',
]
|
||||||
|
|
||||||
def add_slash_to_text_as_regex(text: str):
|
def add_slash_to_text_as_regex(text: str):
|
||||||
if text is None or len(text) == 0:
|
if text is None or len(text) == 0:
|
||||||
return text
|
return text
|
||||||
|
|
@ -29,18 +77,18 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
||||||
Get the most similar fund name from fund_name_list by Jaccard similarity
|
Get the most similar fund name from fund_name_list by Jaccard similarity
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
copy_fund_name_list = deepcopy(name_list)
|
copy_name_list = deepcopy(name_list)
|
||||||
if text is None or len(text.split()) == 0 or \
|
if text is None or len(text.split()) == 0 or \
|
||||||
copy_fund_name_list is None or len(copy_fund_name_list) == 0:
|
copy_name_list is None or len(copy_name_list) == 0:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name
|
copy_name_list = [replace_abbrevation(copy_name) for copy_name
|
||||||
in copy_fund_name_list]
|
in copy_name_list]
|
||||||
|
|
||||||
# get common words in fund_name_list
|
# get common words in fund_name_list
|
||||||
common_word_list = []
|
common_word_list = []
|
||||||
if len(name_list) > 1:
|
if len(name_list) > 1:
|
||||||
_, common_word_list = remove_common_word(copy_fund_name_list)
|
_, common_word_list = remove_common_word(copy_name_list)
|
||||||
if pre_common_word_list is not None and len(pre_common_word_list) > 0:
|
if pre_common_word_list is not None and len(pre_common_word_list) > 0:
|
||||||
common_word_list.extend([word for word in pre_common_word_list
|
common_word_list.extend([word for word in pre_common_word_list
|
||||||
if word not in common_word_list])
|
if word not in common_word_list])
|
||||||
|
|
@ -63,14 +111,14 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
||||||
for word in common_word_list:
|
for word in common_word_list:
|
||||||
if word not in lower_new_splits:
|
if word not in lower_new_splits:
|
||||||
# remove word in fund_name_list
|
# remove word in fund_name_list
|
||||||
for i in range(len(copy_fund_name_list)):
|
for i in range(len(copy_name_list)):
|
||||||
temp_splits = copy_fund_name_list[i].split()
|
temp_splits = copy_name_list[i].split()
|
||||||
copy_fund_name_list[i] = ' '.join([split for split in temp_splits
|
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||||
if remove_special_characters(split).lower() != word])
|
if remove_special_characters(split).lower() != word])
|
||||||
|
|
||||||
for i in range(len(copy_fund_name_list)):
|
for i in range(len(copy_name_list)):
|
||||||
temp_splits = copy_fund_name_list[i].split()
|
temp_splits = copy_name_list[i].split()
|
||||||
copy_fund_name_list[i] = ' '.join([split for split in temp_splits
|
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||||
if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
|
if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
|
||||||
final_splits = []
|
final_splits = []
|
||||||
for split in new_splits:
|
for split in new_splits:
|
||||||
|
|
@ -79,38 +127,72 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
||||||
|
|
||||||
text = ' '.join(final_splits)
|
text = ' '.join(final_splits)
|
||||||
max_similarity = 0
|
max_similarity = 0
|
||||||
max_similarity_fund_name = None
|
max_similarity_full_name = None
|
||||||
text = remove_special_characters(text)
|
text = remove_special_characters(text)
|
||||||
text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
|
text, copy_name_list = update_for_currency(text, copy_name_list)
|
||||||
for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
|
text_currencty = get_currency_from_text(text)
|
||||||
copy_fund_name = remove_special_characters(copy_fund_name)
|
text_feature = get_share_feature_from_text(text)
|
||||||
copy_fund_name = split_words_without_space(copy_fund_name)
|
for full_name, copy_name in zip(name_list , copy_name_list):
|
||||||
|
copy_name = remove_special_characters(copy_name)
|
||||||
|
copy_name = split_words_without_space(copy_name)
|
||||||
similarity = get_jacard_similarity(text,
|
similarity = get_jacard_similarity(text,
|
||||||
copy_fund_name,
|
copy_name,
|
||||||
need_remove_numeric_characters=False)
|
need_remove_numeric_characters=False)
|
||||||
|
copy_name_2 = replace_abbrevation(copy_name)
|
||||||
|
if copy_name != copy_name_2:
|
||||||
|
similarity_2 = get_jacard_similarity(text,
|
||||||
|
copy_name_2,
|
||||||
|
need_remove_numeric_characters=False)
|
||||||
|
if similarity_2 > similarity:
|
||||||
|
similarity = similarity_2
|
||||||
if similarity > max_similarity:
|
if similarity > max_similarity:
|
||||||
|
copy_name_currency = get_currency_from_text(copy_name)
|
||||||
|
if text_currencty is not None and copy_name_currency is not None:
|
||||||
|
if text_currencty != copy_name_currency:
|
||||||
|
continue
|
||||||
|
copy_name_feature = get_share_feature_from_text(copy_name)
|
||||||
|
if text_feature is not None and copy_name_feature is not None:
|
||||||
|
if text_feature != copy_name_feature:
|
||||||
|
continue
|
||||||
max_similarity = similarity
|
max_similarity = similarity
|
||||||
max_similarity_fund_name = fund_name
|
max_similarity_full_name = full_name
|
||||||
if max_similarity == 1:
|
if max_similarity == 1:
|
||||||
break
|
break
|
||||||
if max_similarity < 0.35:
|
if max_similarity < 0.35:
|
||||||
return None, max_similarity
|
return None, max_similarity
|
||||||
return max_similarity_fund_name, max_similarity
|
return max_similarity_full_name, max_similarity
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
print_exc()
|
print_exc()
|
||||||
return None, 0.0
|
return None, 0.0
|
||||||
|
|
||||||
|
def get_share_feature_from_text(text: str):
    """
    Find the last share feature keyword that appears in text.

    The text is stripped, lower-cased and split on whitespace. Tokens are
    scanned from the end of the text towards the start, and the first token
    equal to a lower-cased entry of the module-level share_features list
    is returned.

    Returns the matched token in lower case, or None when text is None,
    blank, or holds no feature keyword.
    """
    if text is None:
        return None
    tokens = text.strip().lower().split()
    if not tokens:
        # blank input: nothing to inspect
        return None
    known_features = {feature.lower() for feature in share_features}
    # reversed scan so the right-most keyword wins
    return next(
        (token for token in reversed(tokens) if token in known_features),
        None,
    )
|
||||||
|
|
||||||
|
def get_currency_from_text(text: str):
    """
    Find the last currency code that appears in text.

    The text is stripped, lower-cased and split on whitespace. Tokens are
    scanned from the end of the text towards the start, and the first token
    whose upper-cased form is in the module-level total_currency_list is
    returned.

    Returns the matched token in lower case (as it appears after the text
    was lower-cased), or None when text is None, blank, or holds no
    currency code.
    """
    if text is None:
        return None
    # a blank text yields no tokens, so the loop falls through to None
    for token in reversed(text.strip().lower().split()):
        if token.upper() in total_currency_list:
            return token
    return None
|
||||||
|
|
||||||
|
|
||||||
def update_for_currency(text: str, compare_list: list):
|
def update_for_currency(text: str, compare_list: list):
|
||||||
text_split = text.split()
|
text_split = text.split()
|
||||||
with_currency = False
|
with_currency = False
|
||||||
total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
|
|
||||||
'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
|
|
||||||
'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
|
|
||||||
'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
|
|
||||||
'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
|
|
||||||
'ETB', 'DZD', 'XEU', 'XFO']
|
|
||||||
for split in text_split:
|
for split in text_split:
|
||||||
if split.upper() in total_currency_list:
|
if split.upper() in total_currency_list:
|
||||||
with_currency = True
|
with_currency = True
|
||||||
|
|
@ -198,6 +280,16 @@ def remove_common_word(text_list: list):
|
||||||
else:
|
else:
|
||||||
common_word_list = list(
|
common_word_list = list(
|
||||||
set(common_word_list).intersection(set(new_text_splits_list[j])))
|
set(common_word_list).intersection(set(new_text_splits_list[j])))
|
||||||
|
|
||||||
|
remove_list = []
|
||||||
|
# if exists the share name and currency name, remove from the list
|
||||||
|
for word in common_word_list:
|
||||||
|
if word.upper() in total_currency_list:
|
||||||
|
remove_list.append(word)
|
||||||
|
for remove in remove_list:
|
||||||
|
if remove in common_word_list:
|
||||||
|
common_word_list.remove(remove)
|
||||||
|
|
||||||
common_word_list = list(set(common_word_list))
|
common_word_list = list(set(common_word_list))
|
||||||
for i in range(len(new_text_splits_list)):
|
for i in range(len(new_text_splits_list)):
|
||||||
for common_word in common_word_list:
|
for common_word in common_word_list:
|
||||||
|
|
@ -219,12 +311,22 @@ def split_words_without_space(text: str):
|
||||||
# if len(splits) > 1:
|
# if len(splits) > 1:
|
||||||
# return text
|
# return text
|
||||||
# find all words with capital letter + lower letter
|
# find all words with capital letter + lower letter
|
||||||
regex = r'[A-Z][a-z]+'
|
regex = r"[A-Z][a-z]+"
|
||||||
|
regex2 = r"[A-Z]{2,}[a-z]+"
|
||||||
word_list = re.findall(regex, text)
|
word_list = re.findall(regex, text)
|
||||||
|
word_list2 = re.findall(regex2, text)
|
||||||
if len(word_list) > 0:
|
if len(word_list) > 0:
|
||||||
for word in word_list:
|
for word in word_list:
|
||||||
text = text.replace(word, ' ' + word + ' ')
|
if len(word_list2) > 0:
|
||||||
text = re.sub(r'(\s)+', ' ', text)
|
word_exists_in_word2 = False
|
||||||
|
for word2 in word_list2:
|
||||||
|
if word in word2:
|
||||||
|
word_exists_in_word2 = True
|
||||||
|
break
|
||||||
|
if word_exists_in_word2:
|
||||||
|
continue
|
||||||
|
text = text.replace(word, " " + word + " ")
|
||||||
|
text = re.sub(r"(\s)+", " ", text)
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -332,6 +434,8 @@ def replace_abbrevation(text: str):
|
||||||
text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
|
text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
|
||||||
elif 'swedish kronor' in text.lower():
|
elif 'swedish kronor' in text.lower():
|
||||||
text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
|
text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
|
||||||
|
elif "GPB" in text.split():
|
||||||
|
text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
|
||||||
elif 'sterling' in text.lower().split():
|
elif 'sterling' in text.lower().split():
|
||||||
text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
|
text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
|
||||||
elif 'euro' in text.lower().split():
|
elif 'euro' in text.lower().split():
|
||||||
|
|
@ -342,7 +446,7 @@ def replace_abbrevation(text: str):
|
||||||
text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
|
text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
|
||||||
elif '£' in text.lower().split():
|
elif '£' in text.lower().split():
|
||||||
text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
|
text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
|
||||||
elif 'RMB' in text.lower().split():
|
elif 'RMB' in text.split():
|
||||||
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
|
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue