Optimize the mapping algorithm; this is the fixed version for confirming mapping metrics

This commit is contained in:
Blade He 2024-09-27 09:25:11 -05:00
parent 7eba9a52ae
commit 0c4c541319
2 changed files with 138 additions and 37 deletions

13
main.py
View File

@ -564,10 +564,8 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "344636875" doc_id = "481475385"
raw_fund_name = "" raw_name = "Emerging Markets Fund Y-DIST Shares (USD)"
raw_share_name = ""
raw_name = "Aberdeen Standard Alpha Global Loans I QInc USD"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -578,7 +576,7 @@ def test_mapping_raw_name():
) )
mapping_info = data_mapping.matching_with_database( mapping_info = data_mapping.matching_with_database(
raw_name=raw_name, raw_name=raw_name,
parent_id="FS0000DA0E", parent_id=None,
matching_type="share" matching_type="share"
) )
print(mapping_info) print(mapping_info)
@ -657,16 +655,15 @@ if __name__ == "__main__":
"486378555", "486378555",
"506559375", "506559375",
"479793787", "479793787",
"333207452",
"471641628", "471641628",
] ]
special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_mapping_doc_id_list
# special_doc_id_list = ["402181770"] special_doc_id_list = ["402113224"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = True force_save_total_data = False
extract_ways = ["text"] extract_ways = ["text"]
for extract_way in extract_ways: for extract_way in extract_ways:

View File

@ -2,6 +2,54 @@ import re
from copy import deepcopy from copy import deepcopy
from traceback import print_exc from traceback import print_exc
# Upper-case currency codes. Callers upper-case a word before looking it up
# here, so every entry must stay in upper case.
total_currency_list = [
    "USD", "EUR", "AUD", "JPY", "CHF", "GBP", "SEK", "CNY",
    "NZD", "CNH", "NOK", "SGD", "HKD", "ZAR", "PLN", "CAD",
    "CZK", "HUF", "DKK", "BRL", "SKK", "RON", "TRY", "BGN",
    "CUP", "MXN", "CLF", "XCD", "ISK", "IDR", "MNT", "AED",
    "AFN", "INR", "ESP", "RUB", "CLP", "KRW", "ETB", "DZD",
    "XEU", "XFO",
]

# Share-class feature words. They are lower-cased before being compared with
# the (lower-cased) words of a name.
share_features = [
    "Accumulation",
    "Income",
    "Distribution",
    "Investor",
    "Institutional",
    "Capitalisation",
    "Admin",
    "Advantage",
]
def add_slash_to_text_as_regex(text: str): def add_slash_to_text_as_regex(text: str):
if text is None or len(text) == 0: if text is None or len(text) == 0:
return text return text
@ -29,18 +77,18 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
Get the most similar fund name from fund_name_list by jacard similarity Get the most similar fund name from fund_name_list by jacard similarity
""" """
try: try:
copy_fund_name_list = deepcopy(name_list) copy_name_list = deepcopy(name_list)
if text is None or len(text.split()) == 0 or \ if text is None or len(text.split()) == 0 or \
copy_fund_name_list is None or len(copy_fund_name_list) == 0: copy_name_list is None or len(copy_name_list) == 0:
return None, None return None, None
copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name copy_name_list = [replace_abbrevation(copy_name) for copy_name
in copy_fund_name_list] in copy_name_list]
# get common words in fund_name_list # get common words in fund_name_list
common_word_list = [] common_word_list = []
if len(name_list) > 1: if len(name_list) > 1:
_, common_word_list = remove_common_word(copy_fund_name_list) _, common_word_list = remove_common_word(copy_name_list)
if pre_common_word_list is not None and len(pre_common_word_list) > 0: if pre_common_word_list is not None and len(pre_common_word_list) > 0:
common_word_list.extend([word for word in pre_common_word_list common_word_list.extend([word for word in pre_common_word_list
if word not in common_word_list]) if word not in common_word_list])
@ -63,14 +111,14 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
for word in common_word_list: for word in common_word_list:
if word not in lower_new_splits: if word not in lower_new_splits:
# remove word in fund_name_list # remove word in fund_name_list
for i in range(len(copy_fund_name_list)): for i in range(len(copy_name_list)):
temp_splits = copy_fund_name_list[i].split() temp_splits = copy_name_list[i].split()
copy_fund_name_list[i] = ' '.join([split for split in temp_splits copy_name_list[i] = ' '.join([split for split in temp_splits
if remove_special_characters(split).lower() != word]) if remove_special_characters(split).lower() != word])
for i in range(len(copy_fund_name_list)): for i in range(len(copy_name_list)):
temp_splits = copy_fund_name_list[i].split() temp_splits = copy_name_list[i].split()
copy_fund_name_list[i] = ' '.join([split for split in temp_splits copy_name_list[i] = ' '.join([split for split in temp_splits
if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']]) if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
final_splits = [] final_splits = []
for split in new_splits: for split in new_splits:
@ -79,38 +127,72 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
text = ' '.join(final_splits) text = ' '.join(final_splits)
max_similarity = 0 max_similarity = 0
max_similarity_fund_name = None max_similarity_full_name = None
text = remove_special_characters(text) text = remove_special_characters(text)
text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list) text, copy_name_list = update_for_currency(text, copy_name_list)
for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list): text_currencty = get_currency_from_text(text)
copy_fund_name = remove_special_characters(copy_fund_name) text_feature = get_share_feature_from_text(text)
copy_fund_name = split_words_without_space(copy_fund_name) for full_name, copy_name in zip(name_list , copy_name_list):
copy_name = remove_special_characters(copy_name)
copy_name = split_words_without_space(copy_name)
similarity = get_jacard_similarity(text, similarity = get_jacard_similarity(text,
copy_fund_name, copy_name,
need_remove_numeric_characters=False) need_remove_numeric_characters=False)
copy_name_2 = replace_abbrevation(copy_name)
if copy_name != copy_name_2:
similarity_2 = get_jacard_similarity(text,
copy_name_2,
need_remove_numeric_characters=False)
if similarity_2 > similarity:
similarity = similarity_2
if similarity > max_similarity: if similarity > max_similarity:
copy_name_currency = get_currency_from_text(copy_name)
if text_currencty is not None and copy_name_currency is not None:
if text_currencty != copy_name_currency:
continue
copy_name_feature = get_share_feature_from_text(copy_name)
if text_feature is not None and copy_name_feature is not None:
if text_feature != copy_name_feature:
continue
max_similarity = similarity max_similarity = similarity
max_similarity_fund_name = fund_name max_similarity_full_name = full_name
if max_similarity == 1: if max_similarity == 1:
break break
if max_similarity < 0.35: if max_similarity < 0.35:
return None, max_similarity return None, max_similarity
return max_similarity_fund_name, max_similarity return max_similarity_full_name, max_similarity
except Exception as e: except Exception as e:
print(e) print(e)
print_exc() print_exc()
return None, 0.0 return None, 0.0
def get_share_feature_from_text(text: str):
    """
    Return the last word of text that is a known share feature.

    The text is stripped, lower-cased and split on whitespace; the words are
    then scanned from the end towards the start and the first one found in
    the lower-cased share_features list is returned (in lower case).
    Returns None when text is None, blank, or contains no share feature word.
    """
    if text is None or not text.strip():
        return None
    known_features = [feature.lower() for feature in share_features]
    lowered_words = text.strip().lower().split()
    # Scan right-to-left so the feature closest to the end of the name wins.
    return next(
        (word for word in reversed(lowered_words) if word in known_features),
        None,
    )
def get_currency_from_text(text: str):
    """
    Return the last word of text that is a known currency code.

    The text is stripped, lower-cased and split on whitespace; the words are
    scanned from the end towards the start and the first one whose upper-case
    form is in total_currency_list is returned.
    Note that the returned word is the lower-cased form (e.g. "usd").
    Returns None when text is None, blank, or contains no currency code.
    """
    if text is None or not text.strip():
        return None
    lowered_words = text.strip().lower().split()
    # Scan right-to-left so the currency closest to the end of the name wins.
    return next(
        (
            word
            for word in reversed(lowered_words)
            if word.upper() in total_currency_list
        ),
        None,
    )
def update_for_currency(text: str, compare_list: list): def update_for_currency(text: str, compare_list: list):
text_split = text.split() text_split = text.split()
with_currency = False with_currency = False
total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
'ETB', 'DZD', 'XEU', 'XFO']
for split in text_split: for split in text_split:
if split.upper() in total_currency_list: if split.upper() in total_currency_list:
with_currency = True with_currency = True
@ -198,6 +280,16 @@ def remove_common_word(text_list: list):
else: else:
common_word_list = list( common_word_list = list(
set(common_word_list).intersection(set(new_text_splits_list[j]))) set(common_word_list).intersection(set(new_text_splits_list[j])))
remove_list = []
# if exists the share name and currency name, remove from the list
for word in common_word_list:
if word.upper() in total_currency_list:
remove_list.append(word)
for remove in remove_list:
if remove in common_word_list:
common_word_list.remove(remove)
common_word_list = list(set(common_word_list)) common_word_list = list(set(common_word_list))
for i in range(len(new_text_splits_list)): for i in range(len(new_text_splits_list)):
for common_word in common_word_list: for common_word in common_word_list:
@ -219,12 +311,22 @@ def split_words_without_space(text: str):
# if len(splits) > 1: # if len(splits) > 1:
# return text # return text
# find all words with capital letter + lower letter # find all words with capital letter + lower letter
regex = r'[A-Z][a-z]+' regex = r"[A-Z][a-z]+"
regex2 = r"[A-Z]{2,}[a-z]+"
word_list = re.findall(regex, text) word_list = re.findall(regex, text)
word_list2 = re.findall(regex2, text)
if len(word_list) > 0: if len(word_list) > 0:
for word in word_list: for word in word_list:
text = text.replace(word, ' ' + word + ' ') if len(word_list2) > 0:
text = re.sub(r'(\s)+', ' ', text) word_exists_in_word2 = False
for word2 in word_list2:
if word in word2:
word_exists_in_word2 = True
break
if word_exists_in_word2:
continue
text = text.replace(word, " " + word + " ")
text = re.sub(r"(\s)+", " ", text)
return text.strip() return text.strip()
@ -332,6 +434,8 @@ def replace_abbrevation(text: str):
text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE) text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
elif 'swedish kronor' in text.lower(): elif 'swedish kronor' in text.lower():
text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE) text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
elif "GPB" in text.split():
text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
elif 'sterling' in text.lower().split(): elif 'sterling' in text.lower().split():
text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE) text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
elif 'euro' in text.lower().split(): elif 'euro' in text.lower().split():
@ -342,7 +446,7 @@ def replace_abbrevation(text: str):
text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE) text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
elif '£' in text.lower().split(): elif '£' in text.lower().split():
text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE) text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
elif 'RMB' in text.lower().split(): elif 'RMB' in text.split():
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE) text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
else: else:
pass pass