Optimize mapping algorithm:

When a fund/share name contains multiple currencies and USD is one of them, remove USD.
Fix the issue with splitting words that are written without spaces.
If the share class name contains no currency, try to take the currency from a document mapping entry that has the same fund name and the same short share class name.
This commit is contained in:
Blade He 2024-10-02 13:25:08 -05:00
parent f06355e0c8
commit 3bb13947af
2 changed files with 77 additions and 36 deletions

12
main.py
View File

@ -574,9 +574,9 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "481475385" doc_id = "389171486"
raw_name = "Emerging Markets Fund A-ACC Shares USD" raw_name = "Nordea 2 Emerging Market Local Debt Enhanced Fund Y - Shares"
raw_share_name = "A-ACC Shares USD" raw_share_name = "Y - Shares"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -589,7 +589,7 @@ def test_mapping_raw_name():
mapping_info = data_mapping.matching_with_database( mapping_info = data_mapping.matching_with_database(
raw_name=raw_name, raw_name=raw_name,
raw_share_name=raw_share_name, raw_share_name=raw_share_name,
parent_id=None, parent_id="FS00009Q8R",
matching_type="share", matching_type="share",
process_cache=process_cache process_cache=process_cache
) )
@ -705,12 +705,12 @@ if __name__ == "__main__":
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["481475385"] special_doc_id_list = ["483617247"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = True force_save_total_data = False
extract_ways = ["text"] extract_ways = ["text"]
for extract_way in extract_ways: for extract_way in extract_ways:

View File

@ -127,7 +127,7 @@ def get_most_similar_name(text: str,
new_splits.extend(split_words_without_space(split).split()) new_splits.extend(split_words_without_space(split).split())
else: else:
new_splits.append(split) new_splits.append(split)
text = ' '.join(new_splits)
lower_new_splits = [split.lower() for split in new_splits] lower_new_splits = [split.lower() for split in new_splits]
for word in common_word_list: for word in common_word_list:
if word not in lower_new_splits: if word not in lower_new_splits:
@ -136,30 +136,35 @@ def get_most_similar_name(text: str,
temp_splits = copy_name_list[i].split() temp_splits = copy_name_list[i].split()
copy_name_list[i] = ' '.join([split for split in temp_splits copy_name_list[i] = ' '.join([split for split in temp_splits
if remove_special_characters(split).lower() != word]) if remove_special_characters(split).lower() != word])
final_splits = []
for split in new_splits:
if split.lower() not in ['fund', "funds", 'portfolio',
'class', 'classes',
'share', 'shares']:
final_splits.append(split)
text = ' '.join(final_splits)
copy_share_name_list = get_share_part_list(copy_name_list)
for i in range(len(copy_name_list)):
temp_splits = copy_name_list[i].split()
copy_name_list[i] = ' '.join([split for split in temp_splits
if remove_special_characters(split).lower()
not in ['fund', "funds", 'portfolio',
'class', 'classes',
'share', 'shares']])
max_similarity = 0 max_similarity = 0
max_similarity_full_name = None max_similarity_full_name = None
text = remove_special_characters(text) text = remove_special_characters(text)
if matching_type == "share": if matching_type == "share":
text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list) text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
text = ' '.join([split for split in text.split()
if split.lower() not in ['fund', "funds", 'portfolio',
'bond', 'bonds',
'class', 'classes',
'share', 'shares']])
if share_name is not None:
share_name = ' '.join([split for split in share_name.split()
if split.lower() not in ['fund', "funds", 'portfolio',
'bond', 'bonds',
'class', 'classes',
'share', 'shares']])
copy_share_name_list = get_share_part_list(copy_name_list)
for i in range(len(copy_name_list)):
temp_splits = copy_name_list[i].split()
copy_name_list[i] = ' '.join([split for split in temp_splits
if remove_special_characters(split).lower()
not in ['fund', "funds", 'portfolio',
'bond', 'bonds',
'class', 'classes',
'share', 'shares']])
text_currency = None text_currency = None
text_feature = None text_feature = None
text_share_short_name = None text_share_short_name = None
@ -192,11 +197,18 @@ def get_most_similar_name(text: str,
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}") # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
same_max_similarity_name_list = [] same_max_similarity_name_list = []
for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list): for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list):
if not isinstance(copy_name, str) or len(copy_name.strip()) == 0:
continue
copy_name = remove_special_characters(copy_name) copy_name = remove_special_characters(copy_name)
copy_name = split_words_without_space(copy_name) copy_name = split_words_without_space(copy_name)
try:
similarity = get_jacard_similarity(text, similarity = get_jacard_similarity(text,
copy_name, copy_name,
need_remove_numeric_characters=False) need_remove_numeric_characters=False)
except Exception as e:
print(e)
print_exc()
similarity = 0
copy_name_2 = replace_abbrevation(copy_name) copy_name_2 = replace_abbrevation(copy_name)
if copy_name != copy_name_2: if copy_name != copy_name_2:
similarity_2 = get_jacard_similarity(text, similarity_2 = get_jacard_similarity(text,
@ -356,14 +368,16 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
else: else:
without_currency_list.append(index) without_currency_list.append(index)
if not with_currency and len(with_currency_list) == 0: if not with_currency and len(with_currency_list) == 0:
return text, share_name, compare_list pass
elif not with_currency and len(with_currency_list) > 0: elif not with_currency and len(with_currency_list) > 0:
last_split = text_split[-1] share_short_name = ""
if share_name is not None and len(share_name.strip()) > 0:
share_short_name = get_share_short_name_from_text(share_name)
updated = False updated = False
if len(last_split) < 4 and last_split.upper() == last_split: if len(share_short_name) < 4 and share_short_name.upper() == share_short_name:
if len(without_currency_list) > 0: if len(without_currency_list) > 0:
for index in without_currency_list: for index in without_currency_list:
if last_split in compare_list[index].split(): if share_short_name in compare_list[index].split():
text = text + ' ' + 'USD' text = text + ' ' + 'USD'
if share_name is not None: if share_name is not None:
share_name = share_name + ' ' + 'USD' share_name = share_name + ' ' + 'USD'
@ -373,7 +387,7 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
currency_list = [] currency_list = []
for index in with_currency_list: for index in with_currency_list:
compare_split = compare_list[index].split() compare_split = compare_list[index].split()
if last_split in compare_split: if share_short_name in compare_split:
current_currency_list = [split for split in compare_split current_currency_list = [split for split in compare_split
if split.upper() in total_currency_list] if split.upper() in total_currency_list]
if len(current_currency_list) > 0: if len(current_currency_list) > 0:
@ -391,12 +405,39 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
text = text + ' ' + 'USD' text = text + ' ' + 'USD'
if share_name is not None: if share_name is not None:
share_name = share_name + ' ' + 'USD' share_name = share_name + ' ' + 'USD'
return text, share_name, compare_list # return text, share_name, compare_list
elif with_currency and len(without_currency_list) == 0: elif with_currency and len(without_currency_list) == 0:
for index in without_currency_list: for index in without_currency_list:
compare_list[index] = compare_list[index] + ' ' + 'USD' compare_list[index] = compare_list[index] + ' ' + 'USD'
return text, share_name, compare_list # return text, share_name, compare_list
else: else:
# return text, share_name, compare_list
pass
if with_currency:
share_name_split = share_name.split()
share_name_currency_list = []
for split in share_name_split:
if split.upper() in total_currency_list and split.upper() not in share_name_currency_list:
share_name_currency_list.append(split)
if len(share_name_currency_list) > 1 and 'USD' in share_name_currency_list:
new_share_name = ' '.join([split for split in share_name_split if split.upper() != 'USD'])
if share_name in text:
text = text.replace(share_name, new_share_name)
else:
text = ' '.join([split for split in text.split() if split.upper() != 'USD'])
share_name = new_share_name
for c_i in range(len(compare_list)):
compare = compare_list[c_i]
compare_share_part = get_share_part_list([compare])[0]
compare_share_part_split = compare_share_part.split()
compare_share_part_currency_list = []
for split in compare_share_part_split:
if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list:
compare_share_part_currency_list.append(split)
if len(compare_share_part_currency_list) > 1 and 'USD' in compare_share_part_currency_list:
compare_share_part_split = [split for split in compare_share_part_split if split.upper() != 'USD']
new_compare_share_part = ' '.join(compare_share_part_split)
compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part)
return text, share_name, compare_list return text, share_name, compare_list
@ -625,7 +666,7 @@ def replace_abbrevation(text: str):
new_text_splits.append('Accumulation') new_text_splits.append('Accumulation')
elif split.lower() in ['inc', 'inc.']: elif split.lower() in ['inc', 'inc.']:
new_text_splits.append('Income') new_text_splits.append('Income')
elif split.lower() in ['dist', 'dist.']: elif split.lower() in ['dist', 'dist.', 'dis', 'dis.']:
new_text_splits.append('Distribution') new_text_splits.append('Distribution')
elif split.lower() in ['inv', 'inv.']: elif split.lower() in ['inv', 'inv.']:
new_text_splits.append('Investor') new_text_splits.append('Investor')