Optimize mapping algorithm:
For multiple currencies in the fund/share name, if USD is one of them, remove it. Fix the issue with splitting words that have no space between them. If there is no currency in the share class name, try to get the currency from the document mapping entry that has the same fund name and the same short share class name.
This commit is contained in:
parent
f06355e0c8
commit
3bb13947af
12
main.py
12
main.py
|
|
@ -574,9 +574,9 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "481475385"
|
doc_id = "389171486"
|
||||||
raw_name = "Emerging Markets Fund A-ACC Shares USD"
|
raw_name = "Nordea 2 Emerging Market Local Debt Enhanced Fund Y - Shares"
|
||||||
raw_share_name = "A-ACC Shares USD"
|
raw_share_name = "Y - Shares"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -589,7 +589,7 @@ def test_mapping_raw_name():
|
||||||
mapping_info = data_mapping.matching_with_database(
|
mapping_info = data_mapping.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
raw_share_name=raw_share_name,
|
raw_share_name=raw_share_name,
|
||||||
parent_id=None,
|
parent_id="FS00009Q8R",
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
)
|
)
|
||||||
|
|
@ -705,12 +705,12 @@ if __name__ == "__main__":
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["481475385"]
|
special_doc_id_list = ["483617247"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
|
|
|
||||||
|
|
@ -127,7 +127,7 @@ def get_most_similar_name(text: str,
|
||||||
new_splits.extend(split_words_without_space(split).split())
|
new_splits.extend(split_words_without_space(split).split())
|
||||||
else:
|
else:
|
||||||
new_splits.append(split)
|
new_splits.append(split)
|
||||||
|
text = ' '.join(new_splits)
|
||||||
lower_new_splits = [split.lower() for split in new_splits]
|
lower_new_splits = [split.lower() for split in new_splits]
|
||||||
for word in common_word_list:
|
for word in common_word_list:
|
||||||
if word not in lower_new_splits:
|
if word not in lower_new_splits:
|
||||||
|
|
@ -136,30 +136,35 @@ def get_most_similar_name(text: str,
|
||||||
temp_splits = copy_name_list[i].split()
|
temp_splits = copy_name_list[i].split()
|
||||||
copy_name_list[i] = ' '.join([split for split in temp_splits
|
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||||
if remove_special_characters(split).lower() != word])
|
if remove_special_characters(split).lower() != word])
|
||||||
final_splits = []
|
|
||||||
for split in new_splits:
|
|
||||||
if split.lower() not in ['fund', "funds", 'portfolio',
|
|
||||||
'class', 'classes',
|
|
||||||
'share', 'shares']:
|
|
||||||
final_splits.append(split)
|
|
||||||
|
|
||||||
text = ' '.join(final_splits)
|
|
||||||
|
|
||||||
copy_share_name_list = get_share_part_list(copy_name_list)
|
|
||||||
|
|
||||||
for i in range(len(copy_name_list)):
|
|
||||||
temp_splits = copy_name_list[i].split()
|
|
||||||
copy_name_list[i] = ' '.join([split for split in temp_splits
|
|
||||||
if remove_special_characters(split).lower()
|
|
||||||
not in ['fund', "funds", 'portfolio',
|
|
||||||
'class', 'classes',
|
|
||||||
'share', 'shares']])
|
|
||||||
max_similarity = 0
|
max_similarity = 0
|
||||||
max_similarity_full_name = None
|
max_similarity_full_name = None
|
||||||
text = remove_special_characters(text)
|
text = remove_special_characters(text)
|
||||||
|
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
|
text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
|
||||||
|
|
||||||
|
text = ' '.join([split for split in text.split()
|
||||||
|
if split.lower() not in ['fund', "funds", 'portfolio',
|
||||||
|
'bond', 'bonds',
|
||||||
|
'class', 'classes',
|
||||||
|
'share', 'shares']])
|
||||||
|
if share_name is not None:
|
||||||
|
share_name = ' '.join([split for split in share_name.split()
|
||||||
|
if split.lower() not in ['fund', "funds", 'portfolio',
|
||||||
|
'bond', 'bonds',
|
||||||
|
'class', 'classes',
|
||||||
|
'share', 'shares']])
|
||||||
|
|
||||||
|
copy_share_name_list = get_share_part_list(copy_name_list)
|
||||||
|
for i in range(len(copy_name_list)):
|
||||||
|
temp_splits = copy_name_list[i].split()
|
||||||
|
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||||
|
if remove_special_characters(split).lower()
|
||||||
|
not in ['fund', "funds", 'portfolio',
|
||||||
|
'bond', 'bonds',
|
||||||
|
'class', 'classes',
|
||||||
|
'share', 'shares']])
|
||||||
text_currency = None
|
text_currency = None
|
||||||
text_feature = None
|
text_feature = None
|
||||||
text_share_short_name = None
|
text_share_short_name = None
|
||||||
|
|
@ -192,11 +197,18 @@ def get_most_similar_name(text: str,
|
||||||
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
||||||
same_max_similarity_name_list = []
|
same_max_similarity_name_list = []
|
||||||
for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list):
|
for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list):
|
||||||
|
if not isinstance(copy_name, str) or len(copy_name.strip()) == 0:
|
||||||
|
continue
|
||||||
copy_name = remove_special_characters(copy_name)
|
copy_name = remove_special_characters(copy_name)
|
||||||
copy_name = split_words_without_space(copy_name)
|
copy_name = split_words_without_space(copy_name)
|
||||||
similarity = get_jacard_similarity(text,
|
try:
|
||||||
copy_name,
|
similarity = get_jacard_similarity(text,
|
||||||
need_remove_numeric_characters=False)
|
copy_name,
|
||||||
|
need_remove_numeric_characters=False)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
print_exc()
|
||||||
|
similarity = 0
|
||||||
copy_name_2 = replace_abbrevation(copy_name)
|
copy_name_2 = replace_abbrevation(copy_name)
|
||||||
if copy_name != copy_name_2:
|
if copy_name != copy_name_2:
|
||||||
similarity_2 = get_jacard_similarity(text,
|
similarity_2 = get_jacard_similarity(text,
|
||||||
|
|
@ -356,14 +368,16 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
else:
|
else:
|
||||||
without_currency_list.append(index)
|
without_currency_list.append(index)
|
||||||
if not with_currency and len(with_currency_list) == 0:
|
if not with_currency and len(with_currency_list) == 0:
|
||||||
return text, share_name, compare_list
|
pass
|
||||||
elif not with_currency and len(with_currency_list) > 0:
|
elif not with_currency and len(with_currency_list) > 0:
|
||||||
last_split = text_split[-1]
|
share_short_name = ""
|
||||||
|
if share_name is not None and len(share_name.strip()) > 0:
|
||||||
|
share_short_name = get_share_short_name_from_text(share_name)
|
||||||
updated = False
|
updated = False
|
||||||
if len(last_split) < 4 and last_split.upper() == last_split:
|
if len(share_short_name) < 4 and share_short_name.upper() == share_short_name:
|
||||||
if len(without_currency_list) > 0:
|
if len(without_currency_list) > 0:
|
||||||
for index in without_currency_list:
|
for index in without_currency_list:
|
||||||
if last_split in compare_list[index].split():
|
if share_short_name in compare_list[index].split():
|
||||||
text = text + ' ' + 'USD'
|
text = text + ' ' + 'USD'
|
||||||
if share_name is not None:
|
if share_name is not None:
|
||||||
share_name = share_name + ' ' + 'USD'
|
share_name = share_name + ' ' + 'USD'
|
||||||
|
|
@ -373,7 +387,7 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
currency_list = []
|
currency_list = []
|
||||||
for index in with_currency_list:
|
for index in with_currency_list:
|
||||||
compare_split = compare_list[index].split()
|
compare_split = compare_list[index].split()
|
||||||
if last_split in compare_split:
|
if share_short_name in compare_split:
|
||||||
current_currency_list = [split for split in compare_split
|
current_currency_list = [split for split in compare_split
|
||||||
if split.upper() in total_currency_list]
|
if split.upper() in total_currency_list]
|
||||||
if len(current_currency_list) > 0:
|
if len(current_currency_list) > 0:
|
||||||
|
|
@ -391,13 +405,40 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
text = text + ' ' + 'USD'
|
text = text + ' ' + 'USD'
|
||||||
if share_name is not None:
|
if share_name is not None:
|
||||||
share_name = share_name + ' ' + 'USD'
|
share_name = share_name + ' ' + 'USD'
|
||||||
return text, share_name, compare_list
|
# return text, share_name, compare_list
|
||||||
elif with_currency and len(without_currency_list) == 0:
|
elif with_currency and len(without_currency_list) == 0:
|
||||||
for index in without_currency_list:
|
for index in without_currency_list:
|
||||||
compare_list[index] = compare_list[index] + ' ' + 'USD'
|
compare_list[index] = compare_list[index] + ' ' + 'USD'
|
||||||
return text, share_name, compare_list
|
# return text, share_name, compare_list
|
||||||
else:
|
else:
|
||||||
return text, share_name, compare_list
|
# return text, share_name, compare_list
|
||||||
|
pass
|
||||||
|
if with_currency:
|
||||||
|
share_name_split = share_name.split()
|
||||||
|
share_name_currency_list = []
|
||||||
|
for split in share_name_split:
|
||||||
|
if split.upper() in total_currency_list and split.upper() not in share_name_currency_list:
|
||||||
|
share_name_currency_list.append(split)
|
||||||
|
if len(share_name_currency_list) > 1 and 'USD' in share_name_currency_list:
|
||||||
|
new_share_name = ' '.join([split for split in share_name_split if split.upper() != 'USD'])
|
||||||
|
if share_name in text:
|
||||||
|
text = text.replace(share_name, new_share_name)
|
||||||
|
else:
|
||||||
|
text = ' '.join([split for split in text.split() if split.upper() != 'USD'])
|
||||||
|
share_name = new_share_name
|
||||||
|
for c_i in range(len(compare_list)):
|
||||||
|
compare = compare_list[c_i]
|
||||||
|
compare_share_part = get_share_part_list([compare])[0]
|
||||||
|
compare_share_part_split = compare_share_part.split()
|
||||||
|
compare_share_part_currency_list = []
|
||||||
|
for split in compare_share_part_split:
|
||||||
|
if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list:
|
||||||
|
compare_share_part_currency_list.append(split)
|
||||||
|
if len(compare_share_part_currency_list) > 1 and 'USD' in compare_share_part_currency_list:
|
||||||
|
compare_share_part_split = [split for split in compare_share_part_split if split.upper() != 'USD']
|
||||||
|
new_compare_share_part = ' '.join(compare_share_part_split)
|
||||||
|
compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part)
|
||||||
|
return text, share_name, compare_list
|
||||||
|
|
||||||
|
|
||||||
def remove_common_word(text_list: list):
|
def remove_common_word(text_list: list):
|
||||||
|
|
@ -625,7 +666,7 @@ def replace_abbrevation(text: str):
|
||||||
new_text_splits.append('Accumulation')
|
new_text_splits.append('Accumulation')
|
||||||
elif split.lower() in ['inc', 'inc.']:
|
elif split.lower() in ['inc', 'inc.']:
|
||||||
new_text_splits.append('Income')
|
new_text_splits.append('Income')
|
||||||
elif split.lower() in ['dist', 'dist.']:
|
elif split.lower() in ['dist', 'dist.', 'dis', 'dis.']:
|
||||||
new_text_splits.append('Distribution')
|
new_text_splits.append('Distribution')
|
||||||
elif split.lower() in ['inv', 'inv.']:
|
elif split.lower() in ['inv', 'inv.']:
|
||||||
new_text_splits.append('Investor')
|
new_text_splits.append('Investor')
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue