investment mapping: optimize for currency logic

This commit is contained in:
Blade He 2024-09-25 17:28:22 -05:00
parent dd6701f18c
commit 598e2ab820
3 changed files with 80 additions and 1 deletions

View File

@ -312,7 +312,10 @@ class DataMapping:
max_similarity_name, max_similarity = get_most_similar_name(
raw_name, provider_compare_name_list, pre_common_word_list=pre_common_word_list
)
if max_similarity is not None and max_similarity >= 0.5:
threshold = 0.7
if matching_type == "share":
threshold = 0.5
if max_similarity is not None and max_similarity >= threshold:
data_info["id"] = provider_compare_mapping[
provider_compare_mapping[compare_name_dp] == max_similarity_name
][compare_id_dp].values[0]

View File

@ -642,8 +642,10 @@ if __name__ == "__main__":
"508854243",
"520879048",
"463081566",
"389171486"
]
special_doc_id_list = check_mapping_doc_id_list
# special_doc_id_list = ["445256897"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False

View File

@ -1,5 +1,6 @@
import re
from copy import deepcopy
from traceback import print_exc
def add_slash_to_text_as_regex(text: str):
if text is None or len(text) == 0:
@ -79,6 +80,8 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
text = ' '.join(final_splits)
max_similarity = 0
max_similarity_fund_name = None
text = remove_special_characters(text)
text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
copy_fund_name = remove_special_characters(copy_fund_name)
copy_fund_name = split_words_without_space(copy_fund_name)
@ -95,9 +98,76 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
return max_similarity_fund_name, max_similarity
except Exception as e:
print(e)
print_exc()
return None, 0.0
# Currency codes recognised (case-insensitively) as standalone tokens in a name.
_CURRENCY_CODES = frozenset({
    'USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
    'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
    'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
    'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
    'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
    'ETB', 'DZD', 'XEU', 'XFO',
})

# Currency appended when one side of the comparison names a currency and the
# other side gives no usable hint.
_DEFAULT_CURRENCY = 'USD'


def _currency_tokens(name: str) -> list:
    """Return the whitespace-separated tokens of *name* that are currency codes."""
    return [token for token in name.split() if token.upper() in _CURRENCY_CODES]


def update_for_currency(text: str, compare_list: list):
    """Align currency tokens between *text* and the candidate names.

    A name "has a currency" when one of its whitespace-separated tokens is a
    known currency code (compared case-insensitively).

    * Neither *text* nor any candidate has a currency: nothing changes.
    * *text* has a currency: every candidate without one gets ``' USD'``
      appended.
    * *text* has no currency but at least one candidate does: a currency is
      appended to *text* and ``' USD'`` is appended to every candidate without
      one. The currency for *text* is chosen from its last token when that
      token is short (< 4 chars) and equal to its own upper-case form:
        - if a currency-less candidate contains that token, use ``USD``;
        - otherwise, if exactly one currency-bearing candidate contains that
          token, use that candidate's last currency token;
        - in every other case fall back to ``USD``.

    Args:
        text: The raw name being matched.
        compare_list: Candidate names. This list is modified in place.

    Returns:
        A ``(text, compare_list)`` tuple; ``compare_list`` is the same list
        object that was passed in.
    """
    text_split = text.split()
    if not text_split:
        # Empty / whitespace-only text has no last token to inspect; leave
        # both sides untouched instead of failing on text_split[-1].
        return text, compare_list

    with_currency = any(token.upper() in _CURRENCY_CODES for token in text_split)

    with_currency_list = []
    without_currency_list = []
    for index, compare in enumerate(compare_list):
        if _currency_tokens(compare):
            with_currency_list.append(index)
        else:
            without_currency_list.append(index)

    if with_currency:
        # NOTE(review): the previous condition here was
        # `len(without_currency_list) == 0`, which made this loop a no-op.
        # Confirm that defaulting currency-less candidates to USD is intended.
        for index in without_currency_list:
            compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY
        return text, compare_list

    if not with_currency_list:
        return text, compare_list

    # From here on: text has no currency, at least one candidate has one.
    last_split = text_split[-1]
    updated = False
    if len(last_split) < 4 and last_split.upper() == last_split:
        shared_with_plain_candidate = any(
            last_split in compare_list[index].split()
            for index in without_currency_list
        )
        if shared_with_plain_candidate:
            text = text + ' ' + _DEFAULT_CURRENCY
            updated = True
        else:
            # Collect the last currency token of each currency-bearing
            # candidate that also contains the text's last token.
            currency_list = [
                _currency_tokens(compare_list[index])[-1]
                for index in with_currency_list
                if last_split in compare_list[index].split()
            ]
            # Only trust the hint when it is unambiguous.
            if len(currency_list) == 1:
                text = text + ' ' + currency_list[0]
                updated = True

    for index in without_currency_list:
        compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY
    if not updated:
        text = text + ' ' + _DEFAULT_CURRENCY
    return text, compare_list
def remove_common_word(text_list: list):
if text_list is None or len(text_list) == 0:
return text_list
@ -268,6 +338,10 @@ def replace_abbrevation(text: str):
text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
elif '' in text.lower().split():
text = re.sub(r'\', 'EUR', text, flags=re.IGNORECASE)
elif '$' in text.lower().split():
text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
elif '£' in text.lower().split():
text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
elif 'RMB' in text.lower().split():
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
else: