investment mapping: optimize for currency logic
This commit is contained in:
parent
dd6701f18c
commit
598e2ab820
|
|
@ -312,7 +312,10 @@ class DataMapping:
|
|||
max_similarity_name, max_similarity = get_most_similar_name(
|
||||
raw_name, provider_compare_name_list, pre_common_word_list=pre_common_word_list
|
||||
)
|
||||
if max_similarity is not None and max_similarity >= 0.5:
|
||||
threshold = 0.7
|
||||
if matching_type == "share":
|
||||
threshold = 0.5
|
||||
if max_similarity is not None and max_similarity >= threshold:
|
||||
data_info["id"] = provider_compare_mapping[
|
||||
provider_compare_mapping[compare_name_dp] == max_similarity_name
|
||||
][compare_id_dp].values[0]
|
||||
|
|
|
|||
2
main.py
2
main.py
|
|
@ -642,8 +642,10 @@ if __name__ == "__main__":
|
|||
"508854243",
|
||||
"520879048",
|
||||
"463081566",
|
||||
"389171486"
|
||||
]
|
||||
special_doc_id_list = check_mapping_doc_id_list
|
||||
# special_doc_id_list = ["445256897"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import re
|
||||
from copy import deepcopy
|
||||
from traceback import print_exc
|
||||
|
||||
def add_slash_to_text_as_regex(text: str):
|
||||
if text is None or len(text) == 0:
|
||||
|
|
@ -79,6 +80,8 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
|||
text = ' '.join(final_splits)
|
||||
max_similarity = 0
|
||||
max_similarity_fund_name = None
|
||||
text = remove_special_characters(text)
|
||||
text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
|
||||
for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
|
||||
copy_fund_name = remove_special_characters(copy_fund_name)
|
||||
copy_fund_name = split_words_without_space(copy_fund_name)
|
||||
|
|
@ -95,9 +98,76 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
|
|||
return max_similarity_fund_name, max_similarity
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print_exc()
|
||||
return None, 0.0
|
||||
|
||||
|
||||
# Currency codes recognised as stand-alone tokens inside fund / share-class names.
# A frozenset gives O(1) membership tests instead of scanning a list per token.
_CURRENCY_CODES = frozenset({
    'USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
    'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
    'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
    'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
    'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
    'ETB', 'DZD', 'XEU', 'XFO',
})

# Currency appended when a name does not state one explicitly.
_DEFAULT_CURRENCY = 'USD'


def _contains_currency(name: str) -> bool:
    """Return True when any whitespace-separated token of *name* is a currency code."""
    return any(token.upper() in _CURRENCY_CODES for token in name.split())


def update_for_currency(text: str, compare_list: list) -> tuple:
    """Align currency tokens between *text* and the candidate names.

    Names that state a currency and names that do not are hard to compare
    fairly, so a currency token is appended where it is missing:

    * Neither *text* nor any candidate has a currency: nothing changes.
    * *text* has a currency: every candidate without one gets ``USD`` appended.
    * *text* has no currency but at least one candidate does: every candidate
      without a currency gets ``USD`` appended, and *text* gets a currency
      appended too. If the last token of *text* looks like a share-class
      marker (shorter than 4 characters and equal to its upper-cased form)
      and exactly one currency is found among the currency-bearing candidates
      containing that token, that currency is used; in every other case
      ``USD`` is used.

    Args:
        text: The name being matched.
        compare_list: Candidate names. Entries are updated IN PLACE, and the
            same list object is returned.

    Returns:
        A ``(text, compare_list)`` tuple with the adjusted values.
    """
    text_split = text.split()
    if not text_split:
        # Empty / whitespace-only text has no last token to inspect
        # (previously this raised IndexError on text_split[-1]).
        return text, compare_list

    with_currency = _contains_currency(text)

    # Partition candidate indexes by whether the candidate states a currency.
    with_currency_list = []
    without_currency_list = []
    for index, compare in enumerate(compare_list):
        if _contains_currency(compare):
            with_currency_list.append(index)
        else:
            without_currency_list.append(index)

    if with_currency:
        # The original guarded this loop with `len(without_currency_list) == 0`,
        # which made it dead code; candidates lacking a currency must get the
        # default one so they are comparable with a currency-bearing text.
        for index in without_currency_list:
            compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY
        return text, compare_list

    if not with_currency_list:
        # No currency anywhere: nothing to align.
        return text, compare_list

    # From here on: text has no currency, but at least one candidate does.
    last_split = text_split[-1]
    updated = False
    if len(last_split) < 4 and last_split.upper() == last_split:
        # A currency-less candidate with the same share-class marker means the
        # default currency is the right assumption for the text as well.
        if any(last_split in compare_list[index].split()
               for index in without_currency_list):
            text = text + ' ' + _DEFAULT_CURRENCY
            updated = True

        if not updated:
            # Otherwise collect the currency of each currency-bearing candidate
            # that contains the marker (last currency token wins per name).
            currency_list = []
            for index in with_currency_list:
                compare_split = compare_list[index].split()
                if last_split in compare_split:
                    current_currency_list = [
                        token for token in compare_split
                        if token.upper() in _CURRENCY_CODES
                    ]
                    if current_currency_list:
                        currency_list.append(current_currency_list[-1])
            # Only trust the inference when it is unambiguous.
            if len(currency_list) == 1:
                text = text + ' ' + currency_list[0]
                updated = True

    for index in without_currency_list:
        compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY

    if not updated:
        text = text + ' ' + _DEFAULT_CURRENCY
    return text, compare_list
|
||||
|
||||
|
||||
def remove_common_word(text_list: list):
|
||||
if text_list is None or len(text_list) == 0:
|
||||
return text_list
|
||||
|
|
@ -268,6 +338,10 @@ def replace_abbrevation(text: str):
|
|||
text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
|
||||
elif '€' in text.lower().split():
|
||||
text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE)
|
||||
elif '$' in text.lower().split():
|
||||
text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
|
||||
elif '£' in text.lower().split():
|
||||
text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
|
||||
elif 'RMB' in text.lower().split():
|
||||
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue