investment mapping: optimize for currency logic

2024-09-25 17:28:22 -05:00 · 2024-09-25 17:28:22 -05:00 · 598e2ab820
parent dd6701f18c
commit 598e2ab820
3 changed files with 80 additions and 1 deletions
--- a/core/data_mapping.py
+++ b/core/data_mapping.py
@ -312,7 +312,10 @@ class DataMapping:
                max_similarity_name, max_similarity = get_most_similar_name(
                    raw_name, provider_compare_name_list, pre_common_word_list=pre_common_word_list
                )
-                if max_similarity is not None and max_similarity >= 0.5:
+                threshold = 0.7
+                if matching_type == "share":
+                    threshold = 0.5
+                if max_similarity is not None and max_similarity >= threshold:
                    data_info["id"] = provider_compare_mapping[
                        provider_compare_mapping[compare_name_dp] == max_similarity_name
                    ][compare_id_dp].values[0]
--- a/main.py
+++ b/main.py
@ -642,8 +642,10 @@ if __name__ == "__main__":
        "508854243",
        "520879048",
        "463081566",
+        "389171486"
    ]
    special_doc_id_list = check_mapping_doc_id_list
+    # special_doc_id_list = ["445256897"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -1,5 +1,6 @@
 import re
 from copy import deepcopy
+from traceback import print_exc

 def add_slash_to_text_as_regex(text: str):
    if text is None or len(text) == 0:
@ -79,6 +80,8 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
            text = ' '.join(final_splits)
        max_similarity = 0
        max_similarity_fund_name = None
+        text = remove_special_characters(text)
+        text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
        for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
            copy_fund_name = remove_special_characters(copy_fund_name)
            copy_fund_name = split_words_without_space(copy_fund_name)
@ -95,9 +98,76 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
        return max_similarity_fund_name, max_similarity
    except Exception as e:
        print(e)
+        print_exc()
        return None, 0.0


+# Currency codes recognised as standalone tokens in fund/share names.
+_CURRENCY_CODES = frozenset([
+    'USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY',
+    'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD',
+    'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN',
+    'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR',
+    'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW',
+    'ETB', 'DZD', 'XEU', 'XFO',
+])
+# Currency appended when a name carries no currency token of its own.
+_DEFAULT_CURRENCY = 'USD'
+
+
+def _currency_tokens(tokens: list):
+    """Return the tokens of *tokens* that are currency codes (case-insensitive)."""
+    return [token for token in tokens if token.upper() in _CURRENCY_CODES]
+
+
+def update_for_currency(text: str, compare_list: list):
+    """Make currency tokens comparable between *text* and its candidates.
+
+    A name "has a currency" when one of its whitespace-separated tokens,
+    upper-cased, is a known currency code.
+
+    * Neither *text* nor any candidate has a currency: nothing changes.
+    * *text* has a currency: every candidate without one gets ``' USD'``
+      appended, candidates with a currency are left alone.
+    * *text* has no currency but some candidate does: every candidate
+      without a currency gets ``' USD'`` appended, and *text* gets a
+      currency appended as well. That currency is ``USD`` unless the last
+      token of *text* is a short (< 4 chars) upper-case token that appears
+      in no currency-less candidate and in exactly one candidate with a
+      currency; in that case that candidate's last currency token is used.
+    * *text* has no tokens at all: nothing changes.
+
+    Args:
+        text: The name being matched.
+        compare_list: Candidate names. Modified in place.
+
+    Returns:
+        ``(text, compare_list)`` - the possibly extended text and the same
+        list object that was passed in.
+    """
+    text_split = text.split()
+    with_currency = bool(_currency_tokens(text_split))
+
+    # Partition candidate indexes by whether the candidate names a currency.
+    with_currency_list = []
+    without_currency_list = []
+    for index, compare in enumerate(compare_list):
+        if _currency_tokens(compare.split()):
+            with_currency_list.append(index)
+        else:
+            without_currency_list.append(index)
+
+    if with_currency:
+        # Candidates lacking a currency are assumed to be in the default one.
+        # (The original guard tested ``== 0`` here, so this loop never ran.)
+        for index in without_currency_list:
+            compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY
+        return text, compare_list
+
+    # No currency anywhere, or nothing to infer from: leave everything as is.
+    # The empty-text guard avoids an IndexError on text_split[-1] below.
+    if not with_currency_list or not text_split:
+        return text, compare_list
+
+    last_split = text_split[-1]
+    inferred_currency = None
+    if len(last_split) < 4 and last_split.upper() == last_split:
+        # If a currency-less candidate already carries this trailing token,
+        # stay with the default currency.
+        in_plain_candidate = any(
+            last_split in compare_list[index].split()
+            for index in without_currency_list
+        )
+        if not in_plain_candidate:
+            currency_list = []
+            for index in with_currency_list:
+                compare_split = compare_list[index].split()
+                if last_split in compare_split:
+                    current_currency_list = _currency_tokens(compare_split)
+                    if current_currency_list:
+                        currency_list.append(current_currency_list[-1])
+            # Only trust the inference when it is unambiguous.
+            if len(currency_list) == 1:
+                inferred_currency = currency_list[0]
+
+    for index in without_currency_list:
+        compare_list[index] = compare_list[index] + ' ' + _DEFAULT_CURRENCY
+
+    text = text + ' ' + (inferred_currency or _DEFAULT_CURRENCY)
+    return text, compare_list
+
+
 def remove_common_word(text_list: list):
    if text_list is None or len(text_list) == 0:
        return text_list
@ -268,6 +338,10 @@ def replace_abbrevation(text: str):
        text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
    elif '€' in text.lower().split():
        text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE)
+    elif '$' in text.lower().split():
+        text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
+    elif '£' in text.lower().split():
+        text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
    elif 'RMB' in text.lower().split():
        text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
    else: