Optimize mapping algorithm

Some share class names contain multiple short names, e.g. "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc", where the short names are "I" and "sw". The purpose of this change is to support extracting all of the short names from a share class name.
2024-10-02 15:08:26 -05:00 · 2024-10-02 15:08:26 -05:00 · edb90c718e
parent 3bb13947af
commit edb90c718e
2 changed files with 45 additions and 27 deletions
--- a/main.py
+++ b/main.py
@ -574,9 +574,9 @@ def test_data_extraction_metrics():


 def test_mapping_raw_name():
-    doc_id = "389171486"
-    raw_name = "Nordea 2 Emerging Market Local Debt Enhanced Fund Y - Shares"
-    raw_share_name = "Y - Shares"
+    doc_id = "483617247"
+    raw_name = "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc"
+    raw_share_name = "Class I sw EUR - Acc"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@ -589,7 +589,7 @@ def test_mapping_raw_name():
    mapping_info = data_mapping.matching_with_database(
        raw_name=raw_name,
        raw_share_name=raw_share_name,
-        parent_id="FS00009Q8R", 
+        parent_id=None, 
        matching_type="share",
        process_cache=process_cache
    )
@ -705,12 +705,12 @@ if __name__ == "__main__":
    ]
    # special_doc_id_list = check_mapping_doc_id_list
    special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["483617247"]
+    # special_doc_id_list = ["483617247"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
    re_run_mapping_data = True
-    force_save_total_data = False
+    force_save_total_data = True

    extract_ways = ["text"]
    for extract_way in extract_ways:
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -167,7 +167,7 @@ def get_most_similar_name(text: str,
                                                        'share', 'shares']])
        text_currency = None
        text_feature = None
-        text_share_short_name = None
+        text_share_short_name_list = None
        if matching_type == "share" and text is not None and len(text.strip()) > 0:
            if process_cache is not None and isinstance(process_cache, dict):
                if process_cache.get(text, None) is not None:
@ -177,20 +177,23 @@ def get_most_similar_name(text: str,
                    text_currency = cache.get("share_currency")
                else:
                    if share_name is not None and len(share_name.strip()) > 0:
-                        text_share_short_name = get_share_short_name_from_text(share_name)
+                        text_share_short_name_list = get_share_short_name_from_text(share_name)
                        text_feature = get_share_feature_from_text(share_name)
                        text_currency = get_currency_from_text(share_name)
                    else:
-                        text_share_short_name = get_share_short_name_from_text(text)
+                        text_share_short_name_list = get_share_short_name_from_text(text)
                        text_feature = get_share_feature_from_text(text)
                        text_currency = get_currency_from_text(text)
+                    # sort text_share_short_name_list
+                    text_share_short_name_list.sort()
                    process_cache[text] = {
-                        "share_short_name": text_share_short_name,
+                        "share_short_name": text_share_short_name_list,
                        "share_feature": text_feature,
                        "share_currency": text_currency
                    }
            else:
-                text_share_short_name = get_share_short_name_from_text(share_name)
+                text_share_short_name_list = get_share_short_name_from_text(share_name)
+                text_share_short_name_list.sort()
                text_feature = get_share_feature_from_text(share_name)
                text_currency = get_currency_from_text(share_name)
        
@ -221,20 +224,22 @@ def get_most_similar_name(text: str,
                    if  process_cache is not None and isinstance(process_cache, dict):
                        if process_cache.get(copy_name, None) is not None:
                            cache = process_cache.get(copy_name)
-                            copy_name_short_name = cache.get("share_short_name")
+                            copy_name_short_name_list = cache.get("share_short_name")
                            copy_name_feature = cache.get("share_feature")
                            copy_name_currency = cache.get("share_currency")
                        else:
-                            copy_name_short_name = get_share_short_name_from_text(copy_share_name)
+                            copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
+                            copy_name_short_name_list.sort()
                            copy_name_feature = get_share_feature_from_text(copy_share_name)
                            copy_name_currency = get_currency_from_text(copy_share_name)
                            process_cache[copy_name] = {
-                                "share_short_name": copy_name_short_name,
+                                "share_short_name": copy_name_short_name_list,
                                "share_feature": copy_name_feature,
                                "share_currency": copy_name_currency
                            }
                    else:
-                        copy_name_short_name = get_share_short_name_from_text(copy_share_name)
+                        copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
+                        copy_name_short_name_list.sort()
                        copy_name_feature = get_share_feature_from_text(copy_share_name)
                        copy_name_currency = get_currency_from_text(copy_share_name)
                        
@ -248,9 +253,9 @@ def get_most_similar_name(text: str,
                            if copy_name_feature.lower() not in text.lower().split():
                                continue
                    if matching_type == "share":
-                        if text_share_short_name is not None and len(text_share_short_name) > 0 and \
-                            copy_name_short_name is not None and len(copy_name_short_name) > 0:
-                            if text_share_short_name != copy_name_short_name:
+                        if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
+                            copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
+                            if text_share_short_name_list != copy_name_short_name_list:
                                continue
                max_similarity = similarity
                max_similarity_full_name = full_name
@ -304,15 +309,17 @@ def get_share_short_name_from_text(text: str):
    temp_share_features = [feature.lower() for feature in share_features_full_name]
    
    count = 0
+    share_short_name_list = []
+    
    for split in text_split[::-1]:
        if count == 4:
            break
        if split.lower() not in temp_share_features and \
-            split not in total_currency_list:
+            split.upper() not in total_currency_list:
            if len(split) <= 3 and split.upper() == split:
-                return split.upper()
+                share_short_name_list.append(split.upper())
        count += 1
-    return None
+    return share_short_name_list

 def get_share_feature_from_text(text: str):
    if text is None or len(text.strip()) == 0:
@ -370,14 +377,20 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
    if not with_currency and len(with_currency_list) == 0:
        pass
    elif not with_currency and len(with_currency_list) > 0:
-        share_short_name = ""
+        share_short_name_list = []
        if share_name is not None and len(share_name.strip()) > 0:
-            share_short_name = get_share_short_name_from_text(share_name)
+            share_short_name_list = get_share_short_name_from_text(share_name)
        updated = False
-        if len(share_short_name) < 4 and share_short_name.upper() == share_short_name:
+        if len(share_short_name_list) > 0:
            if len(without_currency_list) > 0:
                for index in without_currency_list:
-                    if share_short_name in compare_list[index].split():
+                    all_in_list = True
+                    compare_split = [split.upper() for split in compare_list[index].split()]
+                    for share_shot_name in share_short_name_list:
+                        if share_shot_name not in compare_split:
+                            all_in_list = False
+                            break
+                    if all_in_list:
                        text = text + ' ' + 'USD'
                        if share_name is not None:
                            share_name = share_name + ' ' + 'USD'
@ -386,8 +399,13 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
            if not updated:
                currency_list = []
                for index in with_currency_list:
-                    compare_split = compare_list[index].split()
-                    if share_short_name in compare_split:
+                    all_in_list = True
+                    compare_split = [split.upper() for split in compare_list[index].split()]
+                    for share_shot_name in share_short_name_list:
+                        if share_shot_name not in compare_split:
+                            all_in_list = False
+                            break
+                    if all_in_list:
                        current_currency_list = [split for split in compare_split 
                                                 if split.upper() in total_currency_list]
                        if len(current_currency_list) > 0: