From 3adbd7631af32256c5cdbad16be6372c09c1d028 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Tue, 1 Oct 2024 15:31:15 -0500
Subject: [PATCH] optimize mapping algorithm

---
 main.py            |  8 ++---
 utils/biz_utils.py | 84 ++++++++++++++++++++++++++++++++--------------
 2 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/main.py b/main.py
index 5728e12..50b2171 100644
--- a/main.py
+++ b/main.py
@@ -574,8 +574,8 @@ def test_data_extraction_metrics():
 
 
 def test_mapping_raw_name():
-    doc_id = "469138353"
-    raw_name = "Manulife Global Fund ASEAN Equity Fund I USD"
+    doc_id = "333207452"
+    raw_name = "Rathbone SICAV Income Fund L ACC GBP"
     output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
     data_mapping = DataMapping(
         doc_id,
@@ -704,11 +704,11 @@ if __name__ == "__main__":
     ]
     # special_doc_id_list = check_mapping_doc_id_list
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["469138353"]
+    # special_doc_id_list = ["333207452"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = False
-    re_run_mapping_data = False
+    re_run_mapping_data = True
     force_save_total_data = True
 
     extract_ways = ["text"]
diff --git a/utils/biz_utils.py b/utils/biz_utils.py
index fd769e0..3cb6294 100644
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -104,6 +104,10 @@ def get_most_similar_name(text: str,
         text = text.strip()
         text = remove_special_characters(text)
         text = replace_abbrevation(text)
+        if share_name is not None:
+            share_name = remove_special_characters(share_name)
+            share_name = replace_abbrevation(share_name)
+        
         text_splits = text.split()
         if len(text_splits) == 1:
             text = split_words_without_space(text)
@@ -123,14 +127,6 @@ def get_most_similar_name(text: str,
                         temp_splits = copy_name_list[i].split()
                         copy_name_list[i] = ' '.join([split for split in temp_splits 
                                                            if remove_special_characters(split).lower() != word])
-
-            for i in range(len(copy_name_list)):
-                temp_splits = copy_name_list[i].split()
-                copy_name_list[i] = ' '.join([split for split in temp_splits
-                                                   if remove_special_characters(split).lower() 
-                                                   not in ['fund', "funds", 'portfolio', 
-                                                           'class', 'classes', 
-                                                           'share', 'shares']])
             final_splits = []
             for split in new_splits:
                 if split.lower() not in ['fund', "funds", 'portfolio', 
@@ -139,11 +135,22 @@ def get_most_similar_name(text: str,
                     final_splits.append(split)
 
             text = ' '.join(final_splits)
+            
+        copy_share_name_list = get_share_part_list(copy_name_list)
+        
+        for i in range(len(copy_name_list)):
+            temp_splits = copy_name_list[i].split()
+            copy_name_list[i] = ' '.join([split for split in temp_splits
+                                                if remove_special_characters(split).lower() 
+                                                not in ['fund', "funds", 'portfolio', 
+                                                        'class', 'classes', 
+                                                        'share', 'shares']])
         max_similarity = 0
         max_similarity_full_name = None
         text = remove_special_characters(text)
+        
         if matching_type == "share":
-            text, copy_name_list = update_for_currency(text, copy_name_list)
+            text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
         text_currency = None
         text_feature = None
         text_share_short_name = None
@@ -155,9 +162,14 @@ def get_most_similar_name(text: str,
                     text_feature = cache.get("share_feature")
                     text_currency = cache.get("share_currency")
                 else:
-                    text_share_short_name = get_share_short_name_from_text(text)
-                    text_feature = get_share_feature_from_text(text)
-                    text_currency = get_currency_from_text(text)
+                    if share_name is not None and len(share_name.strip()) > 0:
+                        text_share_short_name = get_share_short_name_from_text(share_name)
+                        text_feature = get_share_feature_from_text(share_name)
+                        text_currency = get_currency_from_text(share_name)
+                    else:
+                        text_share_short_name = get_share_short_name_from_text(text)
+                        text_feature = get_share_feature_from_text(text)
+                        text_currency = get_currency_from_text(text)
                     process_cache[text] = {
                         "share_short_name": text_share_short_name,
                         "share_feature": text_feature,
@@ -170,7 +182,7 @@ def get_most_similar_name(text: str,
         
         # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
         same_max_similarity_name_list = []
-        for full_name, copy_name in zip(name_list , copy_name_list):
+        for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list):
             copy_name = remove_special_characters(copy_name)
             copy_name = split_words_without_space(copy_name)
             similarity = get_jacard_similarity(text,
@@ -192,18 +204,18 @@ def get_most_similar_name(text: str,
                             copy_name_feature = cache.get("share_feature")
                             copy_name_currency = cache.get("share_currency")
                         else:
-                            copy_name_short_name = get_share_short_name_from_text(copy_name)
-                            copy_name_feature = get_share_feature_from_text(copy_name)
-                            copy_name_currency = get_currency_from_text(copy_name)
+                            copy_name_short_name = get_share_short_name_from_text(copy_share_name)
+                            copy_name_feature = get_share_feature_from_text(copy_share_name)
+                            copy_name_currency = get_currency_from_text(copy_share_name)
                             process_cache[copy_name] = {
                                 "share_short_name": copy_name_short_name,
                                 "share_feature": copy_name_feature,
                                 "share_currency": copy_name_currency
                             }
                     else:
-                        copy_name_short_name = get_share_short_name_from_text(copy_name)
-                        copy_name_feature = get_share_feature_from_text(copy_name)
-                        copy_name_currency = get_currency_from_text(copy_name)
+                        copy_name_short_name = get_share_short_name_from_text(copy_share_name)
+                        copy_name_feature = get_share_feature_from_text(copy_share_name)
+                        copy_name_currency = get_currency_from_text(copy_share_name)
                         
                     if text_currency is not None and len(text_currency) > 0 and \
                         copy_name_currency is not None and len(copy_name_currency) > 0:
@@ -242,10 +254,26 @@ def get_most_similar_name(text: str,
         print_exc()
         return None, 0.0
 
+
+def get_share_part_list(text_list: list):
+    """Return, for each name, the text after its last fund keyword, else the name."""
+    import re  # function-scope import: the file's import block is not in this patch
+    share_part_list = []
+    for text in text_list:
+        text_split = [text]
+        # Same keyword priority as before; a plural is consumed whole ("Funds X" ->
+        # "X", not "s X") and longer words such as "Fundamental" no longer match.
+        for keyword in (r"Funds?(?![a-z])", r"funds(?![a-z])", r"Portfolios?(?![a-z])"):
+            if len(text_split) == 1:
+                text_split = re.split(keyword, text)
+        share_part_list.append(text_split[-1].strip() if len(text_split) > 1 else text)
+    return share_part_list
+    
+
 def get_share_short_name_from_text(text: str):
     if text is None or len(text.strip()) == 0:
         return None
-    text = text.strip()
+    text = remove_special_characters(text.strip())
     text_split = text.split()
     temp_share_features = [feature.lower() for feature in share_features_full_name]
     
@@ -292,7 +320,7 @@ def get_currency_from_text(text: str):
     return None
 
 
-def update_for_currency(text: str, compare_list: list):
+def update_for_currency(text: str, share_name: str, compare_list: list):
     text_split = text.split()
     with_currency = False
     for split in text_split:
@@ -314,7 +342,7 @@ def update_for_currency(text: str, compare_list: list):
         else:
             without_currency_list.append(index)
     if not with_currency and len(with_currency_list) == 0:
-        return text, compare_list
+        return text, share_name, compare_list
     elif not with_currency and len(with_currency_list) > 0:
         last_split = text_split[-1]
         updated = False
@@ -323,6 +351,8 @@ def update_for_currency(text: str, compare_list: list):
                 for index in without_currency_list:
                     if last_split in compare_list[index].split():
                         text = text + ' ' + 'USD'
+                        if share_name is not None:
+                            share_name = share_name + ' ' + 'USD'
                         updated = True
                         break
             if not updated:
@@ -336,6 +366,8 @@ def update_for_currency(text: str, compare_list: list):
                             currency_list.append(current_currency_list[-1])
                 if len(currency_list) == 1:
                     text = text + ' ' + currency_list[0]
+                    if share_name is not None:
+                        share_name = share_name + ' ' + currency_list[0]
                     updated = True        
                         
         for index in without_currency_list:
@@ -343,13 +375,15 @@ def update_for_currency(text: str, compare_list: list):
             
         if not updated:
             text = text + ' ' + 'USD'
-        return text, compare_list
+            if share_name is not None:
+                share_name = share_name + ' ' + 'USD'
+        return text, share_name, compare_list
     elif with_currency and len(without_currency_list) == 0:
         for index in without_currency_list:
             compare_list[index] = compare_list[index] + ' ' + 'USD'
-        return text, compare_list
+        return text, share_name, compare_list
     else:
-        return text, compare_list
+        return text, share_name, compare_list
 
 
 def remove_common_word(text_list: list):