From 8bd600842577eda0cae7ffa9ea81a76ad2aeee87 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Mon, 7 Oct 2024 10:34:13 -0500
Subject: [PATCH] Refactor biz_utils: share fund/share stop-word list, reuse get_currency_from_text

---
 main.py            |  4 +--
 utils/biz_utils.py | 66 ++++++++++++++++++----------------------------
 2 files changed, 28 insertions(+), 42 deletions(-)

diff --git a/main.py b/main.py
index 557e31e..889973a 100644
--- a/main.py
+++ b/main.py
@@ -446,7 +446,7 @@ def get_metrics(
         ground_truth_sheet_name=ground_truth_sheet_name,
         output_folder=output_folder,
     )
-    missing_error_list, metrics_list, metrics_file = metrics.get_metrics(strict_model=True)
+    missing_error_list, metrics_list, metrics_file = metrics.get_metrics(strict_model=False)
     return missing_error_list, metrics_list, metrics_file
 
 
@@ -725,7 +725,7 @@ if __name__ == "__main__":
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = False
-    re_run_mapping_data = False
+    re_run_mapping_data = True
     force_save_total_data = True
 
     extract_ways = ["text"]
diff --git a/utils/biz_utils.py b/utils/biz_utils.py
index f7cfc5d..5760268 100644
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -51,6 +51,9 @@ total_currency_list = [
 
 share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage']
 share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']
+lower_pre_fix_fund_share = ['fund', "funds", 'portfolio', 
+                            'bond', 'bonds', 'class', 
+                            'classes', 'share', 'shares']
 
 
 def add_slash_to_text_as_regex(text: str):
@@ -145,26 +148,17 @@ def get_most_similar_name(text: str,
             text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
 
         text = ' '.join([split for split in text.split()
-                         if split.lower() not in ['fund', "funds", 'portfolio',
-                                                  'bond', 'bonds',
-                                                  'class', 'classes',
-                                                  'share', 'shares']])
+                         if split.lower() not in lower_pre_fix_fund_share])
         if share_name is not None:
             share_name = ' '.join([split for split in share_name.split()
-                                    if split.lower() not in ['fund', "funds", 'portfolio', 
-                                                            'bond', 'bonds',
-                                                            'class', 'classes', 
-                                                            'share', 'shares']])
+                                    if split.lower() not in lower_pre_fix_fund_share])
         
         copy_share_name_list = get_share_part_list(copy_name_list)
         for i in range(len(copy_name_list)):
             temp_splits = copy_name_list[i].split()
             copy_name_list[i] = ' '.join([split for split in temp_splits
                                                 if remove_special_characters(split).lower() 
-                                                not in ['fund', "funds", 'portfolio', 
-                                                        'bond', 'bonds',
-                                                        'class', 'classes', 
-                                                        'share', 'shares']])
+                                                not in lower_pre_fix_fund_share])
         text_currency = None
         text_feature = None
         text_share_short_name_list = None
@@ -192,10 +186,15 @@ def get_most_similar_name(text: str,
                         "share_currency": text_currency
                     }
             else:
-                text_share_short_name_list = get_share_short_name_from_text(share_name)
-                text_share_short_name_list.sort()
-                text_feature = get_share_feature_from_text(share_name)
-                text_currency = get_currency_from_text(share_name)
+                if share_name is not None and len(share_name.strip()) > 0:
+                    text_share_short_name_list = get_share_short_name_from_text(share_name)
+                    text_share_short_name_list.sort()
+                    text_feature = get_share_feature_from_text(share_name)
+                    text_currency = get_currency_from_text(share_name)
+                else:
+                    text_share_short_name_list = get_share_short_name_from_text(text)
+                    text_feature = get_share_feature_from_text(text)
+                    text_currency = get_currency_from_text(text)
         
         # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
         same_max_similarity_name_list = []
@@ -308,10 +307,7 @@ def get_share_part_list(text_list: list):
             share_part_text = text.strip()
         share_part_text = ' '.join([split for split in share_part_text.split()
                                     if remove_special_characters(split).lower()
-                                    not in ['fund', "funds", 'portfolio',
-                                            'bond', 'bonds',
-                                            'class', 'classes',
-                                            'share', 'shares']])
+                                    not in lower_pre_fix_fund_share])
         share_part_list.append(share_part_text)
     return share_part_list
     
@@ -367,7 +363,6 @@ def get_currency_from_text(text: str):
     if text is None or len(text.strip()) == 0:
         return None
     text = text.strip()
-    text = text.lower()
     text_split = text.split()
     count = 0
     for split in text_split[::-1]:
@@ -380,22 +375,20 @@ def get_currency_from_text(text: str):
 
 
 def update_for_currency(text: str, share_name: str, compare_list: list):
-    text_split = text.split()
+    currency_in_text = get_currency_from_text(text)
     with_currency = False
-    for split in text_split:
-        if split.upper() in total_currency_list:
-            with_currency = True
-            break
+    if currency_in_text is not None:
+        with_currency = True
     
     with_currency_list = []
     without_currency_list = []
     for index, compare in enumerate(compare_list):
-        compare_split = compare.split()
+        # compare_split = compare.split()
         with_currency_compare = False
-        for split in compare_split:
-            if split.upper() in total_currency_list:
-                with_currency_compare = True
-                break
+        currecy_in_compare = get_currency_from_text(compare)
+        if currecy_in_compare is not None:
+            with_currency_compare = True
+            
         if with_currency_compare:
             with_currency_list.append(index)
         else:
@@ -493,15 +486,8 @@ def remove_common_word(text_list: list):
         text = text.lower()
         text = remove_special_characters(text)
         text_splits = text.split()
-        while 'fund' in text_splits:
-            text_splits.remove('fund')
-        while 'portfolio' in text_splits:
-            text_splits.remove('portfolio')
-        while 'share' in text_splits:
-            text_splits.remove('share')
-        while 'class' in text_splits:
-            text_splits.remove('class')
-        text = ' '.join(text_splits)
+        text = ' '.join([split for split in text_splits
+                         if split.lower() not in lower_pre_fix_fund_share])
         new_text_list.append(text)
     # remove common word in new_text_list, such as 'Blackrock Global Fund' and 'Blackrock Growth Fund', then 'Blackrock', 'Fund' are common words
     # the result is ['Global', 'Growth']