From d25bae936c522a79594b756abc9470f167bb2282 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Thu, 26 Sep 2024 12:18:37 -0500
Subject: [PATCH] Optimize investment mapping algorithm.

---
 main.py            |  43 +++--
 prepare_data.py    |  20 +--
 utils/biz_utils.py | 379 +++++++++++++++++++++++++++++----------------
 3 files changed, 281 insertions(+), 161 deletions(-)

diff --git a/main.py b/main.py
index 0323f90..74b5098 100644
--- a/main.py
+++ b/main.py
@@ -564,8 +564,8 @@ def test_data_extraction_metrics():
 
 
 def test_mapping_raw_name():
-    doc_id = "292989214"
-    raw_name = "ENBD Saudi Arabia Equity Fund Class A USD Accumulation"
+    doc_id = "391456740"
+    raw_name = "Robeco Multi Asset Sustainable D EUR"
     output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
     data_mapping = DataMapping(
         doc_id,
@@ -575,7 +575,7 @@ def test_mapping_raw_name():
         output_data_folder=output_folder,
     )
     mapping_info = data_mapping.matching_with_database(
-        raw_name=raw_name, parent_id="FS0000B4A7", matching_type="share"
+        raw_name=raw_name, parent_id=None, matching_type="share"
     )
     print(mapping_info)
 
@@ -622,30 +622,41 @@ if __name__ == "__main__":
 
     # special_doc_id_list = ["505174428", "510326848", "349679479"]
     check_mapping_doc_id_list = [
-        "458359181",
-        "486383912",
-        "529925114",
+        "327956364",
         "391456740",
         "391736837",
+        "458359181",
+        "486383912",
         "497497599",
-        "327956364",
-        "479793787",
-        "334718372",
+        "529925114",
         "321733631",
-        "507967525",
-        "478585901",
-        "366179419",
-        "509845549",
-        "323390570",
+        "334718372",
         "344636875",
+        "362246081",
         "445256897",
+        "449623976",
+        "458291624",
+        "478585901",
+        "492121213",
+        "502821436",
+        "507967525",
+        "481475385",
         "508854243",
         "520879048",
+        "402181770",
         "463081566",
-        "389171486"
+        "502693599",
+        "509845549",
+        "389171486",
+        "323390570",
+        "366179419",
+        "486378555",
+        "506559375",
+        "479793787",
+        "333207452"
     ]
     special_doc_id_list = check_mapping_doc_id_list
-    # special_doc_id_list = ["445256897"]
+    # special_doc_id_list = ["333207452"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = False
diff --git a/prepare_data.py b/prepare_data.py
index 72d69b3..9453f2e 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -848,7 +848,7 @@ def compare_records_count_by_document_id():
     # get the count of records by DocumentId
     document_records_count = data_from_document_df.groupby("DocumentId").size().reset_index(name="records_count")
     
-    data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
+    data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document_from_DocumentAcquisition.xlsx"
     sheet_name = "random_small_document_all_data"
     data_from_database_df = pd.read_excel(data_from_database, sheet_name=sheet_name)
     database_records_count = data_from_database_df.groupby("DocumentId").size().reset_index(name="records_count")
@@ -870,7 +870,7 @@ def compare_records_count_by_document_id():
     records_count_compare.reset_index(drop=True, inplace=True)
     
     records_count_compare_file = (
-        r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database.xlsx"
+        r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database_from_DocumentAcquisition.xlsx"
     )
     with pd.ExcelWriter(records_count_compare_file) as writer:
         records_count_compare.to_excel(
@@ -886,7 +886,7 @@ def get_document_extracted_share_diff_by_db():
     db_data = pd.read_excel(db_data_file, sheet_name="Sheet1")
     extract_data = pd.read_excel(extract_data_file, sheet_name="mapping_data")
     # only get data which investment_type is 1
-    extract_data = extract_data[extract_data["investment_type"] == 1]
+    # extract_data = extract_data[extract_data["investment_type"] == 1]
     extract_data.reset_index(drop=True, inplace=True)
     unique_doc_id = extract_data["doc_id"].unique().tolist()
     
@@ -1012,7 +1012,7 @@ if __name__ == "__main__":
     #                     sheet_name="latest_doc_ar_data",
     #                     output_folder=output_data_folder,
     #                     output_file="latest_doc_ar_mapping_statistics.xlsx")
-    # get_document_extracted_share_diff_by_db()
+    get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,
     #     output_folder=basic_info_folder,
@@ -1021,11 +1021,11 @@ if __name__ == "__main__":
     # pickup_document_from_top_100_providers()
     # compare_records_count_by_document_id()
     
-    document_mapping_folder = r"/data/emea_ar/output/mapping/document/"
-    all_data_file = r"/data/emea_ar/output/mapping/all_document_mapping.xlsx"
-    concat_mapping(document_mapping_folder, all_data_file)
+    # document_mapping_folder = r"/data/emea_ar/output/mapping/document/"
+    # all_data_file = r"/data/emea_ar/output/mapping/all_document_mapping.xlsx"
+    # concat_mapping(document_mapping_folder, all_data_file)
     
-    provider_mapping_folder = r"/data/emea_ar/output/mapping/provider/"
-    all_data_file = r"/data/emea_ar/output/mapping/all_provider_mapping.xlsx"
-    concat_mapping(provider_mapping_folder, all_data_file)
+    # provider_mapping_folder = r"/data/emea_ar/output/mapping/provider/"
+    # all_data_file = r"/data/emea_ar/output/mapping/all_provider_mapping.xlsx"
+    # concat_mapping(provider_mapping_folder, all_data_file)
     
diff --git a/utils/biz_utils.py b/utils/biz_utils.py
index 1ca1697..31bf322 100644
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -2,6 +2,55 @@ import re
 from copy import deepcopy
 from traceback import print_exc
 
+
+total_currency_list = [
+    "USD",
+    "EUR",
+    "AUD",
+    "JPY",
+    "CHF",
+    "GBP",
+    "SEK",
+    "CNY",
+    "NZD",
+    "CNH",
+    "NOK",
+    "SGD",
+    "HKD",
+    "ZAR",
+    "PLN",
+    "CAD",
+    "CZK",
+    "HUF",
+    "DKK",
+    "BRL",
+    "SKK",
+    "RON",
+    "TRY",
+    "BGN",
+    "CUP",
+    "MXN",
+    "TOP",
+    "ILS",
+    "CLF",
+    "XCD",
+    "ISK",
+    "IDR",
+    "MNT",
+    "AED",
+    "AFN",
+    "INR",
+    "ESP",
+    "RUB",
+    "CLP",
+    "KRW",
+    "ETB",
+    "DZD",
+    "XEU",
+    "XFO",
+]
+
+
 def add_slash_to_text_as_regex(text: str):
     if text is None or len(text) == 0:
         return text
@@ -19,35 +68,49 @@ def add_slash_to_text_as_regex(text: str):
 def clean_text(text: str) -> str:
     # text = text.lower()
     # update the specical character which begin with \u, e.g \u2004 or \u00a0 to be space
-    text = re.sub(r"\\u[A-Z0-9a-z]{4}", ' ', text)
-    text = re.sub(r"( ){2,}", ' ', text.strip())
+    text = re.sub(r"\\u[A-Z0-9a-z]{4}", " ", text)
+    text = re.sub(r"( ){2,}", " ", text.strip())
     return text
 
 
-def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list = None) -> str:
+def get_most_similar_name(
+    text: str, name_list: list, pre_common_word_list: list = None
+) -> str:
     """
     Get the most similar fund name from fund_name_list by jacard similarity
     """
     try:
         copy_fund_name_list = deepcopy(name_list)
-        if text is None or len(text.split()) == 0 or \
-                copy_fund_name_list is None or len(copy_fund_name_list) == 0:
+        if (
+            text is None
+            or len(text.split()) == 0
+            or copy_fund_name_list is None
+            or len(copy_fund_name_list) == 0
+        ):
             return None, None
         
-        copy_fund_name_list = [replace_abbrevation(copy_fund_name) for copy_fund_name 
-                               in copy_fund_name_list]
+        copy_fund_name_list = [
+            replace_abbrevation(copy_fund_name)
+            for copy_fund_name in copy_fund_name_list
+        ]
+        
+        copy_fund_name_list = [
+            replace_abbrevation(remove_special_characters(copy_fund_name))
+            for copy_fund_name in copy_fund_name_list
+        ]
 
         # get common words in fund_name_list
         common_word_list = []
         if len(name_list) > 1:
             _, common_word_list = remove_common_word(copy_fund_name_list)
         if pre_common_word_list is not None and len(pre_common_word_list) > 0:
-            common_word_list.extend([word for word in pre_common_word_list
-                                     if word not in common_word_list])
+            common_word_list.extend(
+                [word for word in pre_common_word_list if word not in common_word_list]
+            )
 
         text = text.strip()
-        text = remove_special_characters(text)
         text = replace_abbrevation(text)
+        text = replace_abbrevation(remove_special_characters(text))
         text_splits = text.split()
         if len(text_splits) == 1:
             text = split_words_without_space(text)
@@ -65,29 +128,46 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
                     # remove word in fund_name_list
                     for i in range(len(copy_fund_name_list)):
                         temp_splits = copy_fund_name_list[i].split()
-                        copy_fund_name_list[i] = ' '.join([split for split in temp_splits 
-                                                           if remove_special_characters(split).lower() != word])
+                        copy_fund_name_list[i] = " ".join(
+                            [
+                                split
+                                for split in temp_splits
+                                if remove_special_characters(split).lower() != word
+                            ]
+                        )
 
             for i in range(len(copy_fund_name_list)):
                 temp_splits = copy_fund_name_list[i].split()
-                copy_fund_name_list[i] = ' '.join([split for split in temp_splits
-                                                   if remove_special_characters(split).lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']])
+                copy_fund_name_list[i] = " ".join(
+                    [
+                        split
+                        for split in temp_splits
+                        if remove_special_characters(split).lower()
+                        not in ["fund", "portfolio", "class", "share", "shares"]
+                    ]
+                )
             final_splits = []
             for split in new_splits:
-                if split.lower() not in ['fund', 'portfolio', 'class', 'share', 'shares']:
+                if split.lower() not in [
+                    "fund",
+                    "portfolio",
+                    "class",
+                    "share",
+                    "shares",
+                ]:
                     final_splits.append(split)
 
-            text = ' '.join(final_splits)
+            text = " ".join(final_splits)
         max_similarity = 0
         max_similarity_fund_name = None
         text = remove_special_characters(text)
         text, copy_fund_name_list = update_for_currency(text, copy_fund_name_list)
-        for fund_name, copy_fund_name in zip(name_list , copy_fund_name_list):
+        for fund_name, copy_fund_name in zip(name_list, copy_fund_name_list):
             copy_fund_name = remove_special_characters(copy_fund_name)
             copy_fund_name = split_words_without_space(copy_fund_name)
-            similarity = get_jacard_similarity(text,
-                                            copy_fund_name,
-                                            need_remove_numeric_characters=False)
+            similarity = get_jacard_similarity(
+                text, copy_fund_name, need_remove_numeric_characters=False
+            )
             if similarity > max_similarity:
                 max_similarity = similarity
                 max_similarity_fund_name = fund_name
@@ -105,17 +185,11 @@ def get_most_similar_name(text: str, name_list: list, pre_common_word_list: list
 def update_for_currency(text: str, compare_list: list):
     text_split = text.split()
     with_currency = False
-    total_currency_list = ['USD', 'EUR', 'AUD', 'JPY', 'CHF', 'GBP', 'SEK', 'CNY', 
-                           'NZD', 'CNH', 'NOK', 'SGD', 'HKD', 'ZAR', 'PLN', 'CAD', 
-                           'CZK', 'HUF', 'DKK', 'BRL', 'SKK', 'RON', 'TRY', 'BGN', 
-                           'CUP', 'MXN', 'TOP', 'ILS', 'CLF', 'XCD', 'ISK', 'IDR', 
-                           'MNT', 'AED', 'AFN', 'INR', 'ESP', 'RUB', 'CLP', 'KRW', 
-                           'ETB', 'DZD', 'XEU', 'XFO']
     for split in text_split:
         if split.upper() in total_currency_list:
             with_currency = True
             break
-    
+
     with_currency_list = []
     without_currency_list = []
     for index, compare in enumerate(compare_list):
@@ -138,7 +212,7 @@ def update_for_currency(text: str, compare_list: list):
             if len(without_currency_list) > 0:
                 for index in without_currency_list:
                     if last_split in compare_list[index].split():
-                        text = text + ' ' + 'USD'
+                        text = text + " " + "USD"
                         updated = True
                         break
             if not updated:
@@ -146,23 +220,26 @@ def update_for_currency(text: str, compare_list: list):
                 for index in with_currency_list:
                     compare_split = compare_list[index].split()
                     if last_split in compare_split:
-                        current_currency_list = [split for split in compare_split 
-                                                 if split.upper() in total_currency_list]
+                        current_currency_list = [
+                            split
+                            for split in compare_split
+                            if split.upper() in total_currency_list
+                        ]
                         if len(current_currency_list) > 0:
                             currency_list.append(current_currency_list[-1])
                 if len(currency_list) == 1:
-                    text = text + ' ' + currency_list[0]
-                    updated = True        
-                        
+                    text = text + " " + currency_list[0]
+                    updated = True
+
         for index in without_currency_list:
-            compare_list[index] = compare_list[index] + ' ' + 'USD'
-            
+            compare_list[index] = compare_list[index] + " " + "USD"
+
         if not updated:
-            text = text + ' ' + 'USD'
+            text = text + " " + "USD"
         return text, compare_list
     elif with_currency and len(without_currency_list) == 0:
         for index in without_currency_list:
-            compare_list[index] = compare_list[index] + ' ' + 'USD'
+            compare_list[index] = compare_list[index] + " " + "USD"
         return text, compare_list
     else:
         return text, compare_list
@@ -176,35 +253,60 @@ def remove_common_word(text_list: list):
         text = text.lower()
         text = remove_special_characters(text)
         text_splits = text.split()
-        while 'fund' in text_splits:
-            text_splits.remove('fund')
-        while 'portfolio' in text_splits:
-            text_splits.remove('portfolio')
-        while 'share' in text_splits:
-            text_splits.remove('share')
-        while 'class' in text_splits:
-            text_splits.remove('class')
-        text = ' '.join(text_splits)
+        while "fund" in text_splits:
+            text_splits.remove("fund")
+        while "portfolio" in text_splits:
+            text_splits.remove("portfolio")
+        while "share" in text_splits:
+            text_splits.remove("share")
+        while "class" in text_splits:
+            text_splits.remove("class")
+        text = " ".join(text_splits)
         new_text_list.append(text)
     # remove common word in new_text_list, such as 'Blackrock Global Fund' and 'Blackrock Growth Fund', then 'Blackrock', 'Fund' are common words
     # the result is ['Global', 'Growth']
     common_word_list = []
     new_text_splits_list = [text.split() for text in new_text_list]
     for i in range(len(new_text_splits_list)):
-        for j in range(i+1, len(new_text_splits_list)):
+        for j in range(i + 1, len(new_text_splits_list)):
             if common_word_list is None or len(common_word_list) == 0:
                 common_word_list = list(
-                    set(new_text_splits_list[i]).intersection(set(new_text_splits_list[j])))
+                    set(new_text_splits_list[i]).intersection(
+                        set(new_text_splits_list[j])
+                    )
+                )
             else:
                 common_word_list = list(
-                    set(common_word_list).intersection(set(new_text_splits_list[j])))
+                    set(common_word_list).intersection(set(new_text_splits_list[j]))
+                )
     common_word_list = list(set(common_word_list))
+    
+    remove_list = []
+    # if exists the share name and currency name, remove from the list
+    for word in common_word_list:
+        if word.upper() in total_currency_list:
+            remove_list.append(word)
+    for text in new_text_list:
+        text_splits = text.split()
+        if len(text_splits) < 4:
+            continue
+        # get last 3 words from text_splits
+        last_three_words = text_splits[-3:]
+        for word in common_word_list:
+            if word not in remove_list and \
+                word.upper() == word and \
+                word in last_three_words:
+                remove_list.append(word)
+    for remove in remove_list:
+        if remove in common_word_list:
+            common_word_list.remove(remove)
+    
     for i in range(len(new_text_splits_list)):
         for common_word in common_word_list:
             if common_word in new_text_splits_list[i]:
                 new_text_splits_list[i].remove(common_word)
-    new_text_list = [' '.join(text_splits)
-                     for text_splits in new_text_splits_list]
+    new_text_list = [" ".join(text_splits) for text_splits in new_text_splits_list]
+    
     return new_text_list, common_word_list
 
 
@@ -219,21 +321,22 @@ def split_words_without_space(text: str):
     # if len(splits) > 1:
     #     return text
     # find all words with capital letter + lower letter
-    regex = r'[A-Z][a-z]+'
+    regex = r"[A-Z][a-z]+"
     word_list = re.findall(regex, text)
     if len(word_list) > 0:
         for word in word_list:
-            text = text.replace(word, ' ' + word + ' ')
-        text = re.sub(r'(\s)+', ' ', text)
+            text = text.replace(word, " " + word + " ")
+        text = re.sub(r"(\s)+", " ", text)
     return text.strip()
 
 
 def remove_special_characters(text):
-    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
-    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
+    text = re.sub(r"\s+", " ", text)
     text = text.strip()
     return text
 
+
 def get_unique_words_text(text):
     text = remove_special_characters(text)
     text = text.lower()
@@ -241,22 +344,24 @@ def get_unique_words_text(text):
     text_split = list(set(text_split))
     # sort the list
     text_split.sort()
-    return_text = ' '.join(text_split)
+    return_text = " ".join(text_split)
     return return_text
 
 
 def remove_numeric_characters(text):
     # remove numeric characters
-    text = re.sub(r'\d+', ' ', text)
-    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r"\d+", " ", text)
+    text = re.sub(r"\s+", " ", text)
     text = text.strip()
     return text
 
 
-def get_jacard_similarity(text_left,
-                          text_right,
-                          need_remove_special_characters=True,
-                          need_remove_numeric_characters=True):
+def get_jacard_similarity(
+    text_left,
+    text_right,
+    need_remove_special_characters=True,
+    need_remove_numeric_characters=True,
+):
     if need_remove_special_characters:
         text_left = remove_special_characters(text_left)
         text_right = remove_special_characters(text_right)
@@ -274,6 +379,7 @@ def get_jacard_similarity(text_left,
     else:
         return 0
 
+
 def get_beginning_common_words(text_list: list):
     """
     Get the beginning common words in text_list
@@ -297,87 +403,90 @@ def get_beginning_common_words(text_list: list):
             common_words_list.append(word)
         else:
             break
-    
-    return ' '.join(common_words_list).strip()
+
+    return " ".join(common_words_list).strip()
+
 
 def replace_abbrevation(text: str):
     if text is None or len(text.strip()) == 0:
         return text
     text = text.strip()
-    if 'swiss franc' in text.lower():
-        text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE)
-    elif 'us dollar' in text.lower():
-        text = re.sub(r'us\s+dollar', 'USD', text, flags=re.IGNORECASE)
-    elif 'singapore dollar' in text.lower():
-        text = re.sub(r'singapore\s+dollar', 'SGD', text, flags=re.IGNORECASE)
-    elif 'hong kong dollar' in text.lower():
-        text = re.sub(r'hong\s+kong\s+dollar', 'HKD', text, flags=re.IGNORECASE)
-    elif 'hongkong dollar' in text.lower():
-        text = re.sub(r'hongkong\s+dollar', 'HKD', text, flags=re.IGNORECASE)
-    elif 'australian dollar' in text.lower():
-        text = re.sub(r'australian\s+dollar', 'AUD', text, flags=re.IGNORECASE)
-    elif 'japanese yen' in text.lower():
-        text = re.sub(r'japanese\s+yen', 'JPY', text, flags=re.IGNORECASE)
-    elif 'south african rand' in text.lower():
-        text = re.sub(r'South\s+African\s+rand', 'ZAR', text, flags=re.IGNORECASE)
-    elif 'canadian dollar' in text.lower():
-        text = re.sub(r'canadian\s+dollar', 'CAD', text, flags=re.IGNORECASE)
-    elif 'new zealand dollar' in text.lower():
-        text = re.sub(r'new\s+zealand\s+dollar', 'NZD', text, flags=re.IGNORECASE)
-    elif 'norwegian krone' in text.lower():
-        text = re.sub(r'norwegian\s+krone', 'NOK', text, flags=re.IGNORECASE)
-    elif 'danish krone' in text.lower():
-        text = re.sub(r'danish\s+krone', 'DKK', text, flags=re.IGNORECASE)
-    elif 'swedish krona' in text.lower():
-        text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
-    elif 'swedish kronor' in text.lower():
-        text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
-    elif 'sterling' in text.lower().split():
-        text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
-    elif 'euro' in text.lower().split():
-        text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
-    elif '€' in text.lower().split():
-        text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE)
-    elif '$' in text.lower().split():
-        text = re.sub(r'\$', 'USD', text, flags=re.IGNORECASE)
-    elif '£' in text.lower().split():
-        text = re.sub(r'\£', 'GBP', text, flags=re.IGNORECASE)
-    elif 'RMB' in text.lower().split():
-        text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
+    if "swiss franc" in text.lower():
+        text = re.sub(r"swiss\s+franc", "CHF", text, flags=re.IGNORECASE)
+    elif "us dollar" in text.lower():
+        text = re.sub(r"us\s+dollar", "USD", text, flags=re.IGNORECASE)
+    elif "singapore dollar" in text.lower():
+        text = re.sub(r"singapore\s+dollar", "SGD", text, flags=re.IGNORECASE)
+    elif "hong kong dollar" in text.lower():
+        text = re.sub(r"hong\s+kong\s+dollar", "HKD", text, flags=re.IGNORECASE)
+    elif "hongkong dollar" in text.lower():
+        text = re.sub(r"hongkong\s+dollar", "HKD", text, flags=re.IGNORECASE)
+    elif "australian dollar" in text.lower():
+        text = re.sub(r"australian\s+dollar", "AUD", text, flags=re.IGNORECASE)
+    elif "japanese yen" in text.lower():
+        text = re.sub(r"japanese\s+yen", "JPY", text, flags=re.IGNORECASE)
+    elif "south african rand" in text.lower():
+        text = re.sub(r"South\s+African\s+rand", "ZAR", text, flags=re.IGNORECASE)
+    elif "canadian dollar" in text.lower():
+        text = re.sub(r"canadian\s+dollar", "CAD", text, flags=re.IGNORECASE)
+    elif "new zealand dollar" in text.lower():
+        text = re.sub(r"new\s+zealand\s+dollar", "NZD", text, flags=re.IGNORECASE)
+    elif "norwegian krone" in text.lower():
+        text = re.sub(r"norwegian\s+krone", "NOK", text, flags=re.IGNORECASE)
+    elif "danish krone" in text.lower():
+        text = re.sub(r"danish\s+krone", "DKK", text, flags=re.IGNORECASE)
+    elif "swedish krona" in text.lower():
+        text = re.sub(r"swedish\s+krona", "SEK", text, flags=re.IGNORECASE)
+    elif "swedish kronor" in text.lower():
+        text = re.sub(r"swedish\s+kronor", "SEK", text, flags=re.IGNORECASE)
+    elif "GPB" in text.split():
+        text = re.sub(r"GPB", "GBP", text, flags=re.IGNORECASE)
+    elif "sterling" in text.lower().split():
+        text = re.sub(r"sterling", "GBP", text, flags=re.IGNORECASE)
+    elif "euro" in text.lower().split():
+        text = re.sub(r"euro", "EUR", text, flags=re.IGNORECASE)
+    elif "€" in text.lower().split():
+        text = re.sub(r"\€", "EUR", text, flags=re.IGNORECASE)
+    elif "$" in text.lower().split():
+        text = re.sub(r"\$", "USD", text, flags=re.IGNORECASE)
+    elif "£" in text.lower().split():
+        text = re.sub(r"\£", "GBP", text, flags=re.IGNORECASE)
+    elif "RMB" in text.split():
+        text = re.sub(r"RMB", "CNY", text, flags=re.IGNORECASE)
     else:
         pass
-    
+
     text_splits = text.split()
     new_text_splits = []
     for split in text_splits:
-        if split.lower() in ['acc', 'acc.']:
-            new_text_splits.append('Accumulation')
-        elif split.lower() in ['inc', 'inc.']:
-            new_text_splits.append('Income')
-        elif split.lower() in ['dist', 'dist.']:
-            new_text_splits.append('Distribution')
-        elif split.lower() in ['inv', 'inv.']:
-            new_text_splits.append('Investor')
-        elif split.lower() in ['inst', 'inst.', 'institution']:
-            new_text_splits.append('Institutional')
-        elif split.lower() in ['cap', 'cap.']:
-            new_text_splits.append('Capitalisation')
-        elif split.lower() in ['adm', 'adm.']:
-            new_text_splits.append('Admin')
-        elif split.lower() in ['adv', 'adv.']:
-            new_text_splits.append('Advantage')
-        elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
-            new_text_splits.append('Hedged')
-        elif split.lower() in ['cl', 'cl.']:
-            new_text_splits.append('Class')
-        elif split.lower() in ['ser', 'ser.']:
-            new_text_splits.append('Series')
-        elif split.lower() in ['u.s.']:
-            new_text_splits.append('US')
-        elif split.lower() in ['nc', 'nc.']:
-            new_text_splits.append('no trail')
+        if split.lower() in ["acc", "acc."]:
+            new_text_splits.append("Accumulation")
+        elif split.lower() in ["inc", "inc."]:
+            new_text_splits.append("Income")
+        elif split.lower() in ["dist", "dist."]:
+            new_text_splits.append("Distribution")
+        elif split.lower() in ["inv", "inv."]:
+            new_text_splits.append("Investor")
+        elif split.lower() in ["inst", "inst.", "institution"]:
+            new_text_splits.append("Institutional")
+        elif split.lower() in ["cap", "cap."]:
+            new_text_splits.append("Capitalisation")
+        elif split.lower() in ["adm", "adm."]:
+            new_text_splits.append("Admin")
+        elif split.lower() in ["adv", "adv."]:
+            new_text_splits.append("Advantage")
+        elif split.lower() in ["hdg", "hgd", "hdg.", "hgd.", "(h)"]:
+            new_text_splits.append("Hedged")
+        elif split.lower() in ["cl", "cl."]:
+            new_text_splits.append("Class")
+        elif split.lower() in ["ser", "ser."]:
+            new_text_splits.append("Series")
+        elif split.lower() in ["u.s."]:
+            new_text_splits.append("US")
+        elif split.lower() in ["nc", "nc."]:
+            new_text_splits.append("no trail")
         else:
             new_text_splits.append(split)
-    
-    new_text = ' '.join(new_text_splits)
-    return new_text
\ No newline at end of file
+
+    new_text = " ".join(new_text_splits)
+    return new_text