Handle cases where a share name contains multiple short names.

2024-10-02 17:25:25 -05:00 · 2024-10-02 17:25:25 -05:00 · f0dd7f9e89
parent edb90c718e
commit f0dd7f9e89
2 changed files with 53 additions and 17 deletions
--- a/main.py
+++ b/main.py
@ -574,9 +574,9 @@ def test_data_extraction_metrics():


 def test_mapping_raw_name():
-    doc_id = "483617247"
-    raw_name = "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc"
-    raw_share_name = "Class I sw EUR - Acc"
+    doc_id = "394778487"
+    raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD"
+    raw_share_name = "Z Gross QD USD"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@ -589,7 +589,7 @@ def test_mapping_raw_name():
    mapping_info = data_mapping.matching_with_database(
        raw_name=raw_name,
        raw_share_name=raw_share_name,
-        parent_id=None, 
+        parent_id="FS0000H1C9", 
        matching_type="share",
        process_cache=process_cache
    )
@ -705,7 +705,7 @@ if __name__ == "__main__":
    ]
    # special_doc_id_list = check_mapping_doc_id_list
    special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["483617247"]
+    # special_doc_id_list = ["380945052", "382366116", "387202452", "394778487", "469138353"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -172,7 +172,7 @@ def get_most_similar_name(text: str,
            if process_cache is not None and isinstance(process_cache, dict):
                if process_cache.get(text, None) is not None:
                    cache = process_cache.get(text)
-                    text_share_short_name = cache.get("share_short_name")
+                    text_share_short_name_list = cache.get("share_short_name")
                    text_feature = cache.get("share_feature")
                    text_currency = cache.get("share_currency")
                else:
@ -212,6 +212,8 @@ def get_most_similar_name(text: str,
                print(e)
                print_exc()
                similarity = 0
+            if similarity == 1:
+                return full_name, similarity
            copy_name_2 = replace_abbrevation(copy_name)
            if copy_name != copy_name_2:
                similarity_2 = get_jacard_similarity(text,
@ -229,7 +231,8 @@ def get_most_similar_name(text: str,
                            copy_name_currency = cache.get("share_currency")
                        else:
                            copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
-                            copy_name_short_name_list.sort()
+                            if copy_name_short_name_list is not None:
+                                copy_name_short_name_list.sort()
                            copy_name_feature = get_share_feature_from_text(copy_share_name)
                            copy_name_currency = get_currency_from_text(copy_share_name)
                            process_cache[copy_name] = {
@ -255,8 +258,13 @@ def get_most_similar_name(text: str,
                    if matching_type == "share":
                        if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
                            copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
-                            if text_share_short_name_list != copy_name_short_name_list:
-                                continue
+                                raw_short_not_in_compare = False
+                                for short in text_share_short_name_list:
+                                    if short not in copy_name_short_name_list:
+                                        raw_short_not_in_compare = True
+                                        break
+                                if raw_short_not_in_compare:
+                                    continue
                max_similarity = similarity
                max_similarity_full_name = full_name
                same_max_similarity_name_list = []
@ -285,19 +293,26 @@ def get_most_similar_name(text: str,
 def get_share_part_list(text_list: list):
    share_part_list = []
    for text in text_list:
-        text_split = text.split("Fund")
+        text_split = text.split("Funds")
        if len(text_split) == 1:
-            text_split = text.split("funds")
+            text_split = text.split("Fund")
        if len(text_split) == 1:
            text_split = text.split("Portfolio")
-        if len(text_split) == 1:
-            text_split = text.split("Bond")
        if len(text_split) == 1:
            text_split = text.split("Bonds")
+        if len(text_split) == 1:
+            text_split = text.split("Bond")
        if len(text_split) > 1:
-            share_part_list.append(text_split[-1].strip())
+            share_part_text = text_split[-1].strip()
        else:
-            share_part_list.append(text)
+            share_part_text = text.strip()
+        share_part_text = ' '.join([split for split in share_part_text.split()
+                                    if remove_special_characters(split).lower()
+                                    not in ['fund', "funds", 'portfolio',
+                                            'bond', 'bonds',
+                                            'class', 'classes',
+                                            'share', 'shares']])
+        share_part_list.append(share_part_text)
    return share_part_list
    

@ -316,9 +331,20 @@ def get_share_short_name_from_text(text: str):
            break
        if split.lower() not in temp_share_features and \
            split.upper() not in total_currency_list:
-            if len(split) <= 3 and split.upper() == split:
+            if len(split) <= 3:
                share_short_name_list.append(split.upper())
        count += 1
+    
+    if len(share_short_name_list) > 1:
+        remove_number = []
+        for short_name in share_short_name_list[::-1]:
+            if short_name.isdigit():
+                remove_number.append(short_name)
+            else:
+                break
+        for remove in remove_number:
+            if remove in share_short_name_list:
+                share_short_name_list.remove(remove)
    return share_short_name_list

 def get_share_feature_from_text(text: str):
@ -481,6 +507,7 @@ def remove_common_word(text_list: list):
    # the result is ['Global', 'Growth']
    common_word_list = []
    new_text_splits_list = [text.split() for text in new_text_list]
+    with_common_word = False
    for i in range(len(new_text_splits_list)):
        for j in range(i+1, len(new_text_splits_list)):
            if common_word_list is None or len(common_word_list) == 0:
@ -489,6 +516,12 @@ def remove_common_word(text_list: list):
            else:
                common_word_list = list(
                    set(common_word_list).intersection(set(new_text_splits_list[j])))
+            if len(common_word_list) > 0:
+                with_common_word = True
+            if with_common_word and len(common_word_list) == 0:
+                break
+        if with_common_word and len(common_word_list) == 0:
+            break
    
    remove_list = []
    # if exists the share name and currency name, remove from the list
@ -631,7 +664,8 @@ def get_beginning_common_words(text_list: list):
 def replace_abbrevation(text: str):
    if text is None or len(text.strip()) == 0:
        return text
-    text = text.strip()
+    text = text.replace('(', ' ').replace(')', ' ').replace('-', ' ')
+    text = re.sub(r'\s+', ' ', text).strip()
    if 'swiss franc' in text.lower():
        text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE)
    elif 'us dollar' in text.lower():
@ -710,6 +744,8 @@ def replace_abbrevation(text: str):
            new_text_splits.append('US')
        elif split.lower() in ['nc', 'nc.']:
            new_text_splits.append('no trail')
+        elif split.lower() in ['non']:
+            new_text_splits.append('Not')
        else:
            new_text_splits.append(split)