optimize mapping algorithm

2024-10-01 16:46:59 -05:00 · 2024-10-01 16:46:59 -05:00 · 035f028155
parent 3adbd7631a
commit 035f028155
2 changed files with 12 additions and 3 deletions
--- a/main.py
+++ b/main.py
@@ -704,7 +704,7 @@ if __name__ == "__main__":
    ]
    # special_doc_id_list = check_mapping_doc_id_list
    special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["333207452"]
+    # special_doc_id_list = ["479042269"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -49,7 +49,7 @@ total_currency_list = [
    "XFO",
 ]

-share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Dividend', 'Investor', 'Institutional', 'Admin', 'Advantage']
+share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage']
 share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']


@@ -101,6 +101,10 @@ def get_most_similar_name(text: str,
            common_word_list.extend([word for word in pre_common_word_list
                                     if word not in common_word_list])
        
+        if len(common_word_list) > 0:
+            common_word_list = [word for word in common_word_list
+                                if len(word) > 1 and word.upper() not in total_currency_list]
+
        text = text.strip()
        text = remove_special_characters(text)
        text = replace_abbrevation(text)
@@ -224,7 +228,8 @@ def get_most_similar_name(text: str,
                    if text_feature is not None and len(text_feature) > 0 and \
                        copy_name_feature is not None and len(copy_name_feature) > 0:
                        if text_feature != copy_name_feature:
-                            continue
+                            if copy_name_feature.lower() not in text.lower().split():
+                                continue
                    if matching_type == "share":
                        if text_share_short_name is not None and len(text_share_short_name) > 0 and \
                            copy_name_short_name is not None and len(copy_name_short_name) > 0:
@@ -263,6 +268,10 @@ def get_share_part_list(text_list: list):
            text_split = text.split("funds")
        if len(text_split) == 1:
            text_split = text.split("Portfolio")
+        if len(text_split) == 1:
+            text_split = text.split("Bond")
+        if len(text_split) == 1:
+            text_split = text.split("Bonds")
        if len(text_split) > 1:
            share_part_list.append(text_split[-1].strip())
        else: