optimize investment mapping algorithm

2024-10-08 23:53:55 -05:00 · 2024-10-08 23:53:55 -05:00 · 04a2409c58
parent aa2c2332ae
commit 04a2409c58
3 changed files with 215 additions and 125 deletions
--- a/core/data_mapping.py
+++ b/core/data_mapping.py
@ -174,6 +174,7 @@ class DataMapping:
                            investment_info = self.matching_with_database(
                                raw_name=raw_name, 
                                raw_share_name=raw_share_name, 
+                                raw_fund_name=raw_fund_name,
                                parent_id=fund_id, 
                                matching_type="share",
                                process_cache=process_cache
@ -254,6 +255,7 @@ class DataMapping:
        self, 
        raw_name: str, 
        raw_share_name: str = None, 
+        raw_fund_name: str = None,
        parent_id: str = None, 
        matching_type: str = "fund",
        process_cache: dict = {}
@ -328,9 +330,14 @@ class DataMapping:
                    raw_name, 
                    doc_compare_name_list, 
                    share_name=raw_share_name, 
+                    fund_name=raw_fund_name,
                    matching_type=matching_type,
                    process_cache=process_cache)
-                if max_similarity is not None and max_similarity >= 0.9:
+                if matching_type == "fund":
+                    threshold = 0.7
+                else:
+                    threshold = 0.9
+                if max_similarity is not None and max_similarity >= threshold:
                    data_info["id"] = doc_compare_mapping[
                        doc_compare_mapping[compare_name_dp] == max_similarity_name
                    ][compare_id_dp].values[0]
@ -344,6 +351,7 @@ class DataMapping:
                    raw_name, 
                    provider_compare_name_list, 
                    share_name=raw_share_name,
+                    fund_name=raw_fund_name,
                    matching_type=matching_type, 
                    pre_common_word_list=pre_common_word_list,
                    process_cache=process_cache
--- a/main.py
+++ b/main.py
@ -338,7 +338,7 @@ def batch_start_job(


        if calculate_metrics:
-            prediction_sheet_name = "mapping_data"
+            prediction_sheet_name = "total_mapping_data"
            ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
            ground_truth_sheet_name = "mapping_data"
            metrics_output_folder = r"/data/emea_ar/output/metrics/"
@ -600,9 +600,9 @@ def test_data_extraction_metrics():


 def test_mapping_raw_name():
-    doc_id = "394778487"
-    raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD"
-    raw_share_name = "Z Gross QD USD"
+    doc_id = "382366116"
+    raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
+    raw_share_name = "EUR I"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@ -615,7 +615,7 @@ def test_mapping_raw_name():
    mapping_info = data_mapping.matching_with_database(
        raw_name=raw_name,
        raw_share_name=raw_share_name,
-        parent_id="FS0000H1C9", 
+        parent_id=None, 
        matching_type="share",
        process_cache=process_cache
    )
@ -697,100 +697,102 @@ if __name__ == "__main__":
    #     "479793787",
    #     "471641628",
    # ]
-    # check_db_mapping_doc_id_list = [
-    #     "292989214",
-    #     "316237292",
-    #     "321733631",
-    #     "323390570",
-    #     "327956364",
-    #     "332223498",
-    #     "333207452",
-    #     "334718372",
-    #     "344636875",
-    #     "362246081",
-    #     "366179419",
-    #     "380945052",
-    #     "382366116",
-    #     "387202452",
-    #     "389171486",
-    #     "391456740",
-    #     "391736837",
-    #     "394778487",
-    #     "401684600",
-    #     "402113224",
-    #     "402181770",
-    #     "402397014",
-    #     "405803396",
-    #     "445102363",
-    #     "445256897",
-    #     "448265376",
-    #     "449555622",
-    #     "449623976",
-    #     "458291624",
-    #     "458359181",
-    #     "463081566",
-    #     "469138353",
-    #     "471641628",
-    #     "476492237",
-    #     "478585901",
-    #     "478586066",
-    #     "479042264",
-    #     "479042269",
-    #     "479793787",
-    #     "481475385",
-    #     "483617247",
-    #     "486378555",
-    #     "486383912",
-    #     "492121213",
-    #     "497497599",
-    #     "502693599"
-    # ]
-    
    check_db_mapping_doc_id_list = [
-        "334584772",
-        "406913630",
-        "407275419",
-        "337937633",
-        "337293427",
-        "334584772",
-        "404712928",
-        "451063582",
-        "451878128",
-        "425595958",
-        "536344026",
-        "532422548",
-        "423418540",
-        "423418395",
-        "532998065",
-        "540307575",
-        "423395975",
-        "508704368",
-        "481482392",
-        "466580448",
-        "423365707",
-        "423364758",
-        "422761666",
-        "422760156",
-        "422760148",
-        "422686965",
-        "492029971",
-        "510300817",
-        "512745032",
-        "514213638",
-        "527525440",
-        "534535767"
+        "292989214",
+        "316237292",
+        "321733631",
+        "323390570",
+        "327956364",
+        "332223498",
+        "333207452",
+        "334718372",
+        "344636875",
+        "362246081",
+        "366179419",
+        "380945052",
+        "382366116",
+        "387202452",
+        "389171486",
+        "391456740",
+        "391736837",
+        "394778487",
+        "401684600",
+        "402113224",
+        "402181770",
+        "402397014",
+        "405803396",
+        "445102363",
+        "445256897",
+        "448265376",
+        "449555622",
+        "449623976",
+        "458291624",
+        "458359181",
+        "463081566",
+        "469138353",
+        "471641628",
+        "476492237",
+        "478585901",
+        "478586066",
+        "479042264",
+        "479042269",
+        "479793787",
+        "481475385",
+        "483617247",
+        "486378555",
+        "486383912",
+        "492121213",
+        "497497599",
+        "502693599"
    ]
+    
+    # check_db_mapping_doc_id_list = [
+    #     "334584772",
+    #     "406913630",
+    #     "407275419",
+    #     "337937633",
+    #     "337293427",
+    #     "334584772",
+    #     "404712928",
+    #     "451063582",
+    #     "451878128",
+    #     "425595958",
+    #     "536344026",
+    #     "532422548",
+    #     "423418540",
+    #     "423418395",
+    #     "532998065",
+    #     "540307575",
+    #     "423395975",
+    #     "508704368",
+    #     "481482392",
+    #     "466580448",
+    #     "423365707",
+    #     "423364758",
+    #     "422761666",
+    #     "422760156",
+    #     "422760148",
+    #     "422686965",
+    #     "492029971",
+    #     "510300817",
+    #     "512745032",
+    #     "514213638",
+    #     "527525440",
+    #     "534535767"
+    # ]
    # special_doc_id_list = check_mapping_doc_id_list
    special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["337937633"]
+    # special_doc_id_list = ["394778487"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
-    re_run_mapping_data = False
+    re_run_mapping_data = True
    force_save_total_data = True
-    calculate_metrics = False
+    calculate_metrics = True

    extract_ways = ["text"]
+    pdf_folder = r"/data/emea_ar/small_pdf/"
+    # pdf_folder = r"/data/emea_ar/pdf/"
    for extract_way in extract_ways:
        batch_start_job(
            pdf_folder,
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -81,6 +81,7 @@ def clean_text(text: str) -> str:
 def get_most_similar_name(text: str, 
                          name_list: list, 
                          share_name: str = None, 
+                          fund_name: str = None,
                          matching_type="share", 
                          pre_common_word_list: list = None,
                          process_cache: dict = None) -> str:
@ -116,6 +117,12 @@ def get_most_similar_name(text: str,
        text = text.strip()
        text = remove_special_characters(text)
        text = replace_abbrevation(text)
+        raw_fund_name_split = []
+        if fund_name is not None and len(fund_name.strip()) > 0:
+            fund_name = fund_name.strip()
+            fund_name = remove_special_characters(fund_name)
+            raw_fund_name_split = fund_name.upper().split()
+            
        if share_name is not None:
            share_name = remove_special_characters(share_name)
            share_name = replace_abbrevation(share_name)
@ -171,11 +178,13 @@ def get_most_similar_name(text: str,
                    text_currency = cache.get("share_currency")
                else:
                    if share_name is not None and len(share_name.strip()) > 0:
-                        text_share_short_name_list = get_share_short_name_from_text(share_name)
+                        text_share_short_name_list = get_share_short_name_from_text(share_name,
+                                                                                    confirm_text_share=True)
                        text_feature = get_share_feature_from_text(share_name)
                        text_currency = get_currency_from_text(share_name)
                    else:
-                        text_share_short_name_list = get_share_short_name_from_text(text)
+                        text_share_short_name_list = get_share_short_name_from_text(text,
+                                                                                    confirm_text_share=True)
                        text_feature = get_share_feature_from_text(text)
                        text_currency = get_currency_from_text(text)
                    # sort text_share_short_name_list
@ -187,12 +196,14 @@ def get_most_similar_name(text: str,
                    }
            else:
                if share_name is not None and len(share_name.strip()) > 0:
-                    text_share_short_name_list = get_share_short_name_from_text(share_name)
+                    text_share_short_name_list = get_share_short_name_from_text(share_name,
+                                                                                confirm_text_share=True)
                    text_share_short_name_list.sort()
                    text_feature = get_share_feature_from_text(share_name)
                    text_currency = get_currency_from_text(share_name)
                else:
-                    text_share_short_name_list = get_share_short_name_from_text(text)
+                    text_share_short_name_list = get_share_short_name_from_text(text,
+                                                                                confirm_text_share=True)
                    text_feature = get_share_feature_from_text(text)
                    text_currency = get_currency_from_text(text)
        
@ -203,24 +214,9 @@ def get_most_similar_name(text: str,
                continue
            copy_name = remove_special_characters(copy_name)
            copy_name = split_words_without_space(copy_name)
-            try:
-                similarity = get_jacard_similarity(text,
-                                                copy_name,
-                                                need_remove_numeric_characters=False)
-            except Exception as e:
-                print(e)
-                print_exc()
-                similarity = 0
-            if similarity == 1:
-                return full_name, similarity
-            copy_name_2 = replace_abbrevation(copy_name)
-            if copy_name != copy_name_2:
-                similarity_2 = get_jacard_similarity(text,
-                                            copy_name_2,
-                                            need_remove_numeric_characters=False)
-                if similarity_2 > similarity:
-                    similarity = similarity_2
-            if similarity > max_similarity:
+            copy_name_short_name_list = None
+            copy_name_feature = None
+            copy_name_currency = None
            if matching_type == "share":
                if  process_cache is not None and isinstance(process_cache, dict):
                    if process_cache.get(copy_name, None) is not None:
@ -244,7 +240,45 @@ def get_most_similar_name(text: str,
                    copy_name_short_name_list.sort()
                    copy_name_feature = get_share_feature_from_text(copy_share_name)
                    copy_name_currency = get_currency_from_text(copy_share_name)
+                try:
+                    if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
+                        copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
+                        updated_text_share_short_name_list, updated_copy_name_short_name_list = \
+                            compare_both_short_name(text_share_short_name_list, copy_name_short_name_list)
                    
+                        if updated_text_share_short_name_list != text_share_short_name_list:
+                            text = ' '.join([split for split in text.split()
+                                                if split not in text_share_short_name_list])
+                            text += ' ' + ' '.join(updated_text_share_short_name_list)
+                            text_share_short_name_list = updated_text_share_short_name_list
+                        
+                        if updated_copy_name_short_name_list != copy_name_short_name_list:
+                            copy_name = ' '.join([split for split in copy_name.split()
+                                                if split not in copy_name_short_name_list])
+                            copy_name += ' ' + ' '.join(updated_copy_name_short_name_list)
+                            copy_name_short_name_list = updated_copy_name_short_name_list
+                except Exception as e:
+                    print(e)
+            
+            try:
+                similarity = get_jacard_similarity(text,
+                                                copy_name,
+                                                need_remove_numeric_characters=False)
+            except Exception as e:
+                print(e)
+                print_exc()
+                similarity = 0
+            if similarity == 1:
+                return full_name, similarity
+            copy_name_2 = replace_abbrevation(copy_name)
+            if copy_name != copy_name_2:
+                similarity_2 = get_jacard_similarity(text,
+                                            copy_name_2,
+                                            need_remove_numeric_characters=False)
+                if similarity_2 > similarity:
+                    similarity = similarity_2
+            if similarity > max_similarity:
+                if matching_type == "share":                       
                    if text_currency is not None and len(text_currency) > 0 and \
                        copy_name_currency is not None and len(copy_name_currency) > 0:
                        if text_currency != copy_name_currency:
@ -257,12 +291,18 @@ def get_most_similar_name(text: str,
                    if matching_type == "share":
                        if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
                            copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
-                                raw_short_not_in_compare = False
+                                short_name_invalid = False
                                for short in text_share_short_name_list:
                                    if short not in copy_name_short_name_list:
-                                        raw_short_not_in_compare = True
+                                        short_name_invalid = True
                                        break
-                                if raw_short_not_in_compare:
+                                for compare_short in copy_name_short_name_list:
+                                    if compare_short not in text_share_short_name_list:
+                                        # a short name missing from the text's share short names is tolerated only when it is a word of the raw fund name
+                                        if compare_short.upper() not in raw_fund_name_split:
+                                            short_name_invalid = True
+                                            break
+                                if short_name_invalid:
                                    continue
                max_similarity = similarity
                max_similarity_full_name = full_name
@ -289,6 +329,43 @@ def get_most_similar_name(text: str,
        return None, 0.0


+def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
+    """Reconcile the short-name lists of the raw text and of a candidate name.
+
+    Both inputs are copied first and only the copies are handed to
+    verify_short_name_container, which edits its first argument in place.
+    The text copy is checked against the candidate copy, and then the
+    candidate copy is checked against the already-updated text copy.
+
+    Args:
+        text_short_name_list: short names extracted from the raw text.
+        compare_short_name_list: short names extracted from the candidate name.
+
+    Returns:
+        A tuple (updated text list, updated candidate list).
+    """
+    # NOTE(review): deepcopy is presumably copy.deepcopy; the import is not visible in this hunk.
    copy_text_short_name_list = deepcopy(text_short_name_list)
    copy_compare_short_name_list = deepcopy(compare_short_name_list)
+    # First pass: merge single-character entries of the text list using the candidate list.
    copy_text_short_name_list = verify_short_name_container(copy_text_short_name_list, 
                                                            copy_compare_short_name_list)
+    # Second pass: same merge for the candidate list, against the updated text list.
    copy_compare_short_name_list = verify_short_name_container(copy_compare_short_name_list, 
                                                               copy_text_short_name_list)
    return copy_text_short_name_list, copy_compare_short_name_list
+
+
+def verify_short_name_container(left_short_name_list: list, right_short_name_list: list):
+    """Merge single-character short names of the left list into a combined
+    entry found in the right list.
+
+    When the left list holds more than one single-character entry, every
+    right-list entry whose length equals the number of those entries and
+    which contains each of them replaces them: the single-character entries
+    are removed from the left list and the right-list entry is appended.
+    For example, left ["A", "C"] with right ["AC"] yields ["AC"].
+
+    The left list is modified in place and also returned. The loop does not
+    stop at the first match, so more than one right-list entry can be
+    appended.
+
+    Args:
+        left_short_name_list: list to update (mutated in place).
+        right_short_name_list: list searched for combined entries.
+
+    Returns:
+        left_short_name_list, possibly modified.
+    """
    length_1_over_1 = False
    length_1_count = 0
    length_1_list = []
+    # Collect the single-character entries of the left list.
    for short_name in left_short_name_list:
        if len(short_name) == 1:
            length_1_count += 1
            length_1_list.append(short_name)
+    # Merging is only attempted when there are at least two single-character entries.
    if length_1_count > 1:
        length_1_over_1 = True
    
    if length_1_over_1:
        for compare_short_name in right_short_name_list:
+            # A candidate must be exactly as long as the number of single-character entries.
            if len(compare_short_name) == length_1_count:
                all_in = True
+                # Every single-character entry must occur inside the candidate (substring test).
                for short_name in length_1_list:
                    if short_name not in compare_short_name:
                        all_in = False
                        break
                if all_in:
+                    # Replace the single-character entries with the combined candidate.
+                    # list.remove drops one occurrence per call; the membership check
+                    # guards against entries already removed by an earlier match.
                    for short_name in length_1_list:
                        if short_name in left_short_name_list:
                            left_short_name_list.remove(short_name)
                    left_short_name_list.append(compare_short_name)
    return left_short_name_list
+
+
 def get_share_part_list(text_list: list):
    share_part_list = []
    for text in text_list:
@ -312,7 +389,7 @@ def get_share_part_list(text_list: list):
    return share_part_list
    

-def get_share_short_name_from_text(text: str):
+def get_share_short_name_from_text(text: str, confirm_text_share: bool = False):
    if text is None or len(text.strip()) == 0:
        return None
    text = remove_special_characters(text.strip())
@ -321,9 +398,12 @@ def get_share_short_name_from_text(text: str):
    
    count = 0
    share_short_name_list = []
-    
+    if confirm_text_share:
+        count_threshold = 6
+    else:
+        count_threshold = 4
    for split in text_split[::-1]:
-        if count == 4:
+        if count == count_threshold:
            break
        if split.lower() not in temp_share_features and \
            split.upper() not in total_currency_list: