optimize share feature judgment logic:

accumulation with capitalisation and institutional; income with distribution. Document: 337293427
2024-12-02 13:11:49 -06:00 · 2024-12-02 13:11:49 -06:00 · c146497052
parent 352886ade2
commit c146497052
2 changed files with 48 additions and 14 deletions
--- a/main.py
+++ b/main.py
@ -697,8 +697,10 @@ def test_data_extraction_metrics():

 def test_mapping_raw_name():
    doc_id = "337293427"
-    raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares"
-    raw_share_name = "Institutional F Shares"
+    # KBC Bonds Inflation-Linked Bonds Distribution Shares
+    # KBC Bonds Inflation-Linked Bonds Institutional B Shares
+    raw_name = "KBC Bonds Inflation-Linked Bonds Institutional B Shares"
+    raw_share_name = "Institutional B Shares"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@ -711,7 +713,7 @@ def test_mapping_raw_name():
    mapping_info = data_mapping.matching_with_database(
        raw_name=raw_name,
        raw_share_name=raw_share_name,
-        parent_id="FSGBR0536J", 
+        parent_id="FSGBR051XK", 
        matching_type="share",
        process_cache=process_cache
    )
@ -862,6 +864,7 @@ if __name__ == "__main__":
    # test_calculate_metrics()
    # test_replace_abbrevation()
    # test_translate_pdf()
+    # test_mapping_raw_name()
    pdf_folder = r"/data/emea_ar/pdf/"
    page_filter_ground_truth_file = (
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -1194,12 +1197,12 @@ if __name__ == "__main__":
        "534535767"
    ]
    special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["334584772"]
+    # special_doc_id_list = ["337293427"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = True
+    re_run_extract_data = False
    re_run_mapping_data = True
-    force_save_total_data = False
+    force_save_total_data = True
    calculate_metrics = False

    extract_ways = ["text"]
@ -1222,4 +1225,4 @@ if __name__ == "__main__":
        )

    # test_data_extraction_metrics()
-    # test_mapping_raw_name()
+    
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -268,10 +268,40 @@ def get_most_similar_name(text: str,
                except Exception as e:
                    print(e)
            
+            compare_text = text
            try:
-                similarity = get_jacard_similarity(text,
-                                                copy_name,
-                                                need_remove_numeric_characters=False)
+                text_split = text.split()
+                text_split_lower = text.lower().split()
+                copy_name_split_lower = copy_name.lower().split()
+                if copy_name_feature == "accumulation" and \
+                    (text_feature is None or len(text_feature) == 0 or 
+                     text_feature in ["capitalisation", "institutional"]
+                     or "capitalisation" in text_split_lower or "institutional" in text_split_lower):
+                    if "capitalisation" not in copy_name_split_lower:
+                        compare_text = " ".join([split for split in text_split
+                                                 if split.lower() not in ["cap", "cap.", "capitalisation"]])
+                        text_split = compare_text.split()
+                    if "institutional" not in copy_name_split_lower:
+                        compare_text = " ".join([split for split in text_split
+                                                 if split.lower() not in ["inst", "inst.", "institutional"]])
+                        text_split = compare_text.split()
+                    if text_feature is not None and len(text_feature) > 0:
+                        compare_text = " ".join([split for split in text_split
+                                                 if split.lower() != text_feature])
+                    compare_text += " accumulation"
+                    text_feature = "accumulation"
+                elif copy_name_feature == "income" and \
+                    (text_feature is None or len(text_feature) == 0 or text_feature == "distribution"):
+                    if "dist" in text_split_lower or "dist." in text_split_lower or "distribution" in text_split_lower:
+                        compare_text = " ".join([split for split in text_split
+                                                 if split.lower() not in ["dist", "dist.", "distribution"]])
+                        compare_text += " income"
+                        text_feature = "income"
+                else:
+                    pass
+                similarity = get_jacard_similarity(compare_text,
+                                                   copy_name,
+                                                   need_remove_numeric_characters=False)
            except Exception as e:
                print(e)
                print_exc()
@ -280,9 +310,9 @@ def get_most_similar_name(text: str,
                return full_name, similarity
            copy_name_2 = replace_abbrevation(copy_name)
            if copy_name != copy_name_2:
-                similarity_2 = get_jacard_similarity(text,
-                                            copy_name_2,
-                                            need_remove_numeric_characters=False)
+                similarity_2 = get_jacard_similarity(compare_text,
+                                                     copy_name_2,
+                                                     need_remove_numeric_characters=False)
                if similarity_2 > similarity:
                    similarity = similarity_2
            if similarity > max_similarity:
@ -296,7 +326,7 @@ def get_most_similar_name(text: str,
                        if text_feature != copy_name_feature:
                            if text_feature.lower() not in copy_name.lower().split() and \
                                copy_name_feature.lower() != "accmulation" and \
-                                    copy_name_feature.lower() not in text.lower().split():
+                                    copy_name_feature.lower() not in compare_text.lower().split():
                                continue
                    if matching_type == "share":
                        if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
@ -309,6 +339,7 @@ def get_most_similar_name(text: str,
                                for compare_short in copy_name_short_name_list:
                                    if compare_short not in text_share_short_name_list:
                                        # some short word is in fund name, but not belong to share name
+                                        
                                        if compare_short.upper() not in raw_fund_name_split:
                                            short_name_invalid = True
                                            break