diff --git a/main.py b/main.py index ffb44f6..ac34f78 100644 --- a/main.py +++ b/main.py @@ -697,8 +697,10 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): doc_id = "337293427" - raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares" - raw_share_name = "Institutional F Shares" + # KBC Bonds Inflation-Linked Bonds Distribution Shares + # KBC Bonds Inflation-Linked Bonds Institutional B Shares + raw_name = "KBC Bonds Inflation-Linked Bonds Institutional B Shares" + raw_share_name = "Institutional B Shares" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -711,7 +713,7 @@ def test_mapping_raw_name(): mapping_info = data_mapping.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, - parent_id="FSGBR0536J", + parent_id="FSGBR051XK", matching_type="share", process_cache=process_cache ) @@ -862,6 +864,7 @@ if __name__ == "__main__": # test_calculate_metrics() # test_replace_abbrevation() # test_translate_pdf() + # test_mapping_raw_name() pdf_folder = r"/data/emea_ar/pdf/" page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" @@ -1194,12 +1197,12 @@ if __name__ == "__main__": "534535767" ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["334584772"] + # special_doc_id_list = ["337293427"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = True + re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True calculate_metrics = False extract_ways = ["text"] @@ -1222,4 +1225,4 @@ if __name__ == "__main__": ) # test_data_extraction_metrics() - # test_mapping_raw_name() + diff --git a/utils/biz_utils.py b/utils/biz_utils.py index acd5d09..7199f7e 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -268,10 +268,40 @@ def get_most_similar_name(text: str, except Exception as e: print(e) + compare_text = text try: - similarity = get_jacard_similarity(text, - copy_name, - need_remove_numeric_characters=False) + text_split = text.split() + text_split_lower = text.lower().split() + copy_name_split_lower = copy_name.lower().split() + if copy_name_feature == "accumulation" and \ + (text_feature is None or len(text_feature) == 0 or + text_feature in ["capitalisation", "institutional"] + or "capitalisation" in text_split_lower or "institutional" in text_split_lower): + if "capitalisation" not in copy_name_split_lower: + compare_text = " ".join([split for split in text_split + if split.lower() not in ["cap", "cap.", "capitalisation"]]) + text_split = compare_text.split() + if "institutional" not in copy_name_split_lower: + compare_text = " ".join([split for split in text_split + if split.lower() not in ["inst", "inst.", "institutional"]]) + text_split = compare_text.split() + if text_feature is not None and len(text_feature) > 0: + compare_text = " ".join([split for split in text_split + if split.lower() != text_feature]) + compare_text += " accumulation" + text_feature = "accumulation" + elif copy_name_feature == "income" and \ + (text_feature is None or len(text_feature) == 0 or text_feature == "distribution"): + if "dist" in text_split_lower or "dist." in text_split_lower or "distribution" in text_split_lower: + compare_text = " ".join([split for split in text_split + if split.lower() not in ["dist", "dist.", "distribution"]]) + compare_text += " income" + text_feature = "income" + else: + pass + similarity = get_jacard_similarity(compare_text, + copy_name, + need_remove_numeric_characters=False) except Exception as e: print(e) print_exc() @@ -280,9 +310,9 @@ def get_most_similar_name(text: str, return full_name, similarity copy_name_2 = replace_abbrevation(copy_name) if copy_name != copy_name_2: - similarity_2 = get_jacard_similarity(text, - copy_name_2, - need_remove_numeric_characters=False) + similarity_2 = get_jacard_similarity(compare_text, + copy_name_2, + need_remove_numeric_characters=False) if similarity_2 > similarity: similarity = similarity_2 if similarity > max_similarity: @@ -296,7 +326,7 @@ def get_most_similar_name(text: str, if text_feature != copy_name_feature: if text_feature.lower() not in copy_name.lower().split() and \ copy_name_feature.lower() != "accmulation" and \ - copy_name_feature.lower() not in text.lower().split(): + copy_name_feature.lower() not in compare_text.lower().split(): continue if matching_type == "share": if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \ @@ -309,6 +339,7 @@ def get_most_similar_name(text: str, for compare_short in copy_name_short_name_list: if compare_short not in text_share_short_name_list: # some short word is in fund name, but not belong to share name + if compare_short.upper() not in raw_fund_name_split: short_name_invalid = True break