diff --git a/core/data_mapping.py b/core/data_mapping.py index c385158..56ad3b7 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -283,16 +283,25 @@ class DataMapping: doc_compare_mapping = self.doc_fund_class_mapping[ self.doc_fund_class_mapping["FundId"] == parent_id ] - doc_compare_name_list = ( - doc_compare_mapping["ShareClassName"].unique().tolist() - ) + if len(doc_compare_mapping) == 0: + doc_compare_name_list = self.doc_share_name_list + doc_compare_mapping = self.doc_fund_class_mapping + else: + doc_compare_name_list = ( + doc_compare_mapping["ShareClassName"].unique().tolist() + ) provider_compare_mapping = self.provider_fund_class_mapping[ self.provider_fund_class_mapping["FundId"] == parent_id ] - provider_compare_name_list = ( - provider_compare_mapping["ShareClassName"].unique().tolist() - ) + if len(provider_compare_mapping) == 0 or \ + len(provider_compare_mapping) < len(doc_compare_mapping): + provider_compare_name_list = doc_compare_name_list + provider_compare_mapping = doc_compare_mapping + else: + provider_compare_name_list = ( + provider_compare_mapping["ShareClassName"].unique().tolist() + ) else: doc_compare_name_list = self.doc_share_name_list doc_compare_mapping = self.doc_fund_class_mapping diff --git a/main.py b/main.py index 4a52cfe..de85dfc 100644 --- a/main.py +++ b/main.py @@ -574,8 +574,8 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "382366116" - raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I" + doc_id = "445102363" + raw_name = "Danske Invest SICAV Global Portfolio Solution – Defensive Class X" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -692,16 +692,21 @@ if __name__ == "__main__": "394778487", "401684600", "402113224", - "402181770" + "402181770", + "402397014", + "405803396", + "445102363", + "445256897", + "448265376" ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["402397014"] + # special_doc_id_list = ["391736837"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True extract_ways = ["text"] for extract_way in extract_ways: diff --git a/utils/biz_utils.py b/utils/biz_utils.py index a6e3487..1df64d7 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -169,6 +169,7 @@ def get_most_similar_name(text: str, text_currency = get_currency_from_text(share_name) # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}") + same_max_similarity_name_list = [] for full_name, copy_name in zip(name_list , copy_name_list): copy_name = remove_special_characters(copy_name) copy_name = split_words_without_space(copy_name) @@ -219,8 +220,20 @@ def get_most_similar_name(text: str, continue max_similarity = similarity max_similarity_full_name = full_name + same_max_similarity_name_list = [] + elif matching_type == "fund" and max_similarity > 0 and max_similarity == similarity: + if full_name is not None and max_similarity_full_name is not None and \ + len(full_name.split()) > len(max_similarity_full_name.split()): + max_similarity_full_name = full_name + same_max_similarity_name_list = [] + else: + if full_name is not None: + same_max_similarity_name_list.append(full_name) if max_similarity == 1: break + # if there are multiple names with the same similarity, return None + if len(same_max_similarity_name_list) > 0: + return None, 0.0 if max_similarity < 0.35: return None, max_similarity return max_similarity_full_name, max_similarity @@ -561,6 +574,8 @@ def replace_abbrevation(text: str): new_text_splits.append('Advantage') elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']: new_text_splits.append('Hedged') + elif split.lower() in ['unhgd']: + split = "" elif split.lower() in ['cl', 'cl.']: new_text_splits.append('Class') elif split.lower() in ['ser', 'ser.']: