diff --git a/main.py b/main.py index 291b558..5d6977c 100644 --- a/main.py +++ b/main.py @@ -574,9 +574,9 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "483617247" - raw_name = "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc" - raw_share_name = "Class I sw EUR - Acc" + doc_id = "394778487" + raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD" + raw_share_name = "Z Gross QD USD" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -589,7 +589,7 @@ def test_mapping_raw_name(): mapping_info = data_mapping.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, - parent_id=None, + parent_id="FS0000H1C9", matching_type="share", process_cache=process_cache ) @@ -705,7 +705,7 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["483617247"] + # special_doc_id_list = ["380945052", "382366116", "387202452", "394778487", "469138353"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index f1ff20f..f7cfc5d 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -172,7 +172,7 @@ def get_most_similar_name(text: str, if process_cache is not None and isinstance(process_cache, dict): if process_cache.get(text, None) is not None: cache = process_cache.get(text) - text_share_short_name = cache.get("share_short_name") + text_share_short_name_list = cache.get("share_short_name") text_feature = cache.get("share_feature") text_currency = cache.get("share_currency") else: @@ -212,6 +212,8 @@ def get_most_similar_name(text: str, print(e) print_exc() similarity = 0 + if similarity == 1: + return full_name, similarity copy_name_2 = replace_abbrevation(copy_name) if copy_name != copy_name_2: similarity_2 = get_jacard_similarity(text, @@ -229,7 +231,8 @@ def get_most_similar_name(text: str, copy_name_currency = cache.get("share_currency") else: copy_name_short_name_list = get_share_short_name_from_text(copy_share_name) - copy_name_short_name_list.sort() + if copy_name_short_name_list is not None: + copy_name_short_name_list.sort() copy_name_feature = get_share_feature_from_text(copy_share_name) copy_name_currency = get_currency_from_text(copy_share_name) process_cache[copy_name] = { @@ -255,8 +258,13 @@ def get_most_similar_name(text: str, if matching_type == "share": if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \ copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0: - if text_share_short_name_list != copy_name_short_name_list: - continue + raw_short_not_in_compare = False + for short in text_share_short_name_list: + if short not in copy_name_short_name_list: + raw_short_not_in_compare = True + break + if raw_short_not_in_compare: + continue max_similarity = similarity max_similarity_full_name = full_name same_max_similarity_name_list = [] @@ -285,19 +293,26 @@ def get_most_similar_name(text: str, def get_share_part_list(text_list: list): share_part_list = [] for text in text_list: - text_split = text.split("Fund") + text_split = text.split("Funds") if len(text_split) == 1: - text_split = text.split("funds") + text_split = text.split("Fund") if len(text_split) == 1: text_split = text.split("Portfolio") - if len(text_split) == 1: - text_split = text.split("Bond") if len(text_split) == 1: text_split = text.split("Bonds") + if len(text_split) == 1: + text_split = text.split("Bond") if len(text_split) > 1: - share_part_list.append(text_split[-1].strip()) + share_part_text = text_split[-1].strip() else: - share_part_list.append(text) + share_part_text = text.strip() + share_part_text = ' '.join([split for split in share_part_text.split() + if remove_special_characters(split).lower() + not in ['fund', "funds", 'portfolio', + 'bond', 'bonds', + 'class', 'classes', + 'share', 'shares']]) + share_part_list.append(share_part_text) return share_part_list @@ -316,9 +331,20 @@ def get_share_short_name_from_text(text: str): break if split.lower() not in temp_share_features and \ split.upper() not in total_currency_list: - if len(split) <= 3 and split.upper() == split: + if len(split) <= 3: share_short_name_list.append(split.upper()) count += 1 + + if len(share_short_name_list) > 1: + remove_number = [] + for short_name in share_short_name_list[::-1]: + if short_name.isdigit(): + remove_number.append(short_name) + else: + break + for remove in remove_number: + if remove in share_short_name_list: + share_short_name_list.remove(remove) return share_short_name_list def get_share_feature_from_text(text: str): @@ -481,6 +507,7 @@ def remove_common_word(text_list: list): # the result is ['Global', 'Growth'] common_word_list = [] new_text_splits_list = [text.split() for text in new_text_list] + with_common_word = False for i in range(len(new_text_splits_list)): for j in range(i+1, len(new_text_splits_list)): if common_word_list is None or len(common_word_list) == 0: @@ -489,6 +516,12 @@ def remove_common_word(text_list: list): else: common_word_list = list( set(common_word_list).intersection(set(new_text_splits_list[j]))) + if len(common_word_list) > 0: + with_common_word = True + if with_common_word and len(common_word_list) == 0: + break + if with_common_word and len(common_word_list) == 0: + break remove_list = [] # if exists the share name and currency name, remove from the list @@ -631,7 +664,8 @@ def get_beginning_common_words(text_list: list): def replace_abbrevation(text: str): if text is None or len(text.strip()) == 0: return text - text = text.strip() + text = text.replace('(', ' ').replace(')', ' ').replace('-', ' ') + text = re.sub(r'\s+', ' ', text).strip() if 'swiss franc' in text.lower(): text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE) elif 'us dollar' in text.lower(): @@ -710,6 +744,8 @@ def replace_abbrevation(text: str): new_text_splits.append('US') elif split.lower() in ['nc', 'nc.']: new_text_splits.append('no trail') + elif split.lower() in ['non']: + new_text_splits.append('Not') else: new_text_splits.append(split)