From 3bb13947af98f55604ec788e3c284c81b0f893e7 Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 2 Oct 2024 13:25:08 -0500 Subject: [PATCH] Optimize mapping algorithm: For multiple currencies in the fund/share name, if USD exists, remove it. Fix the issue with splitting words without spaces. If there is no currency in the share class name, try to get the currency from the document mapping entries that have the same fund name and the same short share class name. --- main.py | 12 +++--- utils/biz_utils.py | 101 +++++++++++++++++++++++++++++++-------------- 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/main.py b/main.py index 3149f41..fe56b08 100644 --- a/main.py +++ b/main.py @@ -574,9 +574,9 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "481475385" - raw_name = "Emerging Markets Fund A-ACC Shares USD" - raw_share_name = "A-ACC Shares USD" + doc_id = "389171486" + raw_name = "Nordea 2 Emerging Market Local Debt Enhanced Fund Y - Shares" + raw_share_name = "Y - Shares" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -589,7 +589,7 @@ def test_mapping_raw_name(): mapping_info = data_mapping.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, - parent_id=None, + parent_id="FS00009Q8R", matching_type="share", process_cache=process_cache ) @@ -705,12 +705,12 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["481475385"] + special_doc_id_list = ["483617247"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False extract_ways = ["text"] for extract_way in extract_ways: diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 1a671ff..1e48699 100644 --- 
a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -127,7 +127,7 @@ def get_most_similar_name(text: str, new_splits.extend(split_words_without_space(split).split()) else: new_splits.append(split) - + text = ' '.join(new_splits) lower_new_splits = [split.lower() for split in new_splits] for word in common_word_list: if word not in lower_new_splits: @@ -136,30 +136,35 @@ def get_most_similar_name(text: str, temp_splits = copy_name_list[i].split() copy_name_list[i] = ' '.join([split for split in temp_splits if remove_special_characters(split).lower() != word]) - final_splits = [] - for split in new_splits: - if split.lower() not in ['fund', "funds", 'portfolio', - 'class', 'classes', - 'share', 'shares']: - final_splits.append(split) - text = ' '.join(final_splits) - - copy_share_name_list = get_share_part_list(copy_name_list) - - for i in range(len(copy_name_list)): - temp_splits = copy_name_list[i].split() - copy_name_list[i] = ' '.join([split for split in temp_splits - if remove_special_characters(split).lower() - not in ['fund', "funds", 'portfolio', - 'class', 'classes', - 'share', 'shares']]) max_similarity = 0 max_similarity_full_name = None text = remove_special_characters(text) if matching_type == "share": text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list) + + text = ' '.join([split for split in text.split() + if split.lower() not in ['fund', "funds", 'portfolio', + 'bond', 'bonds', + 'class', 'classes', + 'share', 'shares']]) + if share_name is not None: + share_name = ' '.join([split for split in share_name.split() + if split.lower() not in ['fund', "funds", 'portfolio', + 'bond', 'bonds', + 'class', 'classes', + 'share', 'shares']]) + + copy_share_name_list = get_share_part_list(copy_name_list) + for i in range(len(copy_name_list)): + temp_splits = copy_name_list[i].split() + copy_name_list[i] = ' '.join([split for split in temp_splits + if remove_special_characters(split).lower() + not in ['fund', "funds", 'portfolio', 
+ 'bond', 'bonds', + 'class', 'classes', + 'share', 'shares']]) text_currency = None text_feature = None text_share_short_name = None @@ -192,11 +197,18 @@ def get_most_similar_name(text: str, # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}") same_max_similarity_name_list = [] for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list): + if not isinstance(copy_name, str) or len(copy_name.strip()) == 0: + continue copy_name = remove_special_characters(copy_name) copy_name = split_words_without_space(copy_name) - similarity = get_jacard_similarity(text, - copy_name, - need_remove_numeric_characters=False) + try: + similarity = get_jacard_similarity(text, + copy_name, + need_remove_numeric_characters=False) + except Exception as e: + print(e) + print_exc() + similarity = 0 copy_name_2 = replace_abbrevation(copy_name) if copy_name != copy_name_2: similarity_2 = get_jacard_similarity(text, @@ -356,14 +368,16 @@ def update_for_currency(text: str, share_name: str, compare_list: list): else: without_currency_list.append(index) if not with_currency and len(with_currency_list) == 0: - return text, share_name, compare_list + pass elif not with_currency and len(with_currency_list) > 0: - last_split = text_split[-1] + share_short_name = "" + if share_name is not None and len(share_name.strip()) > 0: + share_short_name = get_share_short_name_from_text(share_name) updated = False - if len(last_split) < 4 and last_split.upper() == last_split: + if len(share_short_name) < 4 and share_short_name.upper() == share_short_name: if len(without_currency_list) > 0: for index in without_currency_list: - if last_split in compare_list[index].split(): + if share_short_name in compare_list[index].split(): text = text + ' ' + 'USD' if share_name is not None: share_name = share_name + ' ' + 'USD' @@ -373,7 +387,7 @@ def update_for_currency(text: str, share_name: str, compare_list: list): currency_list = [] for index in 
with_currency_list: compare_split = compare_list[index].split() - if last_split in compare_split: + if share_short_name in compare_split: current_currency_list = [split for split in compare_split if split.upper() in total_currency_list] if len(current_currency_list) > 0: @@ -391,13 +405,40 @@ def update_for_currency(text: str, share_name: str, compare_list: list): text = text + ' ' + 'USD' if share_name is not None: share_name = share_name + ' ' + 'USD' - return text, share_name, compare_list + # return text, share_name, compare_list elif with_currency and len(without_currency_list) == 0: for index in without_currency_list: compare_list[index] = compare_list[index] + ' ' + 'USD' - return text, share_name, compare_list + # return text, share_name, compare_list else: - return text, share_name, compare_list + # return text, share_name, compare_list + pass + if with_currency: + share_name_split = share_name.split() + share_name_currency_list = [] + for split in share_name_split: + if split.upper() in total_currency_list and split.upper() not in share_name_currency_list: + share_name_currency_list.append(split) + if len(share_name_currency_list) > 1 and 'USD' in share_name_currency_list: + new_share_name = ' '.join([split for split in share_name_split if split.upper() != 'USD']) + if share_name in text: + text = text.replace(share_name, new_share_name) + else: + text = ' '.join([split for split in text.split() if split.upper() != 'USD']) + share_name = new_share_name + for c_i in range(len(compare_list)): + compare = compare_list[c_i] + compare_share_part = get_share_part_list([compare])[0] + compare_share_part_split = compare_share_part.split() + compare_share_part_currency_list = [] + for split in compare_share_part_split: + if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list: + compare_share_part_currency_list.append(split) + if len(compare_share_part_currency_list) > 1 and 'USD' in compare_share_part_currency_list: + 
compare_share_part_split = [split for split in compare_share_part_split if split.upper() != 'USD'] + new_compare_share_part = ' '.join(compare_share_part_split) + compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part) + return text, share_name, compare_list def remove_common_word(text_list: list): @@ -625,7 +666,7 @@ def replace_abbrevation(text: str): new_text_splits.append('Accumulation') elif split.lower() in ['inc', 'inc.']: new_text_splits.append('Income') - elif split.lower() in ['dist', 'dist.']: + elif split.lower() in ['dist', 'dist.', 'dis', 'dis.']: new_text_splits.append('Distribution') elif split.lower() in ['inv', 'inv.']: new_text_splits.append('Investor')