diff --git a/main.py b/main.py index 3149f41..fe56b08 100644 --- a/main.py +++ b/main.py @@ -574,9 +574,9 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "481475385" - raw_name = "Emerging Markets Fund A-ACC Shares USD" - raw_share_name = "A-ACC Shares USD" + doc_id = "389171486" + raw_name = "Nordea 2 Emerging Market Local Debt Enhanced Fund Y - Shares" + raw_share_name = "Y - Shares" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -589,7 +589,7 @@ def test_mapping_raw_name(): mapping_info = data_mapping.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, - parent_id=None, + parent_id="FS00009Q8R", matching_type="share", process_cache=process_cache ) @@ -705,12 +705,12 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["481475385"] + special_doc_id_list = ["483617247"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False extract_ways = ["text"] for extract_way in extract_ways: diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 1a671ff..1e48699 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -127,7 +127,7 @@ def get_most_similar_name(text: str, new_splits.extend(split_words_without_space(split).split()) else: new_splits.append(split) - + text = ' '.join(new_splits) lower_new_splits = [split.lower() for split in new_splits] for word in common_word_list: if word not in lower_new_splits: @@ -136,30 +136,35 @@ def get_most_similar_name(text: str, temp_splits = copy_name_list[i].split() copy_name_list[i] = ' '.join([split for split in temp_splits if remove_special_characters(split).lower() != word]) - final_splits = [] - for split in new_splits: - if split.lower() not in ['fund', "funds", 'portfolio', - 'class', 'classes', - 'share', 'shares']: - final_splits.append(split) - text = ' '.join(final_splits) - - copy_share_name_list = get_share_part_list(copy_name_list) - - for i in range(len(copy_name_list)): - temp_splits = copy_name_list[i].split() - copy_name_list[i] = ' '.join([split for split in temp_splits - if remove_special_characters(split).lower() - not in ['fund', "funds", 'portfolio', - 'class', 'classes', - 'share', 'shares']]) max_similarity = 0 max_similarity_full_name = None text = remove_special_characters(text) if matching_type == "share": text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list) + + text = ' '.join([split for split in text.split() + if split.lower() not in ['fund', "funds", 'portfolio', + 'bond', 'bonds', + 'class', 'classes', + 'share', 'shares']]) + if share_name is not None: + share_name = ' '.join([split for split in share_name.split() + if split.lower() not in ['fund', "funds", 'portfolio', + 'bond', 'bonds', + 'class', 'classes', + 'share', 'shares']]) + + copy_share_name_list = get_share_part_list(copy_name_list) + for i in range(len(copy_name_list)): + temp_splits = copy_name_list[i].split() + copy_name_list[i] = ' '.join([split for split in temp_splits + if remove_special_characters(split).lower() + not in ['fund', "funds", 'portfolio', + 'bond', 'bonds', + 'class', 'classes', + 'share', 'shares']]) text_currency = None text_feature = None text_share_short_name = None @@ -192,11 +197,18 @@ def get_most_similar_name(text: str, # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}") same_max_similarity_name_list = [] for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list): + if not isinstance(copy_name, str) or len(copy_name.strip()) == 0: + continue copy_name = remove_special_characters(copy_name) copy_name = split_words_without_space(copy_name) - similarity = get_jacard_similarity(text, - copy_name, - need_remove_numeric_characters=False) + try: + similarity = get_jacard_similarity(text, + copy_name, + need_remove_numeric_characters=False) + except Exception as e: + print(e) + print_exc() + similarity = 0 copy_name_2 = replace_abbrevation(copy_name) if copy_name != copy_name_2: similarity_2 = get_jacard_similarity(text, @@ -356,14 +368,16 @@ def update_for_currency(text: str, share_name: str, compare_list: list): else: without_currency_list.append(index) if not with_currency and len(with_currency_list) == 0: - return text, share_name, compare_list + pass elif not with_currency and len(with_currency_list) > 0: - last_split = text_split[-1] + share_short_name = "" + if share_name is not None and len(share_name.strip()) > 0: + share_short_name = get_share_short_name_from_text(share_name) updated = False - if len(last_split) < 4 and last_split.upper() == last_split: + if len(share_short_name) < 4 and share_short_name.upper() == share_short_name: if len(without_currency_list) > 0: for index in without_currency_list: - if last_split in compare_list[index].split(): + if share_short_name in compare_list[index].split(): text = text + ' ' + 'USD' if share_name is not None: share_name = share_name + ' ' + 'USD' @@ -373,7 +387,7 @@ def update_for_currency(text: str, share_name: str, compare_list: list): currency_list = [] for index in with_currency_list: compare_split = compare_list[index].split() - if last_split in compare_split: + if share_short_name in compare_split: current_currency_list = [split for split in compare_split if split.upper() in total_currency_list] if len(current_currency_list) > 0: @@ -391,13 +405,40 @@ def update_for_currency(text: str, share_name: str, compare_list: list): text = text + ' ' + 'USD' if share_name is not None: share_name = share_name + ' ' + 'USD' - return text, share_name, compare_list + # return text, share_name, compare_list elif with_currency and len(without_currency_list) == 0: for index in without_currency_list: compare_list[index] = compare_list[index] + ' ' + 'USD' - return text, share_name, compare_list + # return text, share_name, compare_list else: - return text, share_name, compare_list + # return text, share_name, compare_list + pass + if with_currency: + share_name_split = share_name.split() + share_name_currency_list = [] + for split in share_name_split: + if split.upper() in total_currency_list and split.upper() not in share_name_currency_list: + share_name_currency_list.append(split) + if len(share_name_currency_list) > 1 and 'USD' in share_name_currency_list: + new_share_name = ' '.join([split for split in share_name_split if split.upper() != 'USD']) + if share_name in text: + text = text.replace(share_name, new_share_name) + else: + text = ' '.join([split for split in text.split() if split.upper() != 'USD']) + share_name = new_share_name + for c_i in range(len(compare_list)): + compare = compare_list[c_i] + compare_share_part = get_share_part_list([compare])[0] + compare_share_part_split = compare_share_part.split() + compare_share_part_currency_list = [] + for split in compare_share_part_split: + if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list: + compare_share_part_currency_list.append(split) + if len(compare_share_part_currency_list) > 1 and 'USD' in compare_share_part_currency_list: + compare_share_part_split = [split for split in compare_share_part_split if split.upper() != 'USD'] + new_compare_share_part = ' '.join(compare_share_part_split) + compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part) + return text, share_name, compare_list def remove_common_word(text_list: list): @@ -625,7 +666,7 @@ def replace_abbrevation(text: str): new_text_splits.append('Accumulation') elif split.lower() in ['inc', 'inc.']: new_text_splits.append('Income') - elif split.lower() in ['dist', 'dist.']: + elif split.lower() in ['dist', 'dist.', 'dis', 'dis.']: new_text_splits.append('Distribution') elif split.lower() in ['inv', 'inv.']: new_text_splits.append('Investor')