From 3adbd7631af32256c5cdbad16be6372c09c1d028 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 1 Oct 2024 15:31:15 -0500 Subject: [PATCH] optimize mapping algorithm --- main.py | 8 ++--- utils/biz_utils.py | 84 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 29 deletions(-) diff --git a/main.py b/main.py index 5728e12..50b2171 100644 --- a/main.py +++ b/main.py @@ -574,8 +574,8 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "469138353" - raw_name = "Manulife Global Fund ASEAN Equity Fund I USD" + doc_id = "333207452" + raw_name = "Rathbone SICAV Income Fund L ACC GBP" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -704,11 +704,11 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["469138353"] + # special_doc_id_list = ["333207452"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False - re_run_mapping_data = False + re_run_mapping_data = True force_save_total_data = True extract_ways = ["text"] diff --git a/utils/biz_utils.py b/utils/biz_utils.py index fd769e0..3cb6294 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -104,6 +104,10 @@ def get_most_similar_name(text: str, text = text.strip() text = remove_special_characters(text) text = replace_abbrevation(text) + if share_name is not None: + share_name = remove_special_characters(share_name) + share_name = replace_abbrevation(share_name) + text_splits = text.split() if len(text_splits) == 1: text = split_words_without_space(text) @@ -123,14 +127,6 @@ def get_most_similar_name(text: str, temp_splits = copy_name_list[i].split() copy_name_list[i] = ' '.join([split for split in temp_splits if remove_special_characters(split).lower() != word]) - - for i in range(len(copy_name_list)): - temp_splits = copy_name_list[i].split() - copy_name_list[i] = ' '.join([split for split in temp_splits - if remove_special_characters(split).lower() - not in ['fund', "funds", 'portfolio', - 'class', 'classes', - 'share', 'shares']]) final_splits = [] for split in new_splits: if split.lower() not in ['fund', "funds", 'portfolio', @@ -139,11 +135,22 @@ def get_most_similar_name(text: str, final_splits.append(split) text = ' '.join(final_splits) + + copy_share_name_list = get_share_part_list(copy_name_list) + + for i in range(len(copy_name_list)): + temp_splits = copy_name_list[i].split() + copy_name_list[i] = ' '.join([split for split in temp_splits + if remove_special_characters(split).lower() + not in ['fund', "funds", 'portfolio', + 'class', 'classes', + 'share', 'shares']]) max_similarity = 0 max_similarity_full_name = None text = remove_special_characters(text) + if matching_type == "share": - text, copy_name_list = update_for_currency(text, copy_name_list) + text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list) text_currency = None text_feature = None text_share_short_name = None @@ -155,9 +162,14 @@ def get_most_similar_name(text: str, text_feature = cache.get("share_feature") text_currency = cache.get("share_currency") else: - text_share_short_name = get_share_short_name_from_text(text) - text_feature = get_share_feature_from_text(text) - text_currency = get_currency_from_text(text) + if share_name is not None and len(share_name.strip()) > 0: + text_share_short_name = get_share_short_name_from_text(share_name) + text_feature = get_share_feature_from_text(share_name) + text_currency = get_currency_from_text(share_name) + else: + text_share_short_name = get_share_short_name_from_text(text) + text_feature = get_share_feature_from_text(text) + text_currency = get_currency_from_text(text) process_cache[text] = { "share_short_name": text_share_short_name, "share_feature": text_feature, @@ -170,7 +182,7 @@ def get_most_similar_name(text: str, # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}") same_max_similarity_name_list = [] - for full_name, copy_name in zip(name_list , copy_name_list): + for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list): copy_name = remove_special_characters(copy_name) copy_name = split_words_without_space(copy_name) similarity = get_jacard_similarity(text, @@ -192,18 +204,18 @@ def get_most_similar_name(text: str, copy_name_feature = cache.get("share_feature") copy_name_currency = cache.get("share_currency") else: - copy_name_short_name = get_share_short_name_from_text(copy_name) - copy_name_feature = get_share_feature_from_text(copy_name) - copy_name_currency = get_currency_from_text(copy_name) + copy_name_short_name = get_share_short_name_from_text(copy_share_name) + copy_name_feature = get_share_feature_from_text(copy_share_name) + copy_name_currency = get_currency_from_text(copy_share_name) process_cache[copy_name] = { "share_short_name": copy_name_short_name, "share_feature": copy_name_feature, "share_currency": copy_name_currency } else: - copy_name_short_name = get_share_short_name_from_text(copy_name) - copy_name_feature = get_share_feature_from_text(copy_name) - copy_name_currency = get_currency_from_text(copy_name) + copy_name_short_name = get_share_short_name_from_text(copy_share_name) + copy_name_feature = get_share_feature_from_text(copy_share_name) + copy_name_currency = get_currency_from_text(copy_share_name) if text_currency is not None and len(text_currency) > 0 and \ copy_name_currency is not None and len(copy_name_currency) > 0: @@ -242,10 +254,26 @@ def get_most_similar_name(text: str, print_exc() return None, 0.0 + +def get_share_part_list(text_list: list): + share_part_list = [] + for text in text_list: + text_split = text.split("Fund") + if len(text_split) == 1: + text_split = text.split("funds") + if len(text_split) == 1: + text_split = text.split("Portfolio") + if len(text_split) > 1: + share_part_list.append(text_split[-1].strip()) + else: + share_part_list.append(text) + return share_part_list + + def get_share_short_name_from_text(text: str): if text is None or len(text.strip()) == 0: return None - text = text.strip() + text = remove_special_characters(text.strip()) text_split = text.split() temp_share_features = [feature.lower() for feature in share_features_full_name] @@ -292,7 +320,7 @@ def get_currency_from_text(text: str): return None -def update_for_currency(text: str, compare_list: list): +def update_for_currency(text: str, share_name: str, compare_list: list): text_split = text.split() with_currency = False for split in text_split: @@ -314,7 +342,7 @@ def update_for_currency(text: str, compare_list: list): else: without_currency_list.append(index) if not with_currency and len(with_currency_list) == 0: - return text, compare_list + return text, share_name, compare_list elif not with_currency and len(with_currency_list) > 0: last_split = text_split[-1] updated = False @@ -323,6 +351,8 @@ def update_for_currency(text: str, compare_list: list): for index in without_currency_list: if last_split in compare_list[index].split(): text = text + ' ' + 'USD' + if share_name is not None: + share_name = share_name + ' ' + 'USD' updated = True break if not updated: @@ -336,6 +366,8 @@ def update_for_currency(text: str, compare_list: list): currency_list.append(current_currency_list[-1]) if len(currency_list) == 1: text = text + ' ' + currency_list[0] + if share_name is not None: + share_name = share_name + ' ' + currency_list[0] updated = True for index in without_currency_list: @@ -343,13 +375,15 @@ def update_for_currency(text: str, compare_list: list): if not updated: text = text + ' ' + 'USD' - return text, compare_list + if share_name is not None: + share_name = share_name + ' ' + 'USD' + return text, share_name, compare_list elif with_currency and len(without_currency_list) == 0: for index in without_currency_list: compare_list[index] = compare_list[index] + ' ' + 'USD' - return text, compare_list + return text, share_name, compare_list else: - return text, compare_list + return text, share_name, compare_list def remove_common_word(text_list: list):