diff --git a/main.py b/main.py index 50b2171..edd27b7 100644 --- a/main.py +++ b/main.py @@ -704,7 +704,7 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["333207452"] + # special_doc_id_list = ["479042269"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 3cb6294..62c769f 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -49,7 +49,7 @@ total_currency_list = [ "XFO", ] -share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Dividend', 'Investor', 'Institutional', 'Admin', 'Advantage'] +share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage'] share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv'] @@ -100,6 +100,10 @@ def get_most_similar_name(text: str, if pre_common_word_list is not None and len(pre_common_word_list) > 0: common_word_list.extend([word for word in pre_common_word_list if word not in common_word_list]) + + if len(common_word_list) > 0: + common_word_list = [word for word in common_word_list + if len(word) > 1 and word.upper() not in total_currency_list] text = text.strip() text = remove_special_characters(text) @@ -224,7 +228,8 @@ def get_most_similar_name(text: str, if text_feature is not None and len(text_feature) > 0 and \ copy_name_feature is not None and len(copy_name_feature) > 0: if text_feature != copy_name_feature: - continue + if copy_name_feature.lower() not in text.lower().split(): + continue if matching_type == "share": if text_share_short_name is not None and len(text_share_short_name) > 0 and \ copy_name_short_name is not None and len(copy_name_short_name) > 0: @@ -263,6 +268,10 @@ def get_share_part_list(text_list: list): text_split = text.split("funds") if len(text_split) == 1: text_split = text.split("Portfolio") + if len(text_split) == 1: + text_split = text.split("Bond") + if len(text_split) == 1: + text_split = text.split("Bonds") if len(text_split) > 1: share_part_list.append(text_split[-1].strip()) else: