From 8bd600842577eda0cae7ffa9ea81a76ad2aeee87 Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 7 Oct 2024 10:34:13 -0500 Subject: [PATCH] refactor code --- main.py | 4 +-- utils/biz_utils.py | 66 ++++++++++++++++++---------------------------- 2 files changed, 28 insertions(+), 42 deletions(-) diff --git a/main.py b/main.py index 557e31e..889973a 100644 --- a/main.py +++ b/main.py @@ -446,7 +446,7 @@ def get_metrics( ground_truth_sheet_name=ground_truth_sheet_name, output_folder=output_folder, ) - missing_error_list, metrics_list, metrics_file = metrics.get_metrics(strict_model=True) + missing_error_list, metrics_list, metrics_file = metrics.get_metrics(strict_model=False) return missing_error_list, metrics_list, metrics_file @@ -725,7 +725,7 @@ if __name__ == "__main__": output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False - re_run_mapping_data = False + re_run_mapping_data = True force_save_total_data = True extract_ways = ["text"] diff --git a/utils/biz_utils.py b/utils/biz_utils.py index f7cfc5d..5760268 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -51,6 +51,9 @@ total_currency_list = [ share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage'] share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv'] +lower_pre_fix_fund_share = ['fund', "funds", 'portfolio', + 'bond', 'bonds', 'class', + 'classes', 'share', 'shares'] def add_slash_to_text_as_regex(text: str): @@ -145,26 +148,17 @@ def get_most_similar_name(text: str, text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list) text = ' '.join([split for split in text.split() - if split.lower() not in ['fund', "funds", 'portfolio', - 'bond', 'bonds', - 'class', 'classes', - 'share', 'shares']]) + if split.lower() not in lower_pre_fix_fund_share]) if share_name is not None: share_name = ' '.join([split for split in share_name.split() - if split.lower() not in ['fund', "funds", 'portfolio', - 'bond', 'bonds', - 'class', 'classes', - 'share', 'shares']]) + if split.lower() not in lower_pre_fix_fund_share]) copy_share_name_list = get_share_part_list(copy_name_list) for i in range(len(copy_name_list)): temp_splits = copy_name_list[i].split() copy_name_list[i] = ' '.join([split for split in temp_splits if remove_special_characters(split).lower() - not in ['fund', "funds", 'portfolio', - 'bond', 'bonds', - 'class', 'classes', - 'share', 'shares']]) + not in lower_pre_fix_fund_share]) text_currency = None text_feature = None text_share_short_name_list = None @@ -192,10 +186,15 @@ def get_most_similar_name(text: str, "share_currency": text_currency } else: - text_share_short_name_list = get_share_short_name_from_text(share_name) - text_share_short_name_list.sort() - text_feature = get_share_feature_from_text(share_name) - text_currency = get_currency_from_text(share_name) + if share_name is not None and len(share_name.strip()) > 0: + text_share_short_name_list = get_share_short_name_from_text(share_name) + text_share_short_name_list.sort() + text_feature = get_share_feature_from_text(share_name) + text_currency = get_currency_from_text(share_name) + else: + text_share_short_name_list = get_share_short_name_from_text(text) + text_feature = get_share_feature_from_text(text) + text_currency = get_currency_from_text(text) # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}") same_max_similarity_name_list = [] @@ -308,10 +307,7 @@ def get_share_part_list(text_list: list): share_part_text = text.strip() share_part_text = ' '.join([split for split in share_part_text.split() if remove_special_characters(split).lower() - not in ['fund', "funds", 'portfolio', - 'bond', 'bonds', - 'class', 'classes', - 'share', 'shares']]) + not in lower_pre_fix_fund_share]) share_part_list.append(share_part_text) return share_part_list @@ -367,7 +363,6 @@ def get_currency_from_text(text: str): if text is None or len(text.strip()) == 0: return None text = text.strip() - text = text.lower() text_split = text.split() count = 0 for split in text_split[::-1]: @@ -380,22 +375,20 @@ def get_currency_from_text(text: str): def update_for_currency(text: str, share_name: str, compare_list: list): - text_split = text.split() + currency_in_text = get_currency_from_text(text) with_currency = False - for split in text_split: - if split.upper() in total_currency_list: - with_currency = True - break + if currency_in_text is not None: + with_currency = True with_currency_list = [] without_currency_list = [] for index, compare in enumerate(compare_list): - compare_split = compare.split() + # compare_split = compare.split() with_currency_compare = False - for split in compare_split: - if split.upper() in total_currency_list: - with_currency_compare = True - break + currecy_in_compare = get_currency_from_text(compare) + if currecy_in_compare is not None: + with_currency_compare = True + if with_currency_compare: with_currency_list.append(index) else: @@ -493,15 +486,8 @@ def remove_common_word(text_list: list): text = text.lower() text = remove_special_characters(text) text_splits = text.split() - while 'fund' in text_splits: - text_splits.remove('fund') - while 'portfolio' in text_splits: - text_splits.remove('portfolio') - while 'share' in text_splits: - text_splits.remove('share') - while 'class' in text_splits: - text_splits.remove('class') - text = ' '.join(text_splits) + text = ' '.join([split for split in text_splits + if split.lower() not in lower_pre_fix_fund_share]) new_text_list.append(text) # remove common word in new_text_list, such as 'Blackrock Global Fund' and 'Blackrock Growth Fund', then 'Blackrock', 'Fund' are common words # the result is ['Global', 'Growth']