diff --git a/main.py b/main.py index edd27b7..3149f41 100644 --- a/main.py +++ b/main.py @@ -574,8 +574,9 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "333207452" - raw_name = "Rathbone SICAV Income Fund L ACC GBP" + doc_id = "481475385" + raw_name = "Emerging Markets Fund A-ACC Shares USD" + raw_share_name = "A-ACC Shares USD" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -587,7 +588,7 @@ def test_mapping_raw_name(): process_cache = {} mapping_info = data_mapping.matching_with_database( raw_name=raw_name, - raw_share_name=None, + raw_share_name=raw_share_name, parent_id=None, matching_type="share", process_cache=process_cache @@ -704,7 +705,7 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["479042269"] + # special_doc_id_list = ["481475385"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 62c769f..1a671ff 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -90,8 +90,13 @@ def get_most_similar_name(text: str, copy_name_list is None or len(copy_name_list) == 0: return None, None - copy_name_list = [replace_abbrevation(copy_name) for copy_name - in copy_name_list] + for i in range(len(copy_name_list)): + copy_name = copy_name_list[i] + share_part = get_share_part_list([copy_name])[0] + if '-' in share_part: + copy_name = copy_name.replace('-', ' ') + copy_name = replace_abbrevation(copy_name) + copy_name_list[i] = copy_name # get common words in fund_name_list common_word_list = []