diff --git a/main.py b/main.py index fe56b08..291b558 100644 --- a/main.py +++ b/main.py @@ -574,9 +574,9 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "389171486" - raw_name = "Nordea 2 Emerging Market Local Debt Enhanced Fund Y - Shares" - raw_share_name = "Y - Shares" + doc_id = "483617247" + raw_name = "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc" + raw_share_name = "Class I sw EUR - Acc" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -589,7 +589,7 @@ def test_mapping_raw_name(): mapping_info = data_mapping.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, - parent_id="FS00009Q8R", + parent_id=None, matching_type="share", process_cache=process_cache ) @@ -705,12 +705,12 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["483617247"] + # special_doc_id_list = ["483617247"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True extract_ways = ["text"] for extract_way in extract_ways: diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 1e48699..f1ff20f 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -167,7 +167,7 @@ def get_most_similar_name(text: str, 'share', 'shares']]) text_currency = None text_feature = None - text_share_short_name = None + text_share_short_name_list = None if matching_type == "share" and text is not None and len(text.strip()) > 0: if process_cache is not None and isinstance(process_cache, dict): if process_cache.get(text, None) is not None: @@ -177,20 +177,23 @@ def get_most_similar_name(text: str, text_currency = cache.get("share_currency") else: if share_name is not None and len(share_name.strip()) > 0: - text_share_short_name = get_share_short_name_from_text(share_name) + text_share_short_name_list = get_share_short_name_from_text(share_name) text_feature = get_share_feature_from_text(share_name) text_currency = get_currency_from_text(share_name) else: - text_share_short_name = get_share_short_name_from_text(text) + text_share_short_name_list = get_share_short_name_from_text(text) text_feature = get_share_feature_from_text(text) text_currency = get_currency_from_text(text) + # sort text_share_short_name_list + text_share_short_name_list.sort() process_cache[text] = { - "share_short_name": text_share_short_name, + "share_short_name": text_share_short_name_list, "share_feature": text_feature, "share_currency": text_currency } else: - text_share_short_name = get_share_short_name_from_text(share_name) + text_share_short_name_list = get_share_short_name_from_text(share_name) + text_share_short_name_list.sort() text_feature = get_share_feature_from_text(share_name) text_currency = get_currency_from_text(share_name) @@ -221,20 +224,22 @@ def get_most_similar_name(text: str, if process_cache is not None and isinstance(process_cache, dict): if process_cache.get(copy_name, None) is not None: cache = process_cache.get(copy_name) - copy_name_short_name = cache.get("share_short_name") + copy_name_short_name_list = cache.get("share_short_name") copy_name_feature = cache.get("share_feature") copy_name_currency = cache.get("share_currency") else: - copy_name_short_name = get_share_short_name_from_text(copy_share_name) + copy_name_short_name_list = get_share_short_name_from_text(copy_share_name) + copy_name_short_name_list.sort() copy_name_feature = get_share_feature_from_text(copy_share_name) copy_name_currency = get_currency_from_text(copy_share_name) process_cache[copy_name] = { - "share_short_name": copy_name_short_name, + "share_short_name": copy_name_short_name_list, "share_feature": copy_name_feature, "share_currency": copy_name_currency } else: - copy_name_short_name = get_share_short_name_from_text(copy_share_name) + copy_name_short_name_list = get_share_short_name_from_text(copy_share_name) + copy_name_short_name_list.sort() copy_name_feature = get_share_feature_from_text(copy_share_name) copy_name_currency = get_currency_from_text(copy_share_name) @@ -248,9 +253,9 @@ def get_most_similar_name(text: str, if copy_name_feature.lower() not in text.lower().split(): continue if matching_type == "share": - if text_share_short_name is not None and len(text_share_short_name) > 0 and \ - copy_name_short_name is not None and len(copy_name_short_name) > 0: - if text_share_short_name != copy_name_short_name: + if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \ + copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0: + if text_share_short_name_list != copy_name_short_name_list: continue max_similarity = similarity max_similarity_full_name = full_name @@ -304,15 +309,17 @@ def get_share_short_name_from_text(text: str): temp_share_features = [feature.lower() for feature in share_features_full_name] count = 0 + share_short_name_list = [] + for split in text_split[::-1]: if count == 4: break if split.lower() not in temp_share_features and \ - split not in total_currency_list: + split.upper() not in total_currency_list: if len(split) <= 3 and split.upper() == split: - return split.upper() + share_short_name_list.append(split.upper()) count += 1 - return None + return share_short_name_list def get_share_feature_from_text(text: str): if text is None or len(text.strip()) == 0: @@ -370,14 +377,20 @@ def update_for_currency(text: str, share_name: str, compare_list: list): if not with_currency and len(with_currency_list) == 0: pass elif not with_currency and len(with_currency_list) > 0: - share_short_name = "" + share_short_name_list = [] if share_name is not None and len(share_name.strip()) > 0: - share_short_name = get_share_short_name_from_text(share_name) + share_short_name_list = get_share_short_name_from_text(share_name) updated = False - if len(share_short_name) < 4 and share_short_name.upper() == share_short_name: + if len(share_short_name_list) > 0: if len(without_currency_list) > 0: for index in without_currency_list: - if share_short_name in compare_list[index].split(): + all_in_list = True + compare_split = [split.upper() for split in compare_list[index].split()] + for share_shot_name in share_short_name_list: + if share_shot_name not in compare_split: + all_in_list = False + break + if all_in_list: text = text + ' ' + 'USD' if share_name is not None: share_name = share_name + ' ' + 'USD' @@ -386,8 +399,13 @@ def update_for_currency(text: str, share_name: str, compare_list: list): if not updated: currency_list = [] for index in with_currency_list: - compare_split = compare_list[index].split() - if share_short_name in compare_split: + all_in_list = True + compare_split = [split.upper() for split in compare_list[index].split()] + for share_shot_name in share_short_name_list: + if share_shot_name not in compare_split: + all_in_list = False + break + if all_in_list: current_currency_list = [split for split in compare_split if split.upper() in total_currency_list] if len(current_currency_list) > 0: