diff --git a/core/data_mapping.py b/core/data_mapping.py index a3dfc40..79cbeaf 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -174,6 +174,7 @@ class DataMapping: investment_info = self.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, + raw_fund_name=raw_fund_name, parent_id=fund_id, matching_type="share", process_cache=process_cache @@ -254,6 +255,7 @@ class DataMapping: self, raw_name: str, raw_share_name: str = None, + raw_fund_name: str = None, parent_id: str = None, matching_type: str = "fund", process_cache: dict = {} @@ -328,9 +330,14 @@ class DataMapping: raw_name, doc_compare_name_list, share_name=raw_share_name, + fund_name=raw_fund_name, matching_type=matching_type, process_cache=process_cache) - if max_similarity is not None and max_similarity >= 0.9: + if matching_type == "fund": + threshold = 0.7 + else: + threshold = 0.9 + if max_similarity is not None and max_similarity >= threshold: data_info["id"] = doc_compare_mapping[ doc_compare_mapping[compare_name_dp] == max_similarity_name ][compare_id_dp].values[0] @@ -344,6 +351,7 @@ class DataMapping: raw_name, provider_compare_name_list, share_name=raw_share_name, + fund_name=raw_fund_name, matching_type=matching_type, pre_common_word_list=pre_common_word_list, process_cache=process_cache diff --git a/main.py b/main.py index 9dd739b..f50f738 100644 --- a/main.py +++ b/main.py @@ -338,7 +338,7 @@ def batch_start_job( if calculate_metrics: - prediction_sheet_name = "mapping_data" + prediction_sheet_name = "total_mapping_data" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_sheet_name = "mapping_data" metrics_output_folder = r"/data/emea_ar/output/metrics/" @@ -600,9 +600,9 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "394778487" - raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD" - raw_share_name = "Z Gross QD USD" + doc_id = "382366116" + raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I" + raw_share_name = "EUR I" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -615,7 +615,7 @@ def test_mapping_raw_name(): mapping_info = data_mapping.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, - parent_id="FS0000H1C9", + parent_id=None, matching_type="share", process_cache=process_cache ) @@ -697,100 +697,102 @@ if __name__ == "__main__": # "479793787", # "471641628", # ] - # check_db_mapping_doc_id_list = [ - # "292989214", - # "316237292", - # "321733631", - # "323390570", - # "327956364", - # "332223498", - # "333207452", - # "334718372", - # "344636875", - # "362246081", - # "366179419", - # "380945052", - # "382366116", - # "387202452", - # "389171486", - # "391456740", - # "391736837", - # "394778487", - # "401684600", - # "402113224", - # "402181770", - # "402397014", - # "405803396", - # "445102363", - # "445256897", - # "448265376", - # "449555622", - # "449623976", - # "458291624", - # "458359181", - # "463081566", - # "469138353", - # "471641628", - # "476492237", - # "478585901", - # "478586066", - # "479042264", - # "479042269", - # "479793787", - # "481475385", - # "483617247", - # "486378555", - # "486383912", - # "492121213", - # "497497599", - # "502693599" - # ] - check_db_mapping_doc_id_list = [ - "334584772", - "406913630", - "407275419", - "337937633", - "337293427", - "334584772", - "404712928", - "451063582", - "451878128", - "425595958", - "536344026", - "532422548", - "423418540", - "423418395", - "532998065", - "540307575", - "423395975", - "508704368", - "481482392", - "466580448", - "423365707", - "423364758", - "422761666", - "422760156", - "422760148", - "422686965", - "492029971", - "510300817", - "512745032", - "514213638", - "527525440", - "534535767" + "292989214", + "316237292", + "321733631", + "323390570", + "327956364", + "332223498", + "333207452", + "334718372", + "344636875", + "362246081", + "366179419", + "380945052", + "382366116", + "387202452", + "389171486", + "391456740", + "391736837", + "394778487", + "401684600", + "402113224", + "402181770", + "402397014", + "405803396", + "445102363", + "445256897", + "448265376", + "449555622", + "449623976", + "458291624", + "458359181", + "463081566", + "469138353", + "471641628", + "476492237", + "478585901", + "478586066", + "479042264", + "479042269", + "479793787", + "481475385", + "483617247", + "486378555", + "486383912", + "492121213", + "497497599", + "502693599" ] + + # check_db_mapping_doc_id_list = [ + # "334584772", + # "406913630", + # "407275419", + # "337937633", + # "337293427", + # "334584772", + # "404712928", + # "451063582", + # "451878128", + # "425595958", + # "536344026", + # "532422548", + # "423418540", + # "423418395", + # "532998065", + # "540307575", + # "423395975", + # "508704368", + # "481482392", + # "466580448", + # "423365707", + # "423364758", + # "422761666", + # "422760156", + # "422760148", + # "422686965", + # "492029971", + # "510300817", + # "512745032", + # "514213638", + # "527525440", + # "534535767" + # ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["337937633"] + # special_doc_id_list = ["394778487"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False - re_run_mapping_data = False + re_run_mapping_data = True force_save_total_data = True - calculate_metrics = False + calculate_metrics = True extract_ways = ["text"] + pdf_folder = r"/data/emea_ar/small_pdf/" + # pdf_folder = r"/data/emea_ar/pdf/" for extract_way in extract_ways: batch_start_job( pdf_folder, diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 4fa8539..f9ef772 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -81,6 +81,7 @@ def clean_text(text: str) -> str: def get_most_similar_name(text: str, name_list: list, share_name: str = None, + fund_name: str = None, matching_type="share", pre_common_word_list: list = None, process_cache: dict = None) -> str: @@ -116,6 +117,12 @@ def get_most_similar_name(text: str, text = text.strip() text = remove_special_characters(text) text = replace_abbrevation(text) + raw_fund_name_split = [] + if fund_name is not None and len(fund_name.strip()) > 0: + fund_name = fund_name.strip() + fund_name = remove_special_characters(fund_name) + raw_fund_name_split = fund_name.upper().split() + if share_name is not None: share_name = remove_special_characters(share_name) share_name = replace_abbrevation(share_name) @@ -171,11 +178,13 @@ def get_most_similar_name(text: str, text_currency = cache.get("share_currency") else: if share_name is not None and len(share_name.strip()) > 0: - text_share_short_name_list = get_share_short_name_from_text(share_name) + text_share_short_name_list = get_share_short_name_from_text(share_name, + confirm_text_share=True) text_feature = get_share_feature_from_text(share_name) text_currency = get_currency_from_text(share_name) else: - text_share_short_name_list = get_share_short_name_from_text(text) + text_share_short_name_list = get_share_short_name_from_text(text, + confirm_text_share=True) text_feature = get_share_feature_from_text(text) text_currency = get_currency_from_text(text) # sort text_share_short_name_list @@ -187,12 +196,14 @@ def get_most_similar_name(text: str, } else: if share_name is not None and len(share_name.strip()) > 0: - text_share_short_name_list = get_share_short_name_from_text(share_name) + text_share_short_name_list = get_share_short_name_from_text(share_name, + confirm_text_share=True) text_share_short_name_list.sort() text_feature = get_share_feature_from_text(share_name) text_currency = get_currency_from_text(share_name) else: - text_share_short_name_list = get_share_short_name_from_text(text) + text_share_short_name_list = get_share_short_name_from_text(text, + confirm_text_share=True) text_feature = get_share_feature_from_text(text) text_currency = get_currency_from_text(text) @@ -203,6 +214,52 @@ def get_most_similar_name(text: str, continue copy_name = remove_special_characters(copy_name) copy_name = split_words_without_space(copy_name) + copy_name_short_name_list = None + copy_name_feature = None + copy_name_currency = None + if matching_type == "share": + if process_cache is not None and isinstance(process_cache, dict): + if process_cache.get(copy_name, None) is not None: + cache = process_cache.get(copy_name) + copy_name_short_name_list = cache.get("share_short_name") + copy_name_feature = cache.get("share_feature") + copy_name_currency = cache.get("share_currency") + else: + copy_name_short_name_list = get_share_short_name_from_text(copy_share_name) + if copy_name_short_name_list is not None: + copy_name_short_name_list.sort() + copy_name_feature = get_share_feature_from_text(copy_share_name) + copy_name_currency = get_currency_from_text(copy_share_name) + process_cache[copy_name] = { + "share_short_name": copy_name_short_name_list, + "share_feature": copy_name_feature, + "share_currency": copy_name_currency + } + else: + copy_name_short_name_list = get_share_short_name_from_text(copy_share_name) + copy_name_short_name_list.sort() + copy_name_feature = get_share_feature_from_text(copy_share_name) + copy_name_currency = get_currency_from_text(copy_share_name) + try: + if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \ + copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0: + updated_text_share_short_name_list, updated_copy_name_short_name_list = \ + compare_both_short_name(text_share_short_name_list, copy_name_short_name_list) + + if updated_text_share_short_name_list != text_share_short_name_list: + text = ' '.join([split for split in text.split() + if split not in text_share_short_name_list]) + text += ' ' + ' '.join(updated_text_share_short_name_list) + text_share_short_name_list = updated_text_share_short_name_list + + if updated_copy_name_short_name_list != copy_name_short_name_list: + copy_name = ' '.join([split for split in copy_name.split() + if split not in copy_name_short_name_list]) + copy_name += ' ' + ' '.join(updated_copy_name_short_name_list) + copy_name_short_name_list = updated_copy_name_short_name_list + except Exception as e: + print(e) + try: similarity = get_jacard_similarity(text, copy_name, @@ -221,30 +278,7 @@ def get_most_similar_name(text: str, if similarity_2 > similarity: similarity = similarity_2 if similarity > max_similarity: - if matching_type == "share": - if process_cache is not None and isinstance(process_cache, dict): - if process_cache.get(copy_name, None) is not None: - cache = process_cache.get(copy_name) - copy_name_short_name_list = cache.get("share_short_name") - copy_name_feature = cache.get("share_feature") - copy_name_currency = cache.get("share_currency") - else: - copy_name_short_name_list = get_share_short_name_from_text(copy_share_name) - if copy_name_short_name_list is not None: - copy_name_short_name_list.sort() - copy_name_feature = get_share_feature_from_text(copy_share_name) - copy_name_currency = get_currency_from_text(copy_share_name) - process_cache[copy_name] = { - "share_short_name": copy_name_short_name_list, - "share_feature": copy_name_feature, - "share_currency": copy_name_currency - } - else: - copy_name_short_name_list = get_share_short_name_from_text(copy_share_name) - copy_name_short_name_list.sort() - copy_name_feature = get_share_feature_from_text(copy_share_name) - copy_name_currency = get_currency_from_text(copy_share_name) - + if matching_type == "share": if text_currency is not None and len(text_currency) > 0 and \ copy_name_currency is not None and len(copy_name_currency) > 0: if text_currency != copy_name_currency: @@ -257,12 +291,18 @@ def get_most_similar_name(text: str, if matching_type == "share": if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \ copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0: - raw_short_not_in_compare = False + short_name_invalid = False for short in text_share_short_name_list: if short not in copy_name_short_name_list: - raw_short_not_in_compare = True + short_name_invalid = True break - if raw_short_not_in_compare: + for compare_short in copy_name_short_name_list: + if compare_short not in text_share_short_name_list: + # some short word is in fund name, but not belong to share name + if compare_short.upper() not in raw_fund_name_split: + short_name_invalid = True + break + if short_name_invalid: continue max_similarity = similarity max_similarity_full_name = full_name @@ -289,6 +329,43 @@ def get_most_similar_name(text: str, return None, 0.0 +def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list): + copy_text_short_name_list = deepcopy(text_short_name_list) + copy_compare_short_name_list = deepcopy(compare_short_name_list) + copy_text_short_name_list = verify_short_name_container(copy_text_short_name_list, + copy_compare_short_name_list) + copy_compare_short_name_list = verify_short_name_container(copy_compare_short_name_list, + copy_text_short_name_list) + return copy_text_short_name_list, copy_compare_short_name_list + + +def verify_short_name_container(left_short_name_list: list, right_short_name_list: list): + length_1_over_1 = False + length_1_count = 0 + length_1_list = [] + for short_name in left_short_name_list: + if len(short_name) == 1: + length_1_count += 1 + length_1_list.append(short_name) + if length_1_count > 1: + length_1_over_1 = True + + if length_1_over_1: + for compare_short_name in right_short_name_list: + if len(compare_short_name) == length_1_count: + all_in = True + for short_name in length_1_list: + if short_name not in compare_short_name: + all_in = False + break + if all_in: + for short_name in length_1_list: + if short_name in left_short_name_list: + left_short_name_list.remove(short_name) + left_short_name_list.append(compare_short_name) + return left_short_name_list + + def get_share_part_list(text_list: list): share_part_list = [] for text in text_list: @@ -312,7 +389,7 @@ def get_share_part_list(text_list: list): return share_part_list -def get_share_short_name_from_text(text: str): +def get_share_short_name_from_text(text: str, confirm_text_share: bool = False): if text is None or len(text.strip()) == 0: return None text = remove_special_characters(text.strip()) @@ -321,15 +398,18 @@ def get_share_short_name_from_text(text: str): count = 0 share_short_name_list = [] - + if confirm_text_share: + count_threshold = 6 + else: + count_threshold = 4 for split in text_split[::-1]: - if count == 4: + if count == count_threshold: break if split.lower() not in temp_share_features and \ split.upper() not in total_currency_list: if len(split) <= 3: share_short_name_list.append(split.upper()) - count += 1 + count += 1 if len(share_short_name_list) > 1: remove_number = []