diff --git a/core/data_mapping.py b/core/data_mapping.py index 56ad3b7..7a3015b 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -283,17 +283,25 @@ class DataMapping: doc_compare_mapping = self.doc_fund_class_mapping[ self.doc_fund_class_mapping["FundId"] == parent_id ] + provider_compare_mapping = self.provider_fund_class_mapping\ + [self.provider_fund_class_mapping["FundId"] == parent_id] if len(doc_compare_mapping) == 0: - doc_compare_name_list = self.doc_share_name_list - doc_compare_mapping = self.doc_fund_class_mapping + if len(provider_compare_mapping) == 0: + doc_compare_name_list = self.doc_share_name_list + doc_compare_mapping = self.doc_fund_class_mapping + provider_compare_name_list = self.provider_share_name_list + provider_compare_mapping = self.provider_fund_class_mapping + else: + provider_compare_name_list = ( + provider_compare_mapping["ShareClassName"].unique().tolist() + ) + doc_compare_name_list = [] + doc_compare_mapping = pd.DataFrame() else: doc_compare_name_list = ( doc_compare_mapping["ShareClassName"].unique().tolist() ) - - provider_compare_mapping = self.provider_fund_class_mapping[ - self.provider_fund_class_mapping["FundId"] == parent_id - ] + if len(provider_compare_mapping) == 0 or \ len(provider_compare_mapping) < len(doc_compare_mapping): provider_compare_name_list = doc_compare_name_list diff --git a/main.py b/main.py index de85dfc..8934327 100644 --- a/main.py +++ b/main.py @@ -574,8 +574,8 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "445102363" - raw_name = "Danske Invest SICAV Global Portfolio Solution – Defensive Class X" + doc_id = "469138353" + raw_name = "Manulife Global Fund ASEAN Equity Fund I USD" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -697,11 +697,13 @@ if __name__ == "__main__": "405803396", "445102363", "445256897", - "448265376" + "448265376", + "449555622", + "449623976", ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["391736837"] + # special_doc_id_list = ["469138353"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 1df64d7..50e748a 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -568,6 +568,8 @@ def replace_abbrevation(text: str): new_text_splits.append('Institutional') elif split.lower() in ['cap', 'cap.']: new_text_splits.append('Capitalisation') + elif split.lower() in ['div', 'div.']: + new_text_splits.append('Dividend') elif split.lower() in ['adm', 'adm.']: new_text_splits.append('Admin') elif split.lower() in ['adv', 'adv.']: