Optimize mapping: choose the proper candidate mapping list.

This commit is contained in:
Blade He 2024-10-01 11:35:29 -05:00
parent 60a26377e5
commit 18174bf1cf
3 changed files with 22 additions and 10 deletions

View File

@ -283,17 +283,25 @@ class DataMapping:
doc_compare_mapping = self.doc_fund_class_mapping[ doc_compare_mapping = self.doc_fund_class_mapping[
self.doc_fund_class_mapping["FundId"] == parent_id self.doc_fund_class_mapping["FundId"] == parent_id
] ]
provider_compare_mapping = self.provider_fund_class_mapping\
[self.provider_fund_class_mapping["FundId"] == parent_id]
if len(doc_compare_mapping) == 0: if len(doc_compare_mapping) == 0:
if len(provider_compare_mapping) == 0:
doc_compare_name_list = self.doc_share_name_list doc_compare_name_list = self.doc_share_name_list
doc_compare_mapping = self.doc_fund_class_mapping doc_compare_mapping = self.doc_fund_class_mapping
provider_compare_name_list = self.provider_share_name_list
provider_compare_mapping = self.provider_fund_class_mapping
else:
provider_compare_name_list = (
provider_compare_mapping["ShareClassName"].unique().tolist()
)
doc_compare_name_list = []
doc_compare_mapping = pd.DataFrame()
else: else:
doc_compare_name_list = ( doc_compare_name_list = (
doc_compare_mapping["ShareClassName"].unique().tolist() doc_compare_mapping["ShareClassName"].unique().tolist()
) )
provider_compare_mapping = self.provider_fund_class_mapping[
self.provider_fund_class_mapping["FundId"] == parent_id
]
if len(provider_compare_mapping) == 0 or \ if len(provider_compare_mapping) == 0 or \
len(provider_compare_mapping) < len(doc_compare_mapping): len(provider_compare_mapping) < len(doc_compare_mapping):
provider_compare_name_list = doc_compare_name_list provider_compare_name_list = doc_compare_name_list

10
main.py
View File

@ -574,8 +574,8 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "445102363" doc_id = "469138353"
raw_name = "Danske Invest SICAV Global Portfolio Solution Defensive Class X" raw_name = "Manulife Global Fund ASEAN Equity Fund I USD"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -697,11 +697,13 @@ if __name__ == "__main__":
"405803396", "405803396",
"445102363", "445102363",
"445256897", "445256897",
"448265376" "448265376",
"449555622",
"449623976",
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["391736837"] # special_doc_id_list = ["469138353"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False

View File

@ -568,6 +568,8 @@ def replace_abbrevation(text: str):
new_text_splits.append('Institutional') new_text_splits.append('Institutional')
elif split.lower() in ['cap', 'cap.']: elif split.lower() in ['cap', 'cap.']:
new_text_splits.append('Capitalisation') new_text_splits.append('Capitalisation')
elif split.lower() in ['div', 'div.']:
new_text_splits.append('Dividend')
elif split.lower() in ['adm', 'adm.']: elif split.lower() in ['adm', 'adm.']:
new_text_splits.append('Admin') new_text_splits.append('Admin')
elif split.lower() in ['adv', 'adv.']: elif split.lower() in ['adv', 'adv.']: