Optimize mapping: choose the proper candidate mapping list.

This commit is contained in:
Blade He 2024-10-01 11:35:29 -05:00
parent 60a26377e5
commit 18174bf1cf
3 changed files with 22 additions and 10 deletions

View File

@ -283,17 +283,25 @@ class DataMapping:
doc_compare_mapping = self.doc_fund_class_mapping[
self.doc_fund_class_mapping["FundId"] == parent_id
]
provider_compare_mapping = self.provider_fund_class_mapping\
[self.provider_fund_class_mapping["FundId"] == parent_id]
if len(doc_compare_mapping) == 0:
if len(provider_compare_mapping) == 0:
doc_compare_name_list = self.doc_share_name_list
doc_compare_mapping = self.doc_fund_class_mapping
provider_compare_name_list = self.provider_share_name_list
provider_compare_mapping = self.provider_fund_class_mapping
else:
provider_compare_name_list = (
provider_compare_mapping["ShareClassName"].unique().tolist()
)
doc_compare_name_list = []
doc_compare_mapping = pd.DataFrame()
else:
doc_compare_name_list = (
doc_compare_mapping["ShareClassName"].unique().tolist()
)
provider_compare_mapping = self.provider_fund_class_mapping[
self.provider_fund_class_mapping["FundId"] == parent_id
]
if len(provider_compare_mapping) == 0 or \
len(provider_compare_mapping) < len(doc_compare_mapping):
provider_compare_name_list = doc_compare_name_list

10
main.py
View File

@ -574,8 +574,8 @@ def test_data_extraction_metrics():
def test_mapping_raw_name():
doc_id = "445102363"
raw_name = "Danske Invest SICAV Global Portfolio Solution Defensive Class X"
doc_id = "469138353"
raw_name = "Manulife Global Fund ASEAN Equity Fund I USD"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping(
doc_id,
@ -697,11 +697,13 @@ if __name__ == "__main__":
"405803396",
"445102363",
"445256897",
"448265376"
"448265376",
"449555622",
"449623976",
]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["391736837"]
# special_doc_id_list = ["469138353"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False

View File

@ -568,6 +568,8 @@ def replace_abbrevation(text: str):
new_text_splits.append('Institutional')
elif split.lower() in ['cap', 'cap.']:
new_text_splits.append('Capitalisation')
elif split.lower() in ['div', 'div.']:
new_text_splits.append('Dividend')
elif split.lower() in ['adm', 'adm.']:
new_text_splits.append('Admin')
elif split.lower() in ['adv', 'adv.']: