optimize investment mapping algorithm

This commit is contained in:
Blade He 2024-09-30 16:32:56 -05:00
parent 3aa596ea33
commit 60a26377e5
3 changed files with 40 additions and 11 deletions

View File

@ -283,16 +283,25 @@ class DataMapping:
doc_compare_mapping = self.doc_fund_class_mapping[ doc_compare_mapping = self.doc_fund_class_mapping[
self.doc_fund_class_mapping["FundId"] == parent_id self.doc_fund_class_mapping["FundId"] == parent_id
] ]
doc_compare_name_list = ( if len(doc_compare_mapping) == 0:
doc_compare_mapping["ShareClassName"].unique().tolist() doc_compare_name_list = self.doc_share_name_list
) doc_compare_mapping = self.doc_fund_class_mapping
else:
doc_compare_name_list = (
doc_compare_mapping["ShareClassName"].unique().tolist()
)
provider_compare_mapping = self.provider_fund_class_mapping[ provider_compare_mapping = self.provider_fund_class_mapping[
self.provider_fund_class_mapping["FundId"] == parent_id self.provider_fund_class_mapping["FundId"] == parent_id
] ]
provider_compare_name_list = ( if len(provider_compare_mapping) == 0 or \
provider_compare_mapping["ShareClassName"].unique().tolist() len(provider_compare_mapping) < len(doc_compare_mapping):
) provider_compare_name_list = doc_compare_name_list
provider_compare_mapping = doc_compare_mapping
else:
provider_compare_name_list = (
provider_compare_mapping["ShareClassName"].unique().tolist()
)
else: else:
doc_compare_name_list = self.doc_share_name_list doc_compare_name_list = self.doc_share_name_list
doc_compare_mapping = self.doc_fund_class_mapping doc_compare_mapping = self.doc_fund_class_mapping

15
main.py
View File

@ -574,8 +574,8 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "382366116" doc_id = "445102363"
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I" raw_name = "Danske Invest SICAV Global Portfolio Solution Defensive Class X"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -692,16 +692,21 @@ if __name__ == "__main__":
"394778487", "394778487",
"401684600", "401684600",
"402113224", "402113224",
"402181770" "402181770",
"402397014",
"405803396",
"445102363",
"445256897",
"448265376"
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["402397014"] # special_doc_id_list = ["391736837"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = True
extract_ways = ["text"] extract_ways = ["text"]
for extract_way in extract_ways: for extract_way in extract_ways:

View File

@ -169,6 +169,7 @@ def get_most_similar_name(text: str,
text_currency = get_currency_from_text(share_name) text_currency = get_currency_from_text(share_name)
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}") # logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
same_max_similarity_name_list = []
for full_name, copy_name in zip(name_list , copy_name_list): for full_name, copy_name in zip(name_list , copy_name_list):
copy_name = remove_special_characters(copy_name) copy_name = remove_special_characters(copy_name)
copy_name = split_words_without_space(copy_name) copy_name = split_words_without_space(copy_name)
@ -219,8 +220,20 @@ def get_most_similar_name(text: str,
continue continue
max_similarity = similarity max_similarity = similarity
max_similarity_full_name = full_name max_similarity_full_name = full_name
same_max_similarity_name_list = []
elif matching_type == "fund" and max_similarity > 0 and max_similarity == similarity:
if full_name is not None and max_similarity_full_name is not None and \
len(full_name.split()) > len(max_similarity_full_name.split()):
max_similarity_full_name = full_name
same_max_similarity_name_list = []
else:
if full_name is not None:
same_max_similarity_name_list.append(full_name)
if max_similarity == 1: if max_similarity == 1:
break break
# if several names tie for the same max similarity, the match is ambiguous: return (None, 0.0)
if len(same_max_similarity_name_list) > 0:
return None, 0.0
if max_similarity < 0.35: if max_similarity < 0.35:
return None, max_similarity return None, max_similarity
return max_similarity_full_name, max_similarity return max_similarity_full_name, max_similarity
@ -561,6 +574,8 @@ def replace_abbrevation(text: str):
new_text_splits.append('Advantage') new_text_splits.append('Advantage')
elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']: elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
new_text_splits.append('Hedged') new_text_splits.append('Hedged')
elif split.lower() in ['unhgd']:
split = ""
elif split.lower() in ['cl', 'cl.']: elif split.lower() in ['cl', 'cl.']:
new_text_splits.append('Class') new_text_splits.append('Class')
elif split.lower() in ['ser', 'ser.']: elif split.lower() in ['ser', 'ser.']: