optimize investment mapping algorithm
This commit is contained in:
parent
3aa596ea33
commit
60a26377e5
|
|
@ -283,16 +283,25 @@ class DataMapping:
|
||||||
doc_compare_mapping = self.doc_fund_class_mapping[
|
doc_compare_mapping = self.doc_fund_class_mapping[
|
||||||
self.doc_fund_class_mapping["FundId"] == parent_id
|
self.doc_fund_class_mapping["FundId"] == parent_id
|
||||||
]
|
]
|
||||||
doc_compare_name_list = (
|
if len(doc_compare_mapping) == 0:
|
||||||
doc_compare_mapping["ShareClassName"].unique().tolist()
|
doc_compare_name_list = self.doc_share_name_list
|
||||||
)
|
doc_compare_mapping = self.doc_fund_class_mapping
|
||||||
|
else:
|
||||||
|
doc_compare_name_list = (
|
||||||
|
doc_compare_mapping["ShareClassName"].unique().tolist()
|
||||||
|
)
|
||||||
|
|
||||||
provider_compare_mapping = self.provider_fund_class_mapping[
|
provider_compare_mapping = self.provider_fund_class_mapping[
|
||||||
self.provider_fund_class_mapping["FundId"] == parent_id
|
self.provider_fund_class_mapping["FundId"] == parent_id
|
||||||
]
|
]
|
||||||
provider_compare_name_list = (
|
if len(provider_compare_mapping) == 0 or \
|
||||||
provider_compare_mapping["ShareClassName"].unique().tolist()
|
len(provider_compare_mapping) < len(doc_compare_mapping):
|
||||||
)
|
provider_compare_name_list = doc_compare_name_list
|
||||||
|
provider_compare_mapping = doc_compare_mapping
|
||||||
|
else:
|
||||||
|
provider_compare_name_list = (
|
||||||
|
provider_compare_mapping["ShareClassName"].unique().tolist()
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
doc_compare_name_list = self.doc_share_name_list
|
doc_compare_name_list = self.doc_share_name_list
|
||||||
doc_compare_mapping = self.doc_fund_class_mapping
|
doc_compare_mapping = self.doc_fund_class_mapping
|
||||||
|
|
|
||||||
15
main.py
15
main.py
|
|
@ -574,8 +574,8 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "382366116"
|
doc_id = "445102363"
|
||||||
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
|
raw_name = "Danske Invest SICAV Global Portfolio Solution – Defensive Class X"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -692,16 +692,21 @@ if __name__ == "__main__":
|
||||||
"394778487",
|
"394778487",
|
||||||
"401684600",
|
"401684600",
|
||||||
"402113224",
|
"402113224",
|
||||||
"402181770"
|
"402181770",
|
||||||
|
"402397014",
|
||||||
|
"405803396",
|
||||||
|
"445102363",
|
||||||
|
"445256897",
|
||||||
|
"448265376"
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["402397014"]
|
# special_doc_id_list = ["391736837"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = True
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
|
|
|
||||||
|
|
@ -169,6 +169,7 @@ def get_most_similar_name(text: str,
|
||||||
text_currency = get_currency_from_text(share_name)
|
text_currency = get_currency_from_text(share_name)
|
||||||
|
|
||||||
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
||||||
|
same_max_similarity_name_list = []
|
||||||
for full_name, copy_name in zip(name_list , copy_name_list):
|
for full_name, copy_name in zip(name_list , copy_name_list):
|
||||||
copy_name = remove_special_characters(copy_name)
|
copy_name = remove_special_characters(copy_name)
|
||||||
copy_name = split_words_without_space(copy_name)
|
copy_name = split_words_without_space(copy_name)
|
||||||
|
|
@ -219,8 +220,20 @@ def get_most_similar_name(text: str,
|
||||||
continue
|
continue
|
||||||
max_similarity = similarity
|
max_similarity = similarity
|
||||||
max_similarity_full_name = full_name
|
max_similarity_full_name = full_name
|
||||||
|
same_max_similarity_name_list = []
|
||||||
|
elif matching_type == "fund" and max_similarity > 0 and max_similarity == similarity:
|
||||||
|
if full_name is not None and max_similarity_full_name is not None and \
|
||||||
|
len(full_name.split()) > len(max_similarity_full_name.split()):
|
||||||
|
max_similarity_full_name = full_name
|
||||||
|
same_max_similarity_name_list = []
|
||||||
|
else:
|
||||||
|
if full_name is not None:
|
||||||
|
same_max_similarity_name_list.append(full_name)
|
||||||
if max_similarity == 1:
|
if max_similarity == 1:
|
||||||
break
|
break
|
||||||
|
# if there are multiple names with the same similarity, return None
|
||||||
|
if len(same_max_similarity_name_list) > 0:
|
||||||
|
return None, 0.0
|
||||||
if max_similarity < 0.35:
|
if max_similarity < 0.35:
|
||||||
return None, max_similarity
|
return None, max_similarity
|
||||||
return max_similarity_full_name, max_similarity
|
return max_similarity_full_name, max_similarity
|
||||||
|
|
@ -561,6 +574,8 @@ def replace_abbrevation(text: str):
|
||||||
new_text_splits.append('Advantage')
|
new_text_splits.append('Advantage')
|
||||||
elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
|
elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
|
||||||
new_text_splits.append('Hedged')
|
new_text_splits.append('Hedged')
|
||||||
|
elif split.lower() in ['unhgd']:
|
||||||
|
split = ""
|
||||||
elif split.lower() in ['cl', 'cl.']:
|
elif split.lower() in ['cl', 'cl.']:
|
||||||
new_text_splits.append('Class')
|
new_text_splits.append('Class')
|
||||||
elif split.lower() in ['ser', 'ser.']:
|
elif split.lower() in ['ser', 'ser.']:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue