optimize investment mapping algorithm
This commit is contained in:
parent
3aa596ea33
commit
60a26377e5
|
|
@ -283,16 +283,25 @@ class DataMapping:
|
|||
doc_compare_mapping = self.doc_fund_class_mapping[
|
||||
self.doc_fund_class_mapping["FundId"] == parent_id
|
||||
]
|
||||
doc_compare_name_list = (
|
||||
doc_compare_mapping["ShareClassName"].unique().tolist()
|
||||
)
|
||||
if len(doc_compare_mapping) == 0:
|
||||
doc_compare_name_list = self.doc_share_name_list
|
||||
doc_compare_mapping = self.doc_fund_class_mapping
|
||||
else:
|
||||
doc_compare_name_list = (
|
||||
doc_compare_mapping["ShareClassName"].unique().tolist()
|
||||
)
|
||||
|
||||
provider_compare_mapping = self.provider_fund_class_mapping[
|
||||
self.provider_fund_class_mapping["FundId"] == parent_id
|
||||
]
|
||||
provider_compare_name_list = (
|
||||
provider_compare_mapping["ShareClassName"].unique().tolist()
|
||||
)
|
||||
if len(provider_compare_mapping) == 0 or \
|
||||
len(provider_compare_mapping) < len(doc_compare_mapping):
|
||||
provider_compare_name_list = doc_compare_name_list
|
||||
provider_compare_mapping = doc_compare_mapping
|
||||
else:
|
||||
provider_compare_name_list = (
|
||||
provider_compare_mapping["ShareClassName"].unique().tolist()
|
||||
)
|
||||
else:
|
||||
doc_compare_name_list = self.doc_share_name_list
|
||||
doc_compare_mapping = self.doc_fund_class_mapping
|
||||
|
|
|
|||
15
main.py
15
main.py
|
|
@ -574,8 +574,8 @@ def test_data_extraction_metrics():
|
|||
|
||||
|
||||
def test_mapping_raw_name():
|
||||
doc_id = "382366116"
|
||||
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
|
||||
doc_id = "445102363"
|
||||
raw_name = "Danske Invest SICAV Global Portfolio Solution – Defensive Class X"
|
||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||
data_mapping = DataMapping(
|
||||
doc_id,
|
||||
|
|
@ -692,16 +692,21 @@ if __name__ == "__main__":
|
|||
"394778487",
|
||||
"401684600",
|
||||
"402113224",
|
||||
"402181770"
|
||||
"402181770",
|
||||
"402397014",
|
||||
"405803396",
|
||||
"445102363",
|
||||
"445256897",
|
||||
"448265376"
|
||||
]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
special_doc_id_list = ["402397014"]
|
||||
# special_doc_id_list = ["391736837"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
force_save_total_data = True
|
||||
|
||||
extract_ways = ["text"]
|
||||
for extract_way in extract_ways:
|
||||
|
|
|
|||
|
|
@ -169,6 +169,7 @@ def get_most_similar_name(text: str,
|
|||
text_currency = get_currency_from_text(share_name)
|
||||
|
||||
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
||||
same_max_similarity_name_list = []
|
||||
for full_name, copy_name in zip(name_list , copy_name_list):
|
||||
copy_name = remove_special_characters(copy_name)
|
||||
copy_name = split_words_without_space(copy_name)
|
||||
|
|
@ -219,8 +220,20 @@ def get_most_similar_name(text: str,
|
|||
continue
|
||||
max_similarity = similarity
|
||||
max_similarity_full_name = full_name
|
||||
same_max_similarity_name_list = []
|
||||
elif matching_type == "fund" and max_similarity > 0 and max_similarity == similarity:
|
||||
if full_name is not None and max_similarity_full_name is not None and \
|
||||
len(full_name.split()) > len(max_similarity_full_name.split()):
|
||||
max_similarity_full_name = full_name
|
||||
same_max_similarity_name_list = []
|
||||
else:
|
||||
if full_name is not None:
|
||||
same_max_similarity_name_list.append(full_name)
|
||||
if max_similarity == 1:
|
||||
break
|
||||
# If other candidate names tied with the best similarity and the tie could not be broken, the match is ambiguous: return (None, 0.0)
|
||||
if len(same_max_similarity_name_list) > 0:
|
||||
return None, 0.0
|
||||
if max_similarity < 0.35:
|
||||
return None, max_similarity
|
||||
return max_similarity_full_name, max_similarity
|
||||
|
|
@ -561,6 +574,8 @@ def replace_abbrevation(text: str):
|
|||
new_text_splits.append('Advantage')
|
||||
elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
|
||||
new_text_splits.append('Hedged')
|
||||
elif split.lower() in ['unhgd']:
|
||||
split = ""
|
||||
elif split.lower() in ['cl', 'cl.']:
|
||||
new_text_splits.append('Class')
|
||||
elif split.lower() in ['ser', 'ser.']:
|
||||
|
|
|
|||
Loading…
Reference in New Issue