Optimize mapping algorithm
Some share class names contain multiple short names, e.g. "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc", where the short names are "I" and "sw". The purpose of this change is to support getting all of the short names from a share class name.
This commit is contained in:
parent
3bb13947af
commit
edb90c718e
12
main.py
12
main.py
|
|
@ -574,9 +574,9 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "389171486"
|
doc_id = "483617247"
|
||||||
raw_name = "Nordea 2 Emerging Market Local Debt Enhanced Fund Y - Shares"
|
raw_name = "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc"
|
||||||
raw_share_name = "Y - Shares"
|
raw_share_name = "Class I sw EUR - Acc"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -589,7 +589,7 @@ def test_mapping_raw_name():
|
||||||
mapping_info = data_mapping.matching_with_database(
|
mapping_info = data_mapping.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
raw_share_name=raw_share_name,
|
raw_share_name=raw_share_name,
|
||||||
parent_id="FS00009Q8R",
|
parent_id=None,
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
)
|
)
|
||||||
|
|
@ -705,12 +705,12 @@ if __name__ == "__main__":
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["483617247"]
|
# special_doc_id_list = ["483617247"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = True
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
|
|
|
||||||
|
|
@ -167,7 +167,7 @@ def get_most_similar_name(text: str,
|
||||||
'share', 'shares']])
|
'share', 'shares']])
|
||||||
text_currency = None
|
text_currency = None
|
||||||
text_feature = None
|
text_feature = None
|
||||||
text_share_short_name = None
|
text_share_short_name_list = None
|
||||||
if matching_type == "share" and text is not None and len(text.strip()) > 0:
|
if matching_type == "share" and text is not None and len(text.strip()) > 0:
|
||||||
if process_cache is not None and isinstance(process_cache, dict):
|
if process_cache is not None and isinstance(process_cache, dict):
|
||||||
if process_cache.get(text, None) is not None:
|
if process_cache.get(text, None) is not None:
|
||||||
|
|
@ -177,20 +177,23 @@ def get_most_similar_name(text: str,
|
||||||
text_currency = cache.get("share_currency")
|
text_currency = cache.get("share_currency")
|
||||||
else:
|
else:
|
||||||
if share_name is not None and len(share_name.strip()) > 0:
|
if share_name is not None and len(share_name.strip()) > 0:
|
||||||
text_share_short_name = get_share_short_name_from_text(share_name)
|
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
||||||
text_feature = get_share_feature_from_text(share_name)
|
text_feature = get_share_feature_from_text(share_name)
|
||||||
text_currency = get_currency_from_text(share_name)
|
text_currency = get_currency_from_text(share_name)
|
||||||
else:
|
else:
|
||||||
text_share_short_name = get_share_short_name_from_text(text)
|
text_share_short_name_list = get_share_short_name_from_text(text)
|
||||||
text_feature = get_share_feature_from_text(text)
|
text_feature = get_share_feature_from_text(text)
|
||||||
text_currency = get_currency_from_text(text)
|
text_currency = get_currency_from_text(text)
|
||||||
|
# sort text_share_short_name_list
|
||||||
|
text_share_short_name_list.sort()
|
||||||
process_cache[text] = {
|
process_cache[text] = {
|
||||||
"share_short_name": text_share_short_name,
|
"share_short_name": text_share_short_name_list,
|
||||||
"share_feature": text_feature,
|
"share_feature": text_feature,
|
||||||
"share_currency": text_currency
|
"share_currency": text_currency
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
text_share_short_name = get_share_short_name_from_text(share_name)
|
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
||||||
|
text_share_short_name_list.sort()
|
||||||
text_feature = get_share_feature_from_text(share_name)
|
text_feature = get_share_feature_from_text(share_name)
|
||||||
text_currency = get_currency_from_text(share_name)
|
text_currency = get_currency_from_text(share_name)
|
||||||
|
|
||||||
|
|
@ -221,20 +224,22 @@ def get_most_similar_name(text: str,
|
||||||
if process_cache is not None and isinstance(process_cache, dict):
|
if process_cache is not None and isinstance(process_cache, dict):
|
||||||
if process_cache.get(copy_name, None) is not None:
|
if process_cache.get(copy_name, None) is not None:
|
||||||
cache = process_cache.get(copy_name)
|
cache = process_cache.get(copy_name)
|
||||||
copy_name_short_name = cache.get("share_short_name")
|
copy_name_short_name_list = cache.get("share_short_name")
|
||||||
copy_name_feature = cache.get("share_feature")
|
copy_name_feature = cache.get("share_feature")
|
||||||
copy_name_currency = cache.get("share_currency")
|
copy_name_currency = cache.get("share_currency")
|
||||||
else:
|
else:
|
||||||
copy_name_short_name = get_share_short_name_from_text(copy_share_name)
|
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
||||||
|
copy_name_short_name_list.sort()
|
||||||
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||||
copy_name_currency = get_currency_from_text(copy_share_name)
|
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||||
process_cache[copy_name] = {
|
process_cache[copy_name] = {
|
||||||
"share_short_name": copy_name_short_name,
|
"share_short_name": copy_name_short_name_list,
|
||||||
"share_feature": copy_name_feature,
|
"share_feature": copy_name_feature,
|
||||||
"share_currency": copy_name_currency
|
"share_currency": copy_name_currency
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
copy_name_short_name = get_share_short_name_from_text(copy_share_name)
|
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
||||||
|
copy_name_short_name_list.sort()
|
||||||
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||||
copy_name_currency = get_currency_from_text(copy_share_name)
|
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||||
|
|
||||||
|
|
@ -248,9 +253,9 @@ def get_most_similar_name(text: str,
|
||||||
if copy_name_feature.lower() not in text.lower().split():
|
if copy_name_feature.lower() not in text.lower().split():
|
||||||
continue
|
continue
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
if text_share_short_name is not None and len(text_share_short_name) > 0 and \
|
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||||
copy_name_short_name is not None and len(copy_name_short_name) > 0:
|
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
||||||
if text_share_short_name != copy_name_short_name:
|
if text_share_short_name_list != copy_name_short_name_list:
|
||||||
continue
|
continue
|
||||||
max_similarity = similarity
|
max_similarity = similarity
|
||||||
max_similarity_full_name = full_name
|
max_similarity_full_name = full_name
|
||||||
|
|
@ -304,15 +309,17 @@ def get_share_short_name_from_text(text: str):
|
||||||
temp_share_features = [feature.lower() for feature in share_features_full_name]
|
temp_share_features = [feature.lower() for feature in share_features_full_name]
|
||||||
|
|
||||||
count = 0
|
count = 0
|
||||||
|
share_short_name_list = []
|
||||||
|
|
||||||
for split in text_split[::-1]:
|
for split in text_split[::-1]:
|
||||||
if count == 4:
|
if count == 4:
|
||||||
break
|
break
|
||||||
if split.lower() not in temp_share_features and \
|
if split.lower() not in temp_share_features and \
|
||||||
split not in total_currency_list:
|
split.upper() not in total_currency_list:
|
||||||
if len(split) <= 3 and split.upper() == split:
|
if len(split) <= 3 and split.upper() == split:
|
||||||
return split.upper()
|
share_short_name_list.append(split.upper())
|
||||||
count += 1
|
count += 1
|
||||||
return None
|
return share_short_name_list
|
||||||
|
|
||||||
def get_share_feature_from_text(text: str):
|
def get_share_feature_from_text(text: str):
|
||||||
if text is None or len(text.strip()) == 0:
|
if text is None or len(text.strip()) == 0:
|
||||||
|
|
@ -370,14 +377,20 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
if not with_currency and len(with_currency_list) == 0:
|
if not with_currency and len(with_currency_list) == 0:
|
||||||
pass
|
pass
|
||||||
elif not with_currency and len(with_currency_list) > 0:
|
elif not with_currency and len(with_currency_list) > 0:
|
||||||
share_short_name = ""
|
share_short_name_list = []
|
||||||
if share_name is not None and len(share_name.strip()) > 0:
|
if share_name is not None and len(share_name.strip()) > 0:
|
||||||
share_short_name = get_share_short_name_from_text(share_name)
|
share_short_name_list = get_share_short_name_from_text(share_name)
|
||||||
updated = False
|
updated = False
|
||||||
if len(share_short_name) < 4 and share_short_name.upper() == share_short_name:
|
if len(share_short_name_list) > 0:
|
||||||
if len(without_currency_list) > 0:
|
if len(without_currency_list) > 0:
|
||||||
for index in without_currency_list:
|
for index in without_currency_list:
|
||||||
if share_short_name in compare_list[index].split():
|
all_in_list = True
|
||||||
|
compare_split = [split.upper() for split in compare_list[index].split()]
|
||||||
|
for share_shot_name in share_short_name_list:
|
||||||
|
if share_shot_name not in compare_split:
|
||||||
|
all_in_list = False
|
||||||
|
break
|
||||||
|
if all_in_list:
|
||||||
text = text + ' ' + 'USD'
|
text = text + ' ' + 'USD'
|
||||||
if share_name is not None:
|
if share_name is not None:
|
||||||
share_name = share_name + ' ' + 'USD'
|
share_name = share_name + ' ' + 'USD'
|
||||||
|
|
@ -386,8 +399,13 @@ def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
if not updated:
|
if not updated:
|
||||||
currency_list = []
|
currency_list = []
|
||||||
for index in with_currency_list:
|
for index in with_currency_list:
|
||||||
compare_split = compare_list[index].split()
|
all_in_list = True
|
||||||
if share_short_name in compare_split:
|
compare_split = [split.upper() for split in compare_list[index].split()]
|
||||||
|
for share_shot_name in share_short_name_list:
|
||||||
|
if share_shot_name not in compare_split:
|
||||||
|
all_in_list = False
|
||||||
|
break
|
||||||
|
if all_in_list:
|
||||||
current_currency_list = [split for split in compare_split
|
current_currency_list = [split for split in compare_split
|
||||||
if split.upper() in total_currency_list]
|
if split.upper() in total_currency_list]
|
||||||
if len(current_currency_list) > 0:
|
if len(current_currency_list) > 0:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue