optimize mapping algorithm
This commit is contained in:
parent
d92053a16e
commit
3adbd7631a
8
main.py
8
main.py
|
|
@ -574,8 +574,8 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "469138353"
|
doc_id = "333207452"
|
||||||
raw_name = "Manulife Global Fund ASEAN Equity Fund I USD"
|
raw_name = "Rathbone SICAV Income Fund L ACC GBP"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -704,11 +704,11 @@ if __name__ == "__main__":
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["469138353"]
|
# special_doc_id_list = ["333207452"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,10 @@ def get_most_similar_name(text: str,
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
text = remove_special_characters(text)
|
text = remove_special_characters(text)
|
||||||
text = replace_abbrevation(text)
|
text = replace_abbrevation(text)
|
||||||
|
if share_name is not None:
|
||||||
|
share_name = remove_special_characters(share_name)
|
||||||
|
share_name = replace_abbrevation(share_name)
|
||||||
|
|
||||||
text_splits = text.split()
|
text_splits = text.split()
|
||||||
if len(text_splits) == 1:
|
if len(text_splits) == 1:
|
||||||
text = split_words_without_space(text)
|
text = split_words_without_space(text)
|
||||||
|
|
@ -123,14 +127,6 @@ def get_most_similar_name(text: str,
|
||||||
temp_splits = copy_name_list[i].split()
|
temp_splits = copy_name_list[i].split()
|
||||||
copy_name_list[i] = ' '.join([split for split in temp_splits
|
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||||
if remove_special_characters(split).lower() != word])
|
if remove_special_characters(split).lower() != word])
|
||||||
|
|
||||||
for i in range(len(copy_name_list)):
|
|
||||||
temp_splits = copy_name_list[i].split()
|
|
||||||
copy_name_list[i] = ' '.join([split for split in temp_splits
|
|
||||||
if remove_special_characters(split).lower()
|
|
||||||
not in ['fund', "funds", 'portfolio',
|
|
||||||
'class', 'classes',
|
|
||||||
'share', 'shares']])
|
|
||||||
final_splits = []
|
final_splits = []
|
||||||
for split in new_splits:
|
for split in new_splits:
|
||||||
if split.lower() not in ['fund', "funds", 'portfolio',
|
if split.lower() not in ['fund', "funds", 'portfolio',
|
||||||
|
|
@ -139,11 +135,22 @@ def get_most_similar_name(text: str,
|
||||||
final_splits.append(split)
|
final_splits.append(split)
|
||||||
|
|
||||||
text = ' '.join(final_splits)
|
text = ' '.join(final_splits)
|
||||||
|
|
||||||
|
copy_share_name_list = get_share_part_list(copy_name_list)
|
||||||
|
|
||||||
|
for i in range(len(copy_name_list)):
|
||||||
|
temp_splits = copy_name_list[i].split()
|
||||||
|
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||||
|
if remove_special_characters(split).lower()
|
||||||
|
not in ['fund', "funds", 'portfolio',
|
||||||
|
'class', 'classes',
|
||||||
|
'share', 'shares']])
|
||||||
max_similarity = 0
|
max_similarity = 0
|
||||||
max_similarity_full_name = None
|
max_similarity_full_name = None
|
||||||
text = remove_special_characters(text)
|
text = remove_special_characters(text)
|
||||||
|
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
text, copy_name_list = update_for_currency(text, copy_name_list)
|
text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
|
||||||
text_currency = None
|
text_currency = None
|
||||||
text_feature = None
|
text_feature = None
|
||||||
text_share_short_name = None
|
text_share_short_name = None
|
||||||
|
|
@ -155,9 +162,14 @@ def get_most_similar_name(text: str,
|
||||||
text_feature = cache.get("share_feature")
|
text_feature = cache.get("share_feature")
|
||||||
text_currency = cache.get("share_currency")
|
text_currency = cache.get("share_currency")
|
||||||
else:
|
else:
|
||||||
text_share_short_name = get_share_short_name_from_text(text)
|
if share_name is not None and len(share_name.strip()) > 0:
|
||||||
text_feature = get_share_feature_from_text(text)
|
text_share_short_name = get_share_short_name_from_text(share_name)
|
||||||
text_currency = get_currency_from_text(text)
|
text_feature = get_share_feature_from_text(share_name)
|
||||||
|
text_currency = get_currency_from_text(share_name)
|
||||||
|
else:
|
||||||
|
text_share_short_name = get_share_short_name_from_text(text)
|
||||||
|
text_feature = get_share_feature_from_text(text)
|
||||||
|
text_currency = get_currency_from_text(text)
|
||||||
process_cache[text] = {
|
process_cache[text] = {
|
||||||
"share_short_name": text_share_short_name,
|
"share_short_name": text_share_short_name,
|
||||||
"share_feature": text_feature,
|
"share_feature": text_feature,
|
||||||
|
|
@ -170,7 +182,7 @@ def get_most_similar_name(text: str,
|
||||||
|
|
||||||
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
||||||
same_max_similarity_name_list = []
|
same_max_similarity_name_list = []
|
||||||
for full_name, copy_name in zip(name_list , copy_name_list):
|
for full_name, copy_name, copy_share_name in zip(name_list , copy_name_list, copy_share_name_list):
|
||||||
copy_name = remove_special_characters(copy_name)
|
copy_name = remove_special_characters(copy_name)
|
||||||
copy_name = split_words_without_space(copy_name)
|
copy_name = split_words_without_space(copy_name)
|
||||||
similarity = get_jacard_similarity(text,
|
similarity = get_jacard_similarity(text,
|
||||||
|
|
@ -192,18 +204,18 @@ def get_most_similar_name(text: str,
|
||||||
copy_name_feature = cache.get("share_feature")
|
copy_name_feature = cache.get("share_feature")
|
||||||
copy_name_currency = cache.get("share_currency")
|
copy_name_currency = cache.get("share_currency")
|
||||||
else:
|
else:
|
||||||
copy_name_short_name = get_share_short_name_from_text(copy_name)
|
copy_name_short_name = get_share_short_name_from_text(copy_share_name)
|
||||||
copy_name_feature = get_share_feature_from_text(copy_name)
|
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||||
copy_name_currency = get_currency_from_text(copy_name)
|
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||||
process_cache[copy_name] = {
|
process_cache[copy_name] = {
|
||||||
"share_short_name": copy_name_short_name,
|
"share_short_name": copy_name_short_name,
|
||||||
"share_feature": copy_name_feature,
|
"share_feature": copy_name_feature,
|
||||||
"share_currency": copy_name_currency
|
"share_currency": copy_name_currency
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
copy_name_short_name = get_share_short_name_from_text(copy_name)
|
copy_name_short_name = get_share_short_name_from_text(copy_share_name)
|
||||||
copy_name_feature = get_share_feature_from_text(copy_name)
|
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||||
copy_name_currency = get_currency_from_text(copy_name)
|
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||||
|
|
||||||
if text_currency is not None and len(text_currency) > 0 and \
|
if text_currency is not None and len(text_currency) > 0 and \
|
||||||
copy_name_currency is not None and len(copy_name_currency) > 0:
|
copy_name_currency is not None and len(copy_name_currency) > 0:
|
||||||
|
|
@ -242,10 +254,26 @@ def get_most_similar_name(text: str,
|
||||||
print_exc()
|
print_exc()
|
||||||
return None, 0.0
|
return None, 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def get_share_part_list(text_list: list):
    """Return, for each name, the text that follows its fund/portfolio keyword.

    Each entry of ``text_list`` is searched for the last whole-word
    occurrence of "fund"/"funds"; when neither occurs, the last whole-word
    occurrence of "portfolio" is used instead. The text after that keyword,
    stripped of surrounding whitespace, is the share part. Entries without
    any keyword are returned unchanged.

    Matching is case-insensitive and only accepts the keyword as a whole
    word, so "Funds" no longer leaves a stray leading "s" in the share part
    and words such as "Fundamental" are not split in the middle.

    Args:
        text_list: list of name strings.

    Returns:
        A list of the same length as ``text_list`` holding the share part
        of each name (an empty string when the name ends with the keyword).
    """
    # Local import: the file's top-level import block is not visible from
    # this section, so the dependency is kept inside the function.
    import re

    # Letter-only look-arounds: the keyword may touch digits or punctuation
    # ("Fund2", "Fund-A") but must not be part of a longer alphabetic word.
    fund_pattern = re.compile(r"(?<![A-Za-z])funds?(?![A-Za-z])", re.IGNORECASE)
    portfolio_pattern = re.compile(r"(?<![A-Za-z])portfolio(?![A-Za-z])", re.IGNORECASE)

    share_part_list = []
    for text in text_list:
        # "fund"/"funds" takes priority; "portfolio" is only a fallback.
        matches = list(fund_pattern.finditer(text)) or list(portfolio_pattern.finditer(text))
        if matches:
            # Everything after the last keyword occurrence is the share part.
            share_part_list.append(text[matches[-1].end():].strip())
        else:
            share_part_list.append(text)
    return share_part_list
|
||||||
|
|
||||||
|
|
||||||
def get_share_short_name_from_text(text: str):
|
def get_share_short_name_from_text(text: str):
|
||||||
if text is None or len(text.strip()) == 0:
|
if text is None or len(text.strip()) == 0:
|
||||||
return None
|
return None
|
||||||
text = text.strip()
|
text = remove_special_characters(text.strip())
|
||||||
text_split = text.split()
|
text_split = text.split()
|
||||||
temp_share_features = [feature.lower() for feature in share_features_full_name]
|
temp_share_features = [feature.lower() for feature in share_features_full_name]
|
||||||
|
|
||||||
|
|
@ -292,7 +320,7 @@ def get_currency_from_text(text: str):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def update_for_currency(text: str, compare_list: list):
|
def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
text_split = text.split()
|
text_split = text.split()
|
||||||
with_currency = False
|
with_currency = False
|
||||||
for split in text_split:
|
for split in text_split:
|
||||||
|
|
@ -314,7 +342,7 @@ def update_for_currency(text: str, compare_list: list):
|
||||||
else:
|
else:
|
||||||
without_currency_list.append(index)
|
without_currency_list.append(index)
|
||||||
if not with_currency and len(with_currency_list) == 0:
|
if not with_currency and len(with_currency_list) == 0:
|
||||||
return text, compare_list
|
return text, share_name, compare_list
|
||||||
elif not with_currency and len(with_currency_list) > 0:
|
elif not with_currency and len(with_currency_list) > 0:
|
||||||
last_split = text_split[-1]
|
last_split = text_split[-1]
|
||||||
updated = False
|
updated = False
|
||||||
|
|
@ -323,6 +351,8 @@ def update_for_currency(text: str, compare_list: list):
|
||||||
for index in without_currency_list:
|
for index in without_currency_list:
|
||||||
if last_split in compare_list[index].split():
|
if last_split in compare_list[index].split():
|
||||||
text = text + ' ' + 'USD'
|
text = text + ' ' + 'USD'
|
||||||
|
if share_name is not None:
|
||||||
|
share_name = share_name + ' ' + 'USD'
|
||||||
updated = True
|
updated = True
|
||||||
break
|
break
|
||||||
if not updated:
|
if not updated:
|
||||||
|
|
@ -336,6 +366,8 @@ def update_for_currency(text: str, compare_list: list):
|
||||||
currency_list.append(current_currency_list[-1])
|
currency_list.append(current_currency_list[-1])
|
||||||
if len(currency_list) == 1:
|
if len(currency_list) == 1:
|
||||||
text = text + ' ' + currency_list[0]
|
text = text + ' ' + currency_list[0]
|
||||||
|
if share_name is not None:
|
||||||
|
share_name = share_name + ' ' + currency_list[0]
|
||||||
updated = True
|
updated = True
|
||||||
|
|
||||||
for index in without_currency_list:
|
for index in without_currency_list:
|
||||||
|
|
@ -343,13 +375,15 @@ def update_for_currency(text: str, compare_list: list):
|
||||||
|
|
||||||
if not updated:
|
if not updated:
|
||||||
text = text + ' ' + 'USD'
|
text = text + ' ' + 'USD'
|
||||||
return text, compare_list
|
if share_name is not None:
|
||||||
|
share_name = share_name + ' ' + 'USD'
|
||||||
|
return text, share_name, compare_list
|
||||||
elif with_currency and len(without_currency_list) == 0:
|
elif with_currency and len(without_currency_list) == 0:
|
||||||
for index in without_currency_list:
|
for index in without_currency_list:
|
||||||
compare_list[index] = compare_list[index] + ' ' + 'USD'
|
compare_list[index] = compare_list[index] + ' ' + 'USD'
|
||||||
return text, compare_list
|
return text, share_name, compare_list
|
||||||
else:
|
else:
|
||||||
return text, compare_list
|
return text, share_name, compare_list
|
||||||
|
|
||||||
|
|
||||||
def remove_common_word(text_list: list):
|
def remove_common_word(text_list: list):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue