optimize mapping algorithm
This commit is contained in:
parent
3adbd7631a
commit
035f028155
2
main.py
2
main.py
|
|
@ -704,7 +704,7 @@ if __name__ == "__main__":
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["333207452"]
|
# special_doc_id_list = ["479042269"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ total_currency_list = [
|
||||||
"XFO",
|
"XFO",
|
||||||
]
|
]
|
||||||
|
|
||||||
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Dividend', 'Investor', 'Institutional', 'Admin', 'Advantage']
|
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage']
|
||||||
share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']
|
share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -101,6 +101,10 @@ def get_most_similar_name(text: str,
|
||||||
common_word_list.extend([word for word in pre_common_word_list
|
common_word_list.extend([word for word in pre_common_word_list
|
||||||
if word not in common_word_list])
|
if word not in common_word_list])
|
||||||
|
|
||||||
|
if len(common_word_list) > 0:
|
||||||
|
common_word_list = [word for word in common_word_list
|
||||||
|
if len(word) > 1 and word.upper() not in total_currency_list]
|
||||||
|
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
text = remove_special_characters(text)
|
text = remove_special_characters(text)
|
||||||
text = replace_abbrevation(text)
|
text = replace_abbrevation(text)
|
||||||
|
|
@ -224,6 +228,7 @@ def get_most_similar_name(text: str,
|
||||||
if text_feature is not None and len(text_feature) > 0 and \
|
if text_feature is not None and len(text_feature) > 0 and \
|
||||||
copy_name_feature is not None and len(copy_name_feature) > 0:
|
copy_name_feature is not None and len(copy_name_feature) > 0:
|
||||||
if text_feature != copy_name_feature:
|
if text_feature != copy_name_feature:
|
||||||
|
if copy_name_feature.lower() not in text.lower().split():
|
||||||
continue
|
continue
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
if text_share_short_name is not None and len(text_share_short_name) > 0 and \
|
if text_share_short_name is not None and len(text_share_short_name) > 0 and \
|
||||||
|
|
@ -263,6 +268,10 @@ def get_share_part_list(text_list: list):
|
||||||
text_split = text.split("funds")
|
text_split = text.split("funds")
|
||||||
if len(text_split) == 1:
|
if len(text_split) == 1:
|
||||||
text_split = text.split("Portfolio")
|
text_split = text.split("Portfolio")
|
||||||
|
if len(text_split) == 1:
|
||||||
|
text_split = text.split("Bond")
|
||||||
|
if len(text_split) == 1:
|
||||||
|
text_split = text.split("Bonds")
|
||||||
if len(text_split) > 1:
|
if len(text_split) > 1:
|
||||||
share_part_list.append(text_split[-1].strip())
|
share_part_list.append(text_split[-1].strip())
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue