optimize mapping algorithm
This commit is contained in:
parent
3adbd7631a
commit
035f028155
2
main.py
2
main.py
|
|
@ -704,7 +704,7 @@ if __name__ == "__main__":
|
|||
]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
# special_doc_id_list = ["333207452"]
|
||||
# special_doc_id_list = ["479042269"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ total_currency_list = [
|
|||
"XFO",
|
||||
]
|
||||
|
||||
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Dividend', 'Investor', 'Institutional', 'Admin', 'Advantage']
|
||||
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage']
|
||||
share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']
|
||||
|
||||
|
||||
|
|
@ -101,6 +101,10 @@ def get_most_similar_name(text: str,
|
|||
common_word_list.extend([word for word in pre_common_word_list
|
||||
if word not in common_word_list])
|
||||
|
||||
if len(common_word_list) > 0:
|
||||
common_word_list = [word for word in common_word_list
|
||||
if len(word) > 1 and word.upper() not in total_currency_list]
|
||||
|
||||
text = text.strip()
|
||||
text = remove_special_characters(text)
|
||||
text = replace_abbrevation(text)
|
||||
|
|
@ -224,7 +228,8 @@ def get_most_similar_name(text: str,
|
|||
if text_feature is not None and len(text_feature) > 0 and \
|
||||
copy_name_feature is not None and len(copy_name_feature) > 0:
|
||||
if text_feature != copy_name_feature:
|
||||
continue
|
||||
if copy_name_feature.lower() not in text.lower().split():
|
||||
continue
|
||||
if matching_type == "share":
|
||||
if text_share_short_name is not None and len(text_share_short_name) > 0 and \
|
||||
copy_name_short_name is not None and len(copy_name_short_name) > 0:
|
||||
|
|
@ -263,6 +268,10 @@ def get_share_part_list(text_list: list):
|
|||
text_split = text.split("funds")
|
||||
if len(text_split) == 1:
|
||||
text_split = text.split("Portfolio")
|
||||
if len(text_split) == 1:
|
||||
text_split = text.split("Bond")
|
||||
if len(text_split) == 1:
|
||||
text_split = text.split("Bonds")
|
||||
if len(text_split) > 1:
|
||||
share_part_list.append(text_split[-1].strip())
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue