Optimize mapping algorithm: check whether a "-" is used to connect share names

This commit is contained in:
Blade He 2024-10-02 11:38:11 -05:00
parent 035f028155
commit f06355e0c8
2 changed files with 12 additions and 6 deletions

View File

@ -574,8 +574,9 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "333207452" doc_id = "481475385"
raw_name = "Rathbone SICAV Income Fund L ACC GBP" raw_name = "Emerging Markets Fund A-ACC Shares USD"
raw_share_name = "A-ACC Shares USD"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -587,7 +588,7 @@ def test_mapping_raw_name():
process_cache = {} process_cache = {}
mapping_info = data_mapping.matching_with_database( mapping_info = data_mapping.matching_with_database(
raw_name=raw_name, raw_name=raw_name,
raw_share_name=None, raw_share_name=raw_share_name,
parent_id=None, parent_id=None,
matching_type="share", matching_type="share",
process_cache=process_cache process_cache=process_cache
@ -704,7 +705,7 @@ if __name__ == "__main__":
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["479042269"] # special_doc_id_list = ["481475385"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False

View File

@ -90,8 +90,13 @@ def get_most_similar_name(text: str,
copy_name_list is None or len(copy_name_list) == 0: copy_name_list is None or len(copy_name_list) == 0:
return None, None return None, None
copy_name_list = [replace_abbrevation(copy_name) for copy_name for i in range(len(copy_name_list)):
in copy_name_list] copy_name = copy_name_list[i]
share_part = get_share_part_list([copy_name])[0]
if '-' in share_part:
copy_name = copy_name.replace('-', ' ')
copy_name = replace_abbrevation(copy_name)
copy_name_list[i] = copy_name
# get common words in fund_name_list # get common words in fund_name_list
common_word_list = [] common_word_list = []