Consider cases with multiple share short names.
This commit is contained in:
parent
edb90c718e
commit
f0dd7f9e89
10
main.py
10
main.py
|
|
@ -574,9 +574,9 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "483617247"
|
doc_id = "394778487"
|
||||||
raw_name = "CPR Invest Global Disruptive Opportunities Class I sw EUR - Acc"
|
raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD"
|
||||||
raw_share_name = "Class I sw EUR - Acc"
|
raw_share_name = "Z Gross QD USD"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -589,7 +589,7 @@ def test_mapping_raw_name():
|
||||||
mapping_info = data_mapping.matching_with_database(
|
mapping_info = data_mapping.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
raw_share_name=raw_share_name,
|
raw_share_name=raw_share_name,
|
||||||
parent_id=None,
|
parent_id="FS0000H1C9",
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
)
|
)
|
||||||
|
|
@ -705,7 +705,7 @@ if __name__ == "__main__":
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["483617247"]
|
# special_doc_id_list = ["380945052", "382366116", "387202452", "394778487", "469138353"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
|
|
|
||||||
|
|
@ -172,7 +172,7 @@ def get_most_similar_name(text: str,
|
||||||
if process_cache is not None and isinstance(process_cache, dict):
|
if process_cache is not None and isinstance(process_cache, dict):
|
||||||
if process_cache.get(text, None) is not None:
|
if process_cache.get(text, None) is not None:
|
||||||
cache = process_cache.get(text)
|
cache = process_cache.get(text)
|
||||||
text_share_short_name = cache.get("share_short_name")
|
text_share_short_name_list = cache.get("share_short_name")
|
||||||
text_feature = cache.get("share_feature")
|
text_feature = cache.get("share_feature")
|
||||||
text_currency = cache.get("share_currency")
|
text_currency = cache.get("share_currency")
|
||||||
else:
|
else:
|
||||||
|
|
@ -212,6 +212,8 @@ def get_most_similar_name(text: str,
|
||||||
print(e)
|
print(e)
|
||||||
print_exc()
|
print_exc()
|
||||||
similarity = 0
|
similarity = 0
|
||||||
|
if similarity == 1:
|
||||||
|
return full_name, similarity
|
||||||
copy_name_2 = replace_abbrevation(copy_name)
|
copy_name_2 = replace_abbrevation(copy_name)
|
||||||
if copy_name != copy_name_2:
|
if copy_name != copy_name_2:
|
||||||
similarity_2 = get_jacard_similarity(text,
|
similarity_2 = get_jacard_similarity(text,
|
||||||
|
|
@ -229,7 +231,8 @@ def get_most_similar_name(text: str,
|
||||||
copy_name_currency = cache.get("share_currency")
|
copy_name_currency = cache.get("share_currency")
|
||||||
else:
|
else:
|
||||||
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
||||||
copy_name_short_name_list.sort()
|
if copy_name_short_name_list is not None:
|
||||||
|
copy_name_short_name_list.sort()
|
||||||
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||||
copy_name_currency = get_currency_from_text(copy_share_name)
|
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||||
process_cache[copy_name] = {
|
process_cache[copy_name] = {
|
||||||
|
|
@ -255,8 +258,13 @@ def get_most_similar_name(text: str,
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||||
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
||||||
if text_share_short_name_list != copy_name_short_name_list:
|
raw_short_not_in_compare = False
|
||||||
continue
|
for short in text_share_short_name_list:
|
||||||
|
if short not in copy_name_short_name_list:
|
||||||
|
raw_short_not_in_compare = True
|
||||||
|
break
|
||||||
|
if raw_short_not_in_compare:
|
||||||
|
continue
|
||||||
max_similarity = similarity
|
max_similarity = similarity
|
||||||
max_similarity_full_name = full_name
|
max_similarity_full_name = full_name
|
||||||
same_max_similarity_name_list = []
|
same_max_similarity_name_list = []
|
||||||
|
|
@ -285,19 +293,26 @@ def get_most_similar_name(text: str,
|
||||||
def get_share_part_list(text_list: list):
|
def get_share_part_list(text_list: list):
|
||||||
share_part_list = []
|
share_part_list = []
|
||||||
for text in text_list:
|
for text in text_list:
|
||||||
text_split = text.split("Fund")
|
text_split = text.split("Funds")
|
||||||
if len(text_split) == 1:
|
if len(text_split) == 1:
|
||||||
text_split = text.split("funds")
|
text_split = text.split("Fund")
|
||||||
if len(text_split) == 1:
|
if len(text_split) == 1:
|
||||||
text_split = text.split("Portfolio")
|
text_split = text.split("Portfolio")
|
||||||
if len(text_split) == 1:
|
|
||||||
text_split = text.split("Bond")
|
|
||||||
if len(text_split) == 1:
|
if len(text_split) == 1:
|
||||||
text_split = text.split("Bonds")
|
text_split = text.split("Bonds")
|
||||||
|
if len(text_split) == 1:
|
||||||
|
text_split = text.split("Bond")
|
||||||
if len(text_split) > 1:
|
if len(text_split) > 1:
|
||||||
share_part_list.append(text_split[-1].strip())
|
share_part_text = text_split[-1].strip()
|
||||||
else:
|
else:
|
||||||
share_part_list.append(text)
|
share_part_text = text.strip()
|
||||||
|
share_part_text = ' '.join([split for split in share_part_text.split()
|
||||||
|
if remove_special_characters(split).lower()
|
||||||
|
not in ['fund', "funds", 'portfolio',
|
||||||
|
'bond', 'bonds',
|
||||||
|
'class', 'classes',
|
||||||
|
'share', 'shares']])
|
||||||
|
share_part_list.append(share_part_text)
|
||||||
return share_part_list
|
return share_part_list
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -316,9 +331,20 @@ def get_share_short_name_from_text(text: str):
|
||||||
break
|
break
|
||||||
if split.lower() not in temp_share_features and \
|
if split.lower() not in temp_share_features and \
|
||||||
split.upper() not in total_currency_list:
|
split.upper() not in total_currency_list:
|
||||||
if len(split) <= 3 and split.upper() == split:
|
if len(split) <= 3:
|
||||||
share_short_name_list.append(split.upper())
|
share_short_name_list.append(split.upper())
|
||||||
count += 1
|
count += 1
|
||||||
|
|
||||||
|
if len(share_short_name_list) > 1:
|
||||||
|
remove_number = []
|
||||||
|
for short_name in share_short_name_list[::-1]:
|
||||||
|
if short_name.isdigit():
|
||||||
|
remove_number.append(short_name)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
for remove in remove_number:
|
||||||
|
if remove in share_short_name_list:
|
||||||
|
share_short_name_list.remove(remove)
|
||||||
return share_short_name_list
|
return share_short_name_list
|
||||||
|
|
||||||
def get_share_feature_from_text(text: str):
|
def get_share_feature_from_text(text: str):
|
||||||
|
|
@ -481,6 +507,7 @@ def remove_common_word(text_list: list):
|
||||||
# the result is ['Global', 'Growth']
|
# the result is ['Global', 'Growth']
|
||||||
common_word_list = []
|
common_word_list = []
|
||||||
new_text_splits_list = [text.split() for text in new_text_list]
|
new_text_splits_list = [text.split() for text in new_text_list]
|
||||||
|
with_common_word = False
|
||||||
for i in range(len(new_text_splits_list)):
|
for i in range(len(new_text_splits_list)):
|
||||||
for j in range(i+1, len(new_text_splits_list)):
|
for j in range(i+1, len(new_text_splits_list)):
|
||||||
if common_word_list is None or len(common_word_list) == 0:
|
if common_word_list is None or len(common_word_list) == 0:
|
||||||
|
|
@ -489,6 +516,12 @@ def remove_common_word(text_list: list):
|
||||||
else:
|
else:
|
||||||
common_word_list = list(
|
common_word_list = list(
|
||||||
set(common_word_list).intersection(set(new_text_splits_list[j])))
|
set(common_word_list).intersection(set(new_text_splits_list[j])))
|
||||||
|
if len(common_word_list) > 0:
|
||||||
|
with_common_word = True
|
||||||
|
if with_common_word and len(common_word_list) == 0:
|
||||||
|
break
|
||||||
|
if with_common_word and len(common_word_list) == 0:
|
||||||
|
break
|
||||||
|
|
||||||
remove_list = []
|
remove_list = []
|
||||||
# if the share name and currency name exist, remove them from the list
|
# if the share name and currency name exist, remove them from the list
|
||||||
|
|
@ -631,7 +664,8 @@ def get_beginning_common_words(text_list: list):
|
||||||
def replace_abbrevation(text: str):
|
def replace_abbrevation(text: str):
|
||||||
if text is None or len(text.strip()) == 0:
|
if text is None or len(text.strip()) == 0:
|
||||||
return text
|
return text
|
||||||
text = text.strip()
|
text = text.replace('(', ' ').replace(')', ' ').replace('-', ' ')
|
||||||
|
text = re.sub(r'\s+', ' ', text).strip()
|
||||||
if 'swiss franc' in text.lower():
|
if 'swiss franc' in text.lower():
|
||||||
text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE)
|
text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE)
|
||||||
elif 'us dollar' in text.lower():
|
elif 'us dollar' in text.lower():
|
||||||
|
|
@ -710,6 +744,8 @@ def replace_abbrevation(text: str):
|
||||||
new_text_splits.append('US')
|
new_text_splits.append('US')
|
||||||
elif split.lower() in ['nc', 'nc.']:
|
elif split.lower() in ['nc', 'nc.']:
|
||||||
new_text_splits.append('no trail')
|
new_text_splits.append('no trail')
|
||||||
|
elif split.lower() in ['non']:
|
||||||
|
new_text_splits.append('Not')
|
||||||
else:
|
else:
|
||||||
new_text_splits.append(split)
|
new_text_splits.append(split)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue