optimize investment mapping algorithm
This commit is contained in:
parent
aa2c2332ae
commit
04a2409c58
|
|
@ -174,6 +174,7 @@ class DataMapping:
|
||||||
investment_info = self.matching_with_database(
|
investment_info = self.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
raw_share_name=raw_share_name,
|
raw_share_name=raw_share_name,
|
||||||
|
raw_fund_name=raw_fund_name,
|
||||||
parent_id=fund_id,
|
parent_id=fund_id,
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
|
|
@ -254,6 +255,7 @@ class DataMapping:
|
||||||
self,
|
self,
|
||||||
raw_name: str,
|
raw_name: str,
|
||||||
raw_share_name: str = None,
|
raw_share_name: str = None,
|
||||||
|
raw_fund_name: str = None,
|
||||||
parent_id: str = None,
|
parent_id: str = None,
|
||||||
matching_type: str = "fund",
|
matching_type: str = "fund",
|
||||||
process_cache: dict = {}
|
process_cache: dict = {}
|
||||||
|
|
@ -328,9 +330,14 @@ class DataMapping:
|
||||||
raw_name,
|
raw_name,
|
||||||
doc_compare_name_list,
|
doc_compare_name_list,
|
||||||
share_name=raw_share_name,
|
share_name=raw_share_name,
|
||||||
|
fund_name=raw_fund_name,
|
||||||
matching_type=matching_type,
|
matching_type=matching_type,
|
||||||
process_cache=process_cache)
|
process_cache=process_cache)
|
||||||
if max_similarity is not None and max_similarity >= 0.9:
|
if matching_type == "fund":
|
||||||
|
threshold = 0.7
|
||||||
|
else:
|
||||||
|
threshold = 0.9
|
||||||
|
if max_similarity is not None and max_similarity >= threshold:
|
||||||
data_info["id"] = doc_compare_mapping[
|
data_info["id"] = doc_compare_mapping[
|
||||||
doc_compare_mapping[compare_name_dp] == max_similarity_name
|
doc_compare_mapping[compare_name_dp] == max_similarity_name
|
||||||
][compare_id_dp].values[0]
|
][compare_id_dp].values[0]
|
||||||
|
|
@ -344,6 +351,7 @@ class DataMapping:
|
||||||
raw_name,
|
raw_name,
|
||||||
provider_compare_name_list,
|
provider_compare_name_list,
|
||||||
share_name=raw_share_name,
|
share_name=raw_share_name,
|
||||||
|
fund_name=raw_fund_name,
|
||||||
matching_type=matching_type,
|
matching_type=matching_type,
|
||||||
pre_common_word_list=pre_common_word_list,
|
pre_common_word_list=pre_common_word_list,
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
|
|
|
||||||
180
main.py
180
main.py
|
|
@ -338,7 +338,7 @@ def batch_start_job(
|
||||||
|
|
||||||
|
|
||||||
if calculate_metrics:
|
if calculate_metrics:
|
||||||
prediction_sheet_name = "mapping_data"
|
prediction_sheet_name = "total_mapping_data"
|
||||||
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
||||||
ground_truth_sheet_name = "mapping_data"
|
ground_truth_sheet_name = "mapping_data"
|
||||||
metrics_output_folder = r"/data/emea_ar/output/metrics/"
|
metrics_output_folder = r"/data/emea_ar/output/metrics/"
|
||||||
|
|
@ -600,9 +600,9 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "394778487"
|
doc_id = "382366116"
|
||||||
raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD"
|
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
|
||||||
raw_share_name = "Z Gross QD USD"
|
raw_share_name = "EUR I"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -615,7 +615,7 @@ def test_mapping_raw_name():
|
||||||
mapping_info = data_mapping.matching_with_database(
|
mapping_info = data_mapping.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
raw_share_name=raw_share_name,
|
raw_share_name=raw_share_name,
|
||||||
parent_id="FS0000H1C9",
|
parent_id=None,
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
)
|
)
|
||||||
|
|
@ -697,100 +697,102 @@ if __name__ == "__main__":
|
||||||
# "479793787",
|
# "479793787",
|
||||||
# "471641628",
|
# "471641628",
|
||||||
# ]
|
# ]
|
||||||
# check_db_mapping_doc_id_list = [
|
|
||||||
# "292989214",
|
|
||||||
# "316237292",
|
|
||||||
# "321733631",
|
|
||||||
# "323390570",
|
|
||||||
# "327956364",
|
|
||||||
# "332223498",
|
|
||||||
# "333207452",
|
|
||||||
# "334718372",
|
|
||||||
# "344636875",
|
|
||||||
# "362246081",
|
|
||||||
# "366179419",
|
|
||||||
# "380945052",
|
|
||||||
# "382366116",
|
|
||||||
# "387202452",
|
|
||||||
# "389171486",
|
|
||||||
# "391456740",
|
|
||||||
# "391736837",
|
|
||||||
# "394778487",
|
|
||||||
# "401684600",
|
|
||||||
# "402113224",
|
|
||||||
# "402181770",
|
|
||||||
# "402397014",
|
|
||||||
# "405803396",
|
|
||||||
# "445102363",
|
|
||||||
# "445256897",
|
|
||||||
# "448265376",
|
|
||||||
# "449555622",
|
|
||||||
# "449623976",
|
|
||||||
# "458291624",
|
|
||||||
# "458359181",
|
|
||||||
# "463081566",
|
|
||||||
# "469138353",
|
|
||||||
# "471641628",
|
|
||||||
# "476492237",
|
|
||||||
# "478585901",
|
|
||||||
# "478586066",
|
|
||||||
# "479042264",
|
|
||||||
# "479042269",
|
|
||||||
# "479793787",
|
|
||||||
# "481475385",
|
|
||||||
# "483617247",
|
|
||||||
# "486378555",
|
|
||||||
# "486383912",
|
|
||||||
# "492121213",
|
|
||||||
# "497497599",
|
|
||||||
# "502693599"
|
|
||||||
# ]
|
|
||||||
|
|
||||||
check_db_mapping_doc_id_list = [
|
check_db_mapping_doc_id_list = [
|
||||||
"334584772",
|
"292989214",
|
||||||
"406913630",
|
"316237292",
|
||||||
"407275419",
|
"321733631",
|
||||||
"337937633",
|
"323390570",
|
||||||
"337293427",
|
"327956364",
|
||||||
"334584772",
|
"332223498",
|
||||||
"404712928",
|
"333207452",
|
||||||
"451063582",
|
"334718372",
|
||||||
"451878128",
|
"344636875",
|
||||||
"425595958",
|
"362246081",
|
||||||
"536344026",
|
"366179419",
|
||||||
"532422548",
|
"380945052",
|
||||||
"423418540",
|
"382366116",
|
||||||
"423418395",
|
"387202452",
|
||||||
"532998065",
|
"389171486",
|
||||||
"540307575",
|
"391456740",
|
||||||
"423395975",
|
"391736837",
|
||||||
"508704368",
|
"394778487",
|
||||||
"481482392",
|
"401684600",
|
||||||
"466580448",
|
"402113224",
|
||||||
"423365707",
|
"402181770",
|
||||||
"423364758",
|
"402397014",
|
||||||
"422761666",
|
"405803396",
|
||||||
"422760156",
|
"445102363",
|
||||||
"422760148",
|
"445256897",
|
||||||
"422686965",
|
"448265376",
|
||||||
"492029971",
|
"449555622",
|
||||||
"510300817",
|
"449623976",
|
||||||
"512745032",
|
"458291624",
|
||||||
"514213638",
|
"458359181",
|
||||||
"527525440",
|
"463081566",
|
||||||
"534535767"
|
"469138353",
|
||||||
|
"471641628",
|
||||||
|
"476492237",
|
||||||
|
"478585901",
|
||||||
|
"478586066",
|
||||||
|
"479042264",
|
||||||
|
"479042269",
|
||||||
|
"479793787",
|
||||||
|
"481475385",
|
||||||
|
"483617247",
|
||||||
|
"486378555",
|
||||||
|
"486383912",
|
||||||
|
"492121213",
|
||||||
|
"497497599",
|
||||||
|
"502693599"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# check_db_mapping_doc_id_list = [
|
||||||
|
# "334584772",
|
||||||
|
# "406913630",
|
||||||
|
# "407275419",
|
||||||
|
# "337937633",
|
||||||
|
# "337293427",
|
||||||
|
# "334584772",
|
||||||
|
# "404712928",
|
||||||
|
# "451063582",
|
||||||
|
# "451878128",
|
||||||
|
# "425595958",
|
||||||
|
# "536344026",
|
||||||
|
# "532422548",
|
||||||
|
# "423418540",
|
||||||
|
# "423418395",
|
||||||
|
# "532998065",
|
||||||
|
# "540307575",
|
||||||
|
# "423395975",
|
||||||
|
# "508704368",
|
||||||
|
# "481482392",
|
||||||
|
# "466580448",
|
||||||
|
# "423365707",
|
||||||
|
# "423364758",
|
||||||
|
# "422761666",
|
||||||
|
# "422760156",
|
||||||
|
# "422760148",
|
||||||
|
# "422686965",
|
||||||
|
# "492029971",
|
||||||
|
# "510300817",
|
||||||
|
# "512745032",
|
||||||
|
# "514213638",
|
||||||
|
# "527525440",
|
||||||
|
# "534535767"
|
||||||
|
# ]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["337937633"]
|
# special_doc_id_list = ["394778487"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = True
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
pdf_folder = r"/data/emea_ar/small_pdf/"
|
||||||
|
# pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
batch_start_job(
|
batch_start_job(
|
||||||
pdf_folder,
|
pdf_folder,
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,7 @@ def clean_text(text: str) -> str:
|
||||||
def get_most_similar_name(text: str,
|
def get_most_similar_name(text: str,
|
||||||
name_list: list,
|
name_list: list,
|
||||||
share_name: str = None,
|
share_name: str = None,
|
||||||
|
fund_name: str = None,
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
pre_common_word_list: list = None,
|
pre_common_word_list: list = None,
|
||||||
process_cache: dict = None) -> str:
|
process_cache: dict = None) -> str:
|
||||||
|
|
@ -116,6 +117,12 @@ def get_most_similar_name(text: str,
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
text = remove_special_characters(text)
|
text = remove_special_characters(text)
|
||||||
text = replace_abbrevation(text)
|
text = replace_abbrevation(text)
|
||||||
|
raw_fund_name_split = []
|
||||||
|
if fund_name is not None and len(fund_name.strip()) > 0:
|
||||||
|
fund_name = fund_name.strip()
|
||||||
|
fund_name = remove_special_characters(fund_name)
|
||||||
|
raw_fund_name_split = fund_name.upper().split()
|
||||||
|
|
||||||
if share_name is not None:
|
if share_name is not None:
|
||||||
share_name = remove_special_characters(share_name)
|
share_name = remove_special_characters(share_name)
|
||||||
share_name = replace_abbrevation(share_name)
|
share_name = replace_abbrevation(share_name)
|
||||||
|
|
@ -171,11 +178,13 @@ def get_most_similar_name(text: str,
|
||||||
text_currency = cache.get("share_currency")
|
text_currency = cache.get("share_currency")
|
||||||
else:
|
else:
|
||||||
if share_name is not None and len(share_name.strip()) > 0:
|
if share_name is not None and len(share_name.strip()) > 0:
|
||||||
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
text_share_short_name_list = get_share_short_name_from_text(share_name,
|
||||||
|
confirm_text_share=True)
|
||||||
text_feature = get_share_feature_from_text(share_name)
|
text_feature = get_share_feature_from_text(share_name)
|
||||||
text_currency = get_currency_from_text(share_name)
|
text_currency = get_currency_from_text(share_name)
|
||||||
else:
|
else:
|
||||||
text_share_short_name_list = get_share_short_name_from_text(text)
|
text_share_short_name_list = get_share_short_name_from_text(text,
|
||||||
|
confirm_text_share=True)
|
||||||
text_feature = get_share_feature_from_text(text)
|
text_feature = get_share_feature_from_text(text)
|
||||||
text_currency = get_currency_from_text(text)
|
text_currency = get_currency_from_text(text)
|
||||||
# sort text_share_short_name_list
|
# sort text_share_short_name_list
|
||||||
|
|
@ -187,12 +196,14 @@ def get_most_similar_name(text: str,
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
if share_name is not None and len(share_name.strip()) > 0:
|
if share_name is not None and len(share_name.strip()) > 0:
|
||||||
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
text_share_short_name_list = get_share_short_name_from_text(share_name,
|
||||||
|
confirm_text_share=True)
|
||||||
text_share_short_name_list.sort()
|
text_share_short_name_list.sort()
|
||||||
text_feature = get_share_feature_from_text(share_name)
|
text_feature = get_share_feature_from_text(share_name)
|
||||||
text_currency = get_currency_from_text(share_name)
|
text_currency = get_currency_from_text(share_name)
|
||||||
else:
|
else:
|
||||||
text_share_short_name_list = get_share_short_name_from_text(text)
|
text_share_short_name_list = get_share_short_name_from_text(text,
|
||||||
|
confirm_text_share=True)
|
||||||
text_feature = get_share_feature_from_text(text)
|
text_feature = get_share_feature_from_text(text)
|
||||||
text_currency = get_currency_from_text(text)
|
text_currency = get_currency_from_text(text)
|
||||||
|
|
||||||
|
|
@ -203,6 +214,52 @@ def get_most_similar_name(text: str,
|
||||||
continue
|
continue
|
||||||
copy_name = remove_special_characters(copy_name)
|
copy_name = remove_special_characters(copy_name)
|
||||||
copy_name = split_words_without_space(copy_name)
|
copy_name = split_words_without_space(copy_name)
|
||||||
|
copy_name_short_name_list = None
|
||||||
|
copy_name_feature = None
|
||||||
|
copy_name_currency = None
|
||||||
|
if matching_type == "share":
|
||||||
|
if process_cache is not None and isinstance(process_cache, dict):
|
||||||
|
if process_cache.get(copy_name, None) is not None:
|
||||||
|
cache = process_cache.get(copy_name)
|
||||||
|
copy_name_short_name_list = cache.get("share_short_name")
|
||||||
|
copy_name_feature = cache.get("share_feature")
|
||||||
|
copy_name_currency = cache.get("share_currency")
|
||||||
|
else:
|
||||||
|
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
||||||
|
if copy_name_short_name_list is not None:
|
||||||
|
copy_name_short_name_list.sort()
|
||||||
|
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||||
|
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||||
|
process_cache[copy_name] = {
|
||||||
|
"share_short_name": copy_name_short_name_list,
|
||||||
|
"share_feature": copy_name_feature,
|
||||||
|
"share_currency": copy_name_currency
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
||||||
|
copy_name_short_name_list.sort()
|
||||||
|
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||||
|
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||||
|
try:
|
||||||
|
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||||
|
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
||||||
|
updated_text_share_short_name_list, updated_copy_name_short_name_list = \
|
||||||
|
compare_both_short_name(text_share_short_name_list, copy_name_short_name_list)
|
||||||
|
|
||||||
|
if updated_text_share_short_name_list != text_share_short_name_list:
|
||||||
|
text = ' '.join([split for split in text.split()
|
||||||
|
if split not in text_share_short_name_list])
|
||||||
|
text += ' ' + ' '.join(updated_text_share_short_name_list)
|
||||||
|
text_share_short_name_list = updated_text_share_short_name_list
|
||||||
|
|
||||||
|
if updated_copy_name_short_name_list != copy_name_short_name_list:
|
||||||
|
copy_name = ' '.join([split for split in copy_name.split()
|
||||||
|
if split not in copy_name_short_name_list])
|
||||||
|
copy_name += ' ' + ' '.join(updated_copy_name_short_name_list)
|
||||||
|
copy_name_short_name_list = updated_copy_name_short_name_list
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
similarity = get_jacard_similarity(text,
|
similarity = get_jacard_similarity(text,
|
||||||
copy_name,
|
copy_name,
|
||||||
|
|
@ -221,30 +278,7 @@ def get_most_similar_name(text: str,
|
||||||
if similarity_2 > similarity:
|
if similarity_2 > similarity:
|
||||||
similarity = similarity_2
|
similarity = similarity_2
|
||||||
if similarity > max_similarity:
|
if similarity > max_similarity:
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
if process_cache is not None and isinstance(process_cache, dict):
|
|
||||||
if process_cache.get(copy_name, None) is not None:
|
|
||||||
cache = process_cache.get(copy_name)
|
|
||||||
copy_name_short_name_list = cache.get("share_short_name")
|
|
||||||
copy_name_feature = cache.get("share_feature")
|
|
||||||
copy_name_currency = cache.get("share_currency")
|
|
||||||
else:
|
|
||||||
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
|
||||||
if copy_name_short_name_list is not None:
|
|
||||||
copy_name_short_name_list.sort()
|
|
||||||
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
|
||||||
copy_name_currency = get_currency_from_text(copy_share_name)
|
|
||||||
process_cache[copy_name] = {
|
|
||||||
"share_short_name": copy_name_short_name_list,
|
|
||||||
"share_feature": copy_name_feature,
|
|
||||||
"share_currency": copy_name_currency
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
|
|
||||||
copy_name_short_name_list.sort()
|
|
||||||
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
|
||||||
copy_name_currency = get_currency_from_text(copy_share_name)
|
|
||||||
|
|
||||||
if text_currency is not None and len(text_currency) > 0 and \
|
if text_currency is not None and len(text_currency) > 0 and \
|
||||||
copy_name_currency is not None and len(copy_name_currency) > 0:
|
copy_name_currency is not None and len(copy_name_currency) > 0:
|
||||||
if text_currency != copy_name_currency:
|
if text_currency != copy_name_currency:
|
||||||
|
|
@ -257,12 +291,18 @@ def get_most_similar_name(text: str,
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||||
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
||||||
raw_short_not_in_compare = False
|
short_name_invalid = False
|
||||||
for short in text_share_short_name_list:
|
for short in text_share_short_name_list:
|
||||||
if short not in copy_name_short_name_list:
|
if short not in copy_name_short_name_list:
|
||||||
raw_short_not_in_compare = True
|
short_name_invalid = True
|
||||||
break
|
break
|
||||||
if raw_short_not_in_compare:
|
for compare_short in copy_name_short_name_list:
|
||||||
|
if compare_short not in text_share_short_name_list:
|
||||||
|
# some short word is in fund name, but not belong to share name
|
||||||
|
if compare_short.upper() not in raw_fund_name_split:
|
||||||
|
short_name_invalid = True
|
||||||
|
break
|
||||||
|
if short_name_invalid:
|
||||||
continue
|
continue
|
||||||
max_similarity = similarity
|
max_similarity = similarity
|
||||||
max_similarity_full_name = full_name
|
max_similarity_full_name = full_name
|
||||||
|
|
@ -289,6 +329,43 @@ def get_most_similar_name(text: str,
|
||||||
return None, 0.0
|
return None, 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
    """Reconcile the short-name tokens of a raw text and of a candidate name.

    Both inputs are deep-copied first, so the lists passed in by the caller
    are left untouched; the reconciliation runs on the copies only.

    Args:
        text_short_name_list: short-name tokens extracted from the raw text.
        compare_short_name_list: short-name tokens extracted from the
            candidate name being compared against.

    Returns:
        A ``(text_tokens, compare_tokens)`` tuple holding the possibly
        updated copies of the two inputs.
    """
    text_tokens = deepcopy(text_short_name_list)
    compare_tokens = deepcopy(compare_short_name_list)

    # Order matters: the text side is reconciled against the candidate first,
    # then the candidate side is reconciled against the already-updated text.
    text_tokens = verify_short_name_container(text_tokens, compare_tokens)
    compare_tokens = verify_short_name_container(compare_tokens, text_tokens)

    return text_tokens, compare_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def verify_short_name_container(left_short_name_list: list, right_short_name_list: list) -> list:
    """Merge separate single-letter tokens when the other side has them fused.

    If ``left_short_name_list`` contains two or more one-character tokens
    (e.g. ``["A", "B"]``) and ``right_short_name_list`` contains a token
    whose length equals the number of those one-character tokens and which
    contains every one of them (e.g. ``"AB"``), the one-character tokens are
    removed from the left list and that fused token is appended instead.

    Only the first matching right-hand token is used: once the single-letter
    tokens have been consumed there is nothing left to merge, so appending
    further matches would only add duplicate or unrelated tokens.

    Args:
        left_short_name_list: tokens to normalise. Modified in place.
        right_short_name_list: tokens of the other side, searched for a
            fused form of the left side's single-letter tokens.

    Returns:
        ``left_short_name_list`` itself (the same list object), updated when
        a merge happened and unchanged otherwise.
    """
    single_letters = [name for name in left_short_name_list if len(name) == 1]
    # A merge only makes sense when at least two single letters are present.
    if len(single_letters) < 2:
        return left_short_name_list

    for candidate in right_short_name_list:
        if len(candidate) != len(single_letters):
            continue
        if all(letter in candidate for letter in single_letters):
            # Drop every single-letter token and keep the remaining tokens in
            # their original order; slice assignment keeps the mutation
            # in place for callers that hold a reference to the list.
            left_short_name_list[:] = [
                name for name in left_short_name_list if len(name) != 1
            ]
            left_short_name_list.append(candidate)
            # The single letters are consumed; stop so later matching tokens
            # are not appended as well.
            break
    return left_short_name_list
|
||||||
|
|
||||||
|
|
||||||
def get_share_part_list(text_list: list):
|
def get_share_part_list(text_list: list):
|
||||||
share_part_list = []
|
share_part_list = []
|
||||||
for text in text_list:
|
for text in text_list:
|
||||||
|
|
@ -312,7 +389,7 @@ def get_share_part_list(text_list: list):
|
||||||
return share_part_list
|
return share_part_list
|
||||||
|
|
||||||
|
|
||||||
def get_share_short_name_from_text(text: str):
|
def get_share_short_name_from_text(text: str, confirm_text_share: bool = False):
|
||||||
if text is None or len(text.strip()) == 0:
|
if text is None or len(text.strip()) == 0:
|
||||||
return None
|
return None
|
||||||
text = remove_special_characters(text.strip())
|
text = remove_special_characters(text.strip())
|
||||||
|
|
@ -321,15 +398,18 @@ def get_share_short_name_from_text(text: str):
|
||||||
|
|
||||||
count = 0
|
count = 0
|
||||||
share_short_name_list = []
|
share_short_name_list = []
|
||||||
|
if confirm_text_share:
|
||||||
|
count_threshold = 6
|
||||||
|
else:
|
||||||
|
count_threshold = 4
|
||||||
for split in text_split[::-1]:
|
for split in text_split[::-1]:
|
||||||
if count == 4:
|
if count == count_threshold:
|
||||||
break
|
break
|
||||||
if split.lower() not in temp_share_features and \
|
if split.lower() not in temp_share_features and \
|
||||||
split.upper() not in total_currency_list:
|
split.upper() not in total_currency_list:
|
||||||
if len(split) <= 3:
|
if len(split) <= 3:
|
||||||
share_short_name_list.append(split.upper())
|
share_short_name_list.append(split.upper())
|
||||||
count += 1
|
count += 1
|
||||||
|
|
||||||
if len(share_short_name_list) > 1:
|
if len(share_short_name_list) > 1:
|
||||||
remove_number = []
|
remove_number = []
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue