optimize investment mapping algorithm
This commit is contained in:
parent
aa2c2332ae
commit
04a2409c58
|
|
@ -174,6 +174,7 @@ class DataMapping:
|
|||
investment_info = self.matching_with_database(
|
||||
raw_name=raw_name,
|
||||
raw_share_name=raw_share_name,
|
||||
raw_fund_name=raw_fund_name,
|
||||
parent_id=fund_id,
|
||||
matching_type="share",
|
||||
process_cache=process_cache
|
||||
|
|
@ -254,6 +255,7 @@ class DataMapping:
|
|||
self,
|
||||
raw_name: str,
|
||||
raw_share_name: str = None,
|
||||
raw_fund_name: str = None,
|
||||
parent_id: str = None,
|
||||
matching_type: str = "fund",
|
||||
process_cache: dict = {}
|
||||
|
|
@ -328,9 +330,14 @@ class DataMapping:
|
|||
raw_name,
|
||||
doc_compare_name_list,
|
||||
share_name=raw_share_name,
|
||||
fund_name=raw_fund_name,
|
||||
matching_type=matching_type,
|
||||
process_cache=process_cache)
|
||||
if max_similarity is not None and max_similarity >= 0.9:
|
||||
if matching_type == "fund":
|
||||
threshold = 0.7
|
||||
else:
|
||||
threshold = 0.9
|
||||
if max_similarity is not None and max_similarity >= threshold:
|
||||
data_info["id"] = doc_compare_mapping[
|
||||
doc_compare_mapping[compare_name_dp] == max_similarity_name
|
||||
][compare_id_dp].values[0]
|
||||
|
|
@ -344,6 +351,7 @@ class DataMapping:
|
|||
raw_name,
|
||||
provider_compare_name_list,
|
||||
share_name=raw_share_name,
|
||||
fund_name=raw_fund_name,
|
||||
matching_type=matching_type,
|
||||
pre_common_word_list=pre_common_word_list,
|
||||
process_cache=process_cache
|
||||
|
|
|
|||
180
main.py
180
main.py
|
|
@ -338,7 +338,7 @@ def batch_start_job(
|
|||
|
||||
|
||||
if calculate_metrics:
|
||||
prediction_sheet_name = "mapping_data"
|
||||
prediction_sheet_name = "total_mapping_data"
|
||||
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
||||
ground_truth_sheet_name = "mapping_data"
|
||||
metrics_output_folder = r"/data/emea_ar/output/metrics/"
|
||||
|
|
@ -600,9 +600,9 @@ def test_data_extraction_metrics():
|
|||
|
||||
|
||||
def test_mapping_raw_name():
|
||||
doc_id = "394778487"
|
||||
raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD"
|
||||
raw_share_name = "Z Gross QD USD"
|
||||
doc_id = "382366116"
|
||||
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
|
||||
raw_share_name = "EUR I"
|
||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||
data_mapping = DataMapping(
|
||||
doc_id,
|
||||
|
|
@ -615,7 +615,7 @@ def test_mapping_raw_name():
|
|||
mapping_info = data_mapping.matching_with_database(
|
||||
raw_name=raw_name,
|
||||
raw_share_name=raw_share_name,
|
||||
parent_id="FS0000H1C9",
|
||||
parent_id=None,
|
||||
matching_type="share",
|
||||
process_cache=process_cache
|
||||
)
|
||||
|
|
@ -697,100 +697,102 @@ if __name__ == "__main__":
|
|||
# "479793787",
|
||||
# "471641628",
|
||||
# ]
|
||||
# check_db_mapping_doc_id_list = [
|
||||
# "292989214",
|
||||
# "316237292",
|
||||
# "321733631",
|
||||
# "323390570",
|
||||
# "327956364",
|
||||
# "332223498",
|
||||
# "333207452",
|
||||
# "334718372",
|
||||
# "344636875",
|
||||
# "362246081",
|
||||
# "366179419",
|
||||
# "380945052",
|
||||
# "382366116",
|
||||
# "387202452",
|
||||
# "389171486",
|
||||
# "391456740",
|
||||
# "391736837",
|
||||
# "394778487",
|
||||
# "401684600",
|
||||
# "402113224",
|
||||
# "402181770",
|
||||
# "402397014",
|
||||
# "405803396",
|
||||
# "445102363",
|
||||
# "445256897",
|
||||
# "448265376",
|
||||
# "449555622",
|
||||
# "449623976",
|
||||
# "458291624",
|
||||
# "458359181",
|
||||
# "463081566",
|
||||
# "469138353",
|
||||
# "471641628",
|
||||
# "476492237",
|
||||
# "478585901",
|
||||
# "478586066",
|
||||
# "479042264",
|
||||
# "479042269",
|
||||
# "479793787",
|
||||
# "481475385",
|
||||
# "483617247",
|
||||
# "486378555",
|
||||
# "486383912",
|
||||
# "492121213",
|
||||
# "497497599",
|
||||
# "502693599"
|
||||
# ]
|
||||
|
||||
check_db_mapping_doc_id_list = [
|
||||
"334584772",
|
||||
"406913630",
|
||||
"407275419",
|
||||
"337937633",
|
||||
"337293427",
|
||||
"334584772",
|
||||
"404712928",
|
||||
"451063582",
|
||||
"451878128",
|
||||
"425595958",
|
||||
"536344026",
|
||||
"532422548",
|
||||
"423418540",
|
||||
"423418395",
|
||||
"532998065",
|
||||
"540307575",
|
||||
"423395975",
|
||||
"508704368",
|
||||
"481482392",
|
||||
"466580448",
|
||||
"423365707",
|
||||
"423364758",
|
||||
"422761666",
|
||||
"422760156",
|
||||
"422760148",
|
||||
"422686965",
|
||||
"492029971",
|
||||
"510300817",
|
||||
"512745032",
|
||||
"514213638",
|
||||
"527525440",
|
||||
"534535767"
|
||||
"292989214",
|
||||
"316237292",
|
||||
"321733631",
|
||||
"323390570",
|
||||
"327956364",
|
||||
"332223498",
|
||||
"333207452",
|
||||
"334718372",
|
||||
"344636875",
|
||||
"362246081",
|
||||
"366179419",
|
||||
"380945052",
|
||||
"382366116",
|
||||
"387202452",
|
||||
"389171486",
|
||||
"391456740",
|
||||
"391736837",
|
||||
"394778487",
|
||||
"401684600",
|
||||
"402113224",
|
||||
"402181770",
|
||||
"402397014",
|
||||
"405803396",
|
||||
"445102363",
|
||||
"445256897",
|
||||
"448265376",
|
||||
"449555622",
|
||||
"449623976",
|
||||
"458291624",
|
||||
"458359181",
|
||||
"463081566",
|
||||
"469138353",
|
||||
"471641628",
|
||||
"476492237",
|
||||
"478585901",
|
||||
"478586066",
|
||||
"479042264",
|
||||
"479042269",
|
||||
"479793787",
|
||||
"481475385",
|
||||
"483617247",
|
||||
"486378555",
|
||||
"486383912",
|
||||
"492121213",
|
||||
"497497599",
|
||||
"502693599"
|
||||
]
|
||||
|
||||
# check_db_mapping_doc_id_list = [
|
||||
# "334584772",
|
||||
# "406913630",
|
||||
# "407275419",
|
||||
# "337937633",
|
||||
# "337293427",
|
||||
# "334584772",
|
||||
# "404712928",
|
||||
# "451063582",
|
||||
# "451878128",
|
||||
# "425595958",
|
||||
# "536344026",
|
||||
# "532422548",
|
||||
# "423418540",
|
||||
# "423418395",
|
||||
# "532998065",
|
||||
# "540307575",
|
||||
# "423395975",
|
||||
# "508704368",
|
||||
# "481482392",
|
||||
# "466580448",
|
||||
# "423365707",
|
||||
# "423364758",
|
||||
# "422761666",
|
||||
# "422760156",
|
||||
# "422760148",
|
||||
# "422686965",
|
||||
# "492029971",
|
||||
# "510300817",
|
||||
# "512745032",
|
||||
# "514213638",
|
||||
# "527525440",
|
||||
# "534535767"
|
||||
# ]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
# special_doc_id_list = ["337937633"]
|
||||
# special_doc_id_list = ["394778487"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
calculate_metrics = True
|
||||
|
||||
extract_ways = ["text"]
|
||||
pdf_folder = r"/data/emea_ar/small_pdf/"
|
||||
# pdf_folder = r"/data/emea_ar/pdf/"
|
||||
for extract_way in extract_ways:
|
||||
batch_start_job(
|
||||
pdf_folder,
|
||||
|
|
|
|||
|
|
@ -81,6 +81,7 @@ def clean_text(text: str) -> str:
|
|||
def get_most_similar_name(text: str,
|
||||
name_list: list,
|
||||
share_name: str = None,
|
||||
fund_name: str = None,
|
||||
matching_type="share",
|
||||
pre_common_word_list: list = None,
|
||||
process_cache: dict = None) -> str:
|
||||
|
|
@ -116,6 +117,12 @@ def get_most_similar_name(text: str,
|
|||
text = text.strip()
|
||||
text = remove_special_characters(text)
|
||||
text = replace_abbrevation(text)
|
||||
raw_fund_name_split = []
|
||||
if fund_name is not None and len(fund_name.strip()) > 0:
|
||||
fund_name = fund_name.strip()
|
||||
fund_name = remove_special_characters(fund_name)
|
||||
raw_fund_name_split = fund_name.upper().split()
|
||||
|
||||
if share_name is not None:
|
||||
share_name = remove_special_characters(share_name)
|
||||
share_name = replace_abbrevation(share_name)
|
||||
|
|
@ -171,11 +178,13 @@ def get_most_similar_name(text: str,
|
|||
text_currency = cache.get("share_currency")
|
||||
else:
|
||||
if share_name is not None and len(share_name.strip()) > 0:
|
||||
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
||||
text_share_short_name_list = get_share_short_name_from_text(share_name,
|
||||
confirm_text_share=True)
|
||||
text_feature = get_share_feature_from_text(share_name)
|
||||
text_currency = get_currency_from_text(share_name)
|
||||
else:
|
||||
text_share_short_name_list = get_share_short_name_from_text(text)
|
||||
text_share_short_name_list = get_share_short_name_from_text(text,
|
||||
confirm_text_share=True)
|
||||
text_feature = get_share_feature_from_text(text)
|
||||
text_currency = get_currency_from_text(text)
|
||||
# sort text_share_short_name_list
|
||||
|
|
@ -187,12 +196,14 @@ def get_most_similar_name(text: str,
|
|||
}
|
||||
else:
|
||||
if share_name is not None and len(share_name.strip()) > 0:
|
||||
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
||||
text_share_short_name_list = get_share_short_name_from_text(share_name,
|
||||
confirm_text_share=True)
|
||||
text_share_short_name_list.sort()
|
||||
text_feature = get_share_feature_from_text(share_name)
|
||||
text_currency = get_currency_from_text(share_name)
|
||||
else:
|
||||
text_share_short_name_list = get_share_short_name_from_text(text)
|
||||
text_share_short_name_list = get_share_short_name_from_text(text,
|
||||
confirm_text_share=True)
|
||||
text_feature = get_share_feature_from_text(text)
|
||||
text_currency = get_currency_from_text(text)
|
||||
|
||||
|
|
@ -203,24 +214,9 @@ def get_most_similar_name(text: str,
|
|||
continue
|
||||
copy_name = remove_special_characters(copy_name)
|
||||
copy_name = split_words_without_space(copy_name)
|
||||
try:
|
||||
similarity = get_jacard_similarity(text,
|
||||
copy_name,
|
||||
need_remove_numeric_characters=False)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print_exc()
|
||||
similarity = 0
|
||||
if similarity == 1:
|
||||
return full_name, similarity
|
||||
copy_name_2 = replace_abbrevation(copy_name)
|
||||
if copy_name != copy_name_2:
|
||||
similarity_2 = get_jacard_similarity(text,
|
||||
copy_name_2,
|
||||
need_remove_numeric_characters=False)
|
||||
if similarity_2 > similarity:
|
||||
similarity = similarity_2
|
||||
if similarity > max_similarity:
|
||||
copy_name_short_name_list = None
|
||||
copy_name_feature = None
|
||||
copy_name_currency = None
|
||||
if matching_type == "share":
|
||||
if process_cache is not None and isinstance(process_cache, dict):
|
||||
if process_cache.get(copy_name, None) is not None:
|
||||
|
|
@ -244,7 +240,45 @@ def get_most_similar_name(text: str,
|
|||
copy_name_short_name_list.sort()
|
||||
copy_name_feature = get_share_feature_from_text(copy_share_name)
|
||||
copy_name_currency = get_currency_from_text(copy_share_name)
|
||||
try:
|
||||
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
||||
updated_text_share_short_name_list, updated_copy_name_short_name_list = \
|
||||
compare_both_short_name(text_share_short_name_list, copy_name_short_name_list)
|
||||
|
||||
if updated_text_share_short_name_list != text_share_short_name_list:
|
||||
text = ' '.join([split for split in text.split()
|
||||
if split not in text_share_short_name_list])
|
||||
text += ' ' + ' '.join(updated_text_share_short_name_list)
|
||||
text_share_short_name_list = updated_text_share_short_name_list
|
||||
|
||||
if updated_copy_name_short_name_list != copy_name_short_name_list:
|
||||
copy_name = ' '.join([split for split in copy_name.split()
|
||||
if split not in copy_name_short_name_list])
|
||||
copy_name += ' ' + ' '.join(updated_copy_name_short_name_list)
|
||||
copy_name_short_name_list = updated_copy_name_short_name_list
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
try:
|
||||
similarity = get_jacard_similarity(text,
|
||||
copy_name,
|
||||
need_remove_numeric_characters=False)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print_exc()
|
||||
similarity = 0
|
||||
if similarity == 1:
|
||||
return full_name, similarity
|
||||
copy_name_2 = replace_abbrevation(copy_name)
|
||||
if copy_name != copy_name_2:
|
||||
similarity_2 = get_jacard_similarity(text,
|
||||
copy_name_2,
|
||||
need_remove_numeric_characters=False)
|
||||
if similarity_2 > similarity:
|
||||
similarity = similarity_2
|
||||
if similarity > max_similarity:
|
||||
if matching_type == "share":
|
||||
if text_currency is not None and len(text_currency) > 0 and \
|
||||
copy_name_currency is not None and len(copy_name_currency) > 0:
|
||||
if text_currency != copy_name_currency:
|
||||
|
|
@ -257,12 +291,18 @@ def get_most_similar_name(text: str,
|
|||
if matching_type == "share":
|
||||
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
|
||||
raw_short_not_in_compare = False
|
||||
short_name_invalid = False
|
||||
for short in text_share_short_name_list:
|
||||
if short not in copy_name_short_name_list:
|
||||
raw_short_not_in_compare = True
|
||||
short_name_invalid = True
|
||||
break
|
||||
if raw_short_not_in_compare:
|
||||
for compare_short in copy_name_short_name_list:
|
||||
if compare_short not in text_share_short_name_list:
|
||||
# some short word is in fund name, but not belong to share name
|
||||
if compare_short.upper() not in raw_fund_name_split:
|
||||
short_name_invalid = True
|
||||
break
|
||||
if short_name_invalid:
|
||||
continue
|
||||
max_similarity = similarity
|
||||
max_similarity_full_name = full_name
|
||||
|
|
@ -289,6 +329,43 @@ def get_most_similar_name(text: str,
|
|||
return None, 0.0
|
||||
|
||||
|
||||
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
    """Reconcile two short-name lists against each other without mutating the inputs.

    Both lists are deep-copied first because ``verify_short_name_container``
    modifies its first argument in place.

    Args:
        text_short_name_list: short names taken from the raw text side.
        compare_short_name_list: short names taken from the candidate name side.

    Returns:
        A ``(text_names, compare_names)`` tuple holding the reconciled copies.
    """
    text_names = deepcopy(text_short_name_list)
    compare_names = deepcopy(compare_short_name_list)

    # Order matters: the text side is reconciled first, and the compare side is
    # then checked against the already-updated text side.
    text_names = verify_short_name_container(text_names, compare_names)
    compare_names = verify_short_name_container(compare_names, text_names)
    return text_names, compare_names
|
||||
|
||||
|
||||
def verify_short_name_container(left_short_name_list: list, right_short_name_list: list):
    """Fold the length-1 entries of the left list into a combined right-hand entry.

    If the left list holds more than one entry of length 1, every right-hand
    entry whose length equals that count and which contains each of those
    length-1 entries (``in`` test) is treated as their combined form: the
    length-1 entries are removed from the left list and the combined entry is
    appended to it.

    Args:
        left_short_name_list: short names to reconcile; modified in place.
        right_short_name_list: short names searched for a combined form; only read.

    Returns:
        The same ``left_short_name_list`` object, after any merge.
    """
    single_char_names = [name for name in left_short_name_list if len(name) == 1]
    single_count = len(single_char_names)

    # Zero or one length-1 entry: there is nothing to combine.
    if single_count <= 1:
        return left_short_name_list

    for candidate in right_short_name_list:
        if len(candidate) != single_count:
            continue
        if not all(name in candidate for name in single_char_names):
            continue
        # Drop the individual entries (they may already be gone after an
        # earlier match) and record the combined form instead.
        for name in single_char_names:
            if name in left_short_name_list:
                left_short_name_list.remove(name)
        left_short_name_list.append(candidate)
    return left_short_name_list
|
||||
|
||||
|
||||
def get_share_part_list(text_list: list):
|
||||
share_part_list = []
|
||||
for text in text_list:
|
||||
|
|
@ -312,7 +389,7 @@ def get_share_part_list(text_list: list):
|
|||
return share_part_list
|
||||
|
||||
|
||||
def get_share_short_name_from_text(text: str):
|
||||
def get_share_short_name_from_text(text: str, confirm_text_share: bool = False):
|
||||
if text is None or len(text.strip()) == 0:
|
||||
return None
|
||||
text = remove_special_characters(text.strip())
|
||||
|
|
@ -321,9 +398,12 @@ def get_share_short_name_from_text(text: str):
|
|||
|
||||
count = 0
|
||||
share_short_name_list = []
|
||||
|
||||
if confirm_text_share:
|
||||
count_threshold = 6
|
||||
else:
|
||||
count_threshold = 4
|
||||
for split in text_split[::-1]:
|
||||
if count == 4:
|
||||
if count == count_threshold:
|
||||
break
|
||||
if split.lower() not in temp_share_features and \
|
||||
split.upper() not in total_currency_list:
|
||||
|
|
|
|||
Loading…
Reference in New Issue