optimize investment mapping algorithm

Blade He 2024-10-08 23:53:55 -05:00
parent aa2c2332ae
commit 04a2409c58
3 changed files with 215 additions and 125 deletions


@@ -174,6 +174,7 @@ class DataMapping:
investment_info = self.matching_with_database(
raw_name=raw_name,
raw_share_name=raw_share_name,
raw_fund_name=raw_fund_name,
parent_id=fund_id,
matching_type="share",
process_cache=process_cache
@@ -254,6 +255,7 @@ class DataMapping:
self,
raw_name: str,
raw_share_name: str = None,
raw_fund_name: str = None,
parent_id: str = None,
matching_type: str = "fund",
process_cache: dict = {}
@@ -328,9 +330,14 @@ class DataMapping:
raw_name,
doc_compare_name_list,
share_name=raw_share_name,
fund_name=raw_fund_name,
matching_type=matching_type,
process_cache=process_cache)
if max_similarity is not None and max_similarity >= 0.9:
if matching_type == "fund":
threshold = 0.7
else:
threshold = 0.9
if max_similarity is not None and max_similarity >= threshold:
data_info["id"] = doc_compare_mapping[
doc_compare_mapping[compare_name_dp] == max_similarity_name
][compare_id_dp].values[0]
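
The hunk above makes the acceptance threshold depend on the matching type: fund-level matches now pass at a similarity of 0.7, while share-level matches still require 0.9. A minimal, self-contained sketch of that selection rule (the helper name below is illustrative, not part of the codebase):

def accept_match(max_similarity, matching_type: str) -> bool:
    # Fund names vary more between documents than share-class names,
    # so the commit lowers the fund threshold to 0.7 and keeps shares at 0.9.
    threshold = 0.7 if matching_type == "fund" else 0.9
    return max_similarity is not None and max_similarity >= threshold

# A fund candidate at 0.75 is now accepted; a share candidate at 0.75 is still rejected.
print(accept_match(0.75, "fund"))   # True
print(accept_match(0.75, "share"))  # False
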
@@ -344,6 +351,7 @@ class DataMapping:
raw_name,
provider_compare_name_list,
share_name=raw_share_name,
fund_name=raw_fund_name,
matching_type=matching_type,
pre_common_word_list=pre_common_word_list,
process_cache=process_cache
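
Throughout this file the commit threads a new raw_fund_name argument from the share-level call site into matching_with_database and on to get_most_similar_name. A minimal sketch of that parameter threading with stand-in functions (the real logic lives on DataMapping and in the similarity helpers; names and values below are illustrative only):

def get_most_similar_name_stub(text, name_list, share_name=None, fund_name=None,
                               matching_type="share", process_cache=None):
    # The real helper uses fund_name to excuse candidate short names that come from
    # the fund rather than the share class (see the third file in this commit).
    return (name_list[0], 1.0) if name_list else (None, 0.0)

def matching_with_database_stub(raw_name, raw_share_name=None, raw_fund_name=None,
                                parent_id=None, matching_type="fund", process_cache=None):
    candidates = ["Example Sub-Fund Z Gross QD USD"]
    return get_most_similar_name_stub(raw_name, candidates,
                                      share_name=raw_share_name,
                                      fund_name=raw_fund_name,  # new argument in this commit
                                      matching_type=matching_type,
                                      process_cache=process_cache if process_cache is not None else {})

print(matching_with_database_stub("Example Umbrella Fund Sub-Fund Z Gross QD USD",
                                  raw_share_name="Z Gross QD USD",
                                  raw_fund_name="Example Umbrella Fund Sub-Fund",
                                  matching_type="share"))
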

main.py (180 changed lines)

@@ -338,7 +338,7 @@ def batch_start_job(
if calculate_metrics:
prediction_sheet_name = "mapping_data"
prediction_sheet_name = "total_mapping_data"
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
ground_truth_sheet_name = "mapping_data"
metrics_output_folder = r"/data/emea_ar/output/metrics/"
@@ -600,9 +600,9 @@ def test_data_extraction_metrics():
def test_mapping_raw_name():
doc_id = "394778487"
raw_name = "Invesco Global Real Assets Fund FCP-RAIF Invesco Global Property Plus Fund Z Gross QD USD"
raw_share_name = "Z Gross QD USD"
doc_id = "382366116"
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
raw_share_name = "EUR I"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping(
doc_id,
@@ -615,7 +615,7 @@ def test_mapping_raw_name():
mapping_info = data_mapping.matching_with_database(
raw_name=raw_name,
raw_share_name=raw_share_name,
parent_id="FS0000H1C9",
parent_id=None,
matching_type="share",
process_cache=process_cache
)
@@ -697,100 +697,102 @@ if __name__ == "__main__":
# "479793787",
# "471641628",
# ]
# check_db_mapping_doc_id_list = [
# "292989214",
# "316237292",
# "321733631",
# "323390570",
# "327956364",
# "332223498",
# "333207452",
# "334718372",
# "344636875",
# "362246081",
# "366179419",
# "380945052",
# "382366116",
# "387202452",
# "389171486",
# "391456740",
# "391736837",
# "394778487",
# "401684600",
# "402113224",
# "402181770",
# "402397014",
# "405803396",
# "445102363",
# "445256897",
# "448265376",
# "449555622",
# "449623976",
# "458291624",
# "458359181",
# "463081566",
# "469138353",
# "471641628",
# "476492237",
# "478585901",
# "478586066",
# "479042264",
# "479042269",
# "479793787",
# "481475385",
# "483617247",
# "486378555",
# "486383912",
# "492121213",
# "497497599",
# "502693599"
# ]
check_db_mapping_doc_id_list = [
"334584772",
"406913630",
"407275419",
"337937633",
"337293427",
"334584772",
"404712928",
"451063582",
"451878128",
"425595958",
"536344026",
"532422548",
"423418540",
"423418395",
"532998065",
"540307575",
"423395975",
"508704368",
"481482392",
"466580448",
"423365707",
"423364758",
"422761666",
"422760156",
"422760148",
"422686965",
"492029971",
"510300817",
"512745032",
"514213638",
"527525440",
"534535767"
"292989214",
"316237292",
"321733631",
"323390570",
"327956364",
"332223498",
"333207452",
"334718372",
"344636875",
"362246081",
"366179419",
"380945052",
"382366116",
"387202452",
"389171486",
"391456740",
"391736837",
"394778487",
"401684600",
"402113224",
"402181770",
"402397014",
"405803396",
"445102363",
"445256897",
"448265376",
"449555622",
"449623976",
"458291624",
"458359181",
"463081566",
"469138353",
"471641628",
"476492237",
"478585901",
"478586066",
"479042264",
"479042269",
"479793787",
"481475385",
"483617247",
"486378555",
"486383912",
"492121213",
"497497599",
"502693599"
]
# check_db_mapping_doc_id_list = [
# "334584772",
# "406913630",
# "407275419",
# "337937633",
# "337293427",
# "334584772",
# "404712928",
# "451063582",
# "451878128",
# "425595958",
# "536344026",
# "532422548",
# "423418540",
# "423418395",
# "532998065",
# "540307575",
# "423395975",
# "508704368",
# "481482392",
# "466580448",
# "423365707",
# "423364758",
# "422761666",
# "422760156",
# "422760148",
# "422686965",
# "492029971",
# "510300817",
# "512745032",
# "514213638",
# "527525440",
# "534535767"
# ]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["337937633"]
# special_doc_id_list = ["394778487"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False
re_run_mapping_data = False
re_run_mapping_data = True
force_save_total_data = True
calculate_metrics = False
calculate_metrics = True
extract_ways = ["text"]
pdf_folder = r"/data/emea_ar/small_pdf/"
# pdf_folder = r"/data/emea_ar/pdf/"
for extract_way in extract_ways:
batch_start_job(
pdf_folder,


@@ -81,6 +81,7 @@ def clean_text(text: str) -> str:
def get_most_similar_name(text: str,
name_list: list,
share_name: str = None,
fund_name: str = None,
matching_type="share",
pre_common_word_list: list = None,
process_cache: dict = None) -> str:
@@ -116,6 +117,12 @@ def get_most_similar_name(text: str,
text = text.strip()
text = remove_special_characters(text)
text = replace_abbrevation(text)
raw_fund_name_split = []
if fund_name is not None and len(fund_name.strip()) > 0:
fund_name = fund_name.strip()
fund_name = remove_special_characters(fund_name)
raw_fund_name_split = fund_name.upper().split()
if share_name is not None:
share_name = remove_special_characters(share_name)
share_name = replace_abbrevation(share_name)
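
The added block normalises fund_name the same way the raw text is normalised and keeps its upper-cased tokens in raw_fund_name_split for the share-short-name check later in the function. A self-contained sketch of what that split produces, with a simplified stand-in for the module's remove_special_characters:

import re

def remove_special_characters_stub(text: str) -> str:
    # Stand-in only; the real helper is defined elsewhere in this module.
    return re.sub(r"[^A-Za-z0-9 ]+", " ", text)

fund_name = " Invesco Global Property Plus Fund "
raw_fund_name_split = []
if fund_name is not None and len(fund_name.strip()) > 0:
    fund_name = remove_special_characters_stub(fund_name.strip())
    raw_fund_name_split = fund_name.upper().split()

print(raw_fund_name_split)
# ['INVESCO', 'GLOBAL', 'PROPERTY', 'PLUS', 'FUND']
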
@@ -171,11 +178,13 @@ def get_most_similar_name(text: str,
text_currency = cache.get("share_currency")
else:
if share_name is not None and len(share_name.strip()) > 0:
text_share_short_name_list = get_share_short_name_from_text(share_name)
text_share_short_name_list = get_share_short_name_from_text(share_name,
confirm_text_share=True)
text_feature = get_share_feature_from_text(share_name)
text_currency = get_currency_from_text(share_name)
else:
text_share_short_name_list = get_share_short_name_from_text(text)
text_share_short_name_list = get_share_short_name_from_text(text,
confirm_text_share=True)
text_feature = get_share_feature_from_text(text)
text_currency = get_currency_from_text(text)
# sort text_share_short_name_list
@@ -187,12 +196,14 @@ def get_most_similar_name(text: str,
}
else:
if share_name is not None and len(share_name.strip()) > 0:
text_share_short_name_list = get_share_short_name_from_text(share_name)
text_share_short_name_list = get_share_short_name_from_text(share_name,
confirm_text_share=True)
text_share_short_name_list.sort()
text_feature = get_share_feature_from_text(share_name)
text_currency = get_currency_from_text(share_name)
else:
text_share_short_name_list = get_share_short_name_from_text(text)
text_share_short_name_list = get_share_short_name_from_text(text,
confirm_text_share=True)
text_feature = get_share_feature_from_text(text)
text_currency = get_currency_from_text(text)
@@ -203,6 +214,52 @@ def get_most_similar_name(text: str,
continue
copy_name = remove_special_characters(copy_name)
copy_name = split_words_without_space(copy_name)
copy_name_short_name_list = None
copy_name_feature = None
copy_name_currency = None
if matching_type == "share":
if process_cache is not None and isinstance(process_cache, dict):
if process_cache.get(copy_name, None) is not None:
cache = process_cache.get(copy_name)
copy_name_short_name_list = cache.get("share_short_name")
copy_name_feature = cache.get("share_feature")
copy_name_currency = cache.get("share_currency")
else:
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
if copy_name_short_name_list is not None:
copy_name_short_name_list.sort()
copy_name_feature = get_share_feature_from_text(copy_share_name)
copy_name_currency = get_currency_from_text(copy_share_name)
process_cache[copy_name] = {
"share_short_name": copy_name_short_name_list,
"share_feature": copy_name_feature,
"share_currency": copy_name_currency
}
else:
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
copy_name_short_name_list.sort()
copy_name_feature = get_share_feature_from_text(copy_share_name)
copy_name_currency = get_currency_from_text(copy_share_name)
try:
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
updated_text_share_short_name_list, updated_copy_name_short_name_list = \
compare_both_short_name(text_share_short_name_list, copy_name_short_name_list)
if updated_text_share_short_name_list != text_share_short_name_list:
text = ' '.join([split for split in text.split()
if split not in text_share_short_name_list])
text += ' ' + ' '.join(updated_text_share_short_name_list)
text_share_short_name_list = updated_text_share_short_name_list
if updated_copy_name_short_name_list != copy_name_short_name_list:
copy_name = ' '.join([split for split in copy_name.split()
if split not in copy_name_short_name_list])
copy_name += ' ' + ' '.join(updated_copy_name_short_name_list)
copy_name_short_name_list = updated_copy_name_short_name_list
except Exception as e:
print(e)
try:
similarity = get_jacard_similarity(text,
copy_name,
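
The candidate's short names, share features, and currency are now computed (and cached in process_cache) before the similarity score instead of after it, so the short-name reconciliation above can use them. A minimal sketch of that cache-then-compute pattern, with a hypothetical extractor standing in for the module's get_share_* helpers:

from typing import Optional

def extract_candidate_features_stub(name: str) -> dict:
    # Hypothetical stand-in for get_share_short_name_from_text,
    # get_share_feature_from_text and get_currency_from_text.
    tokens = name.upper().split()
    currencies = {"USD", "EUR", "GBP"}
    return {
        "share_short_name": sorted(t for t in tokens if len(t) <= 3 and t not in currencies),
        "share_feature": [t for t in tokens if t in {"ACC", "DIST", "GROSS"}],
        "share_currency": next((t for t in tokens if t in currencies), None),
    }

def get_candidate_features(copy_name: str, process_cache: Optional[dict]) -> dict:
    if isinstance(process_cache, dict) and copy_name in process_cache:
        return process_cache[copy_name]           # reuse work done for an earlier raw name
    features = extract_candidate_features_stub(copy_name)
    if isinstance(process_cache, dict):
        process_cache[copy_name] = features       # memoise for the next comparison
    return features

cache: dict = {}
print(get_candidate_features("Example Sub-Fund Z Gross QD USD", cache))
print(list(cache))   # the candidate is now cached
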
@@ -221,30 +278,7 @@ def get_most_similar_name(text: str,
if similarity_2 > similarity:
similarity = similarity_2
if similarity > max_similarity:
if matching_type == "share":
if process_cache is not None and isinstance(process_cache, dict):
if process_cache.get(copy_name, None) is not None:
cache = process_cache.get(copy_name)
copy_name_short_name_list = cache.get("share_short_name")
copy_name_feature = cache.get("share_feature")
copy_name_currency = cache.get("share_currency")
else:
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
if copy_name_short_name_list is not None:
copy_name_short_name_list.sort()
copy_name_feature = get_share_feature_from_text(copy_share_name)
copy_name_currency = get_currency_from_text(copy_share_name)
process_cache[copy_name] = {
"share_short_name": copy_name_short_name_list,
"share_feature": copy_name_feature,
"share_currency": copy_name_currency
}
else:
copy_name_short_name_list = get_share_short_name_from_text(copy_share_name)
copy_name_short_name_list.sort()
copy_name_feature = get_share_feature_from_text(copy_share_name)
copy_name_currency = get_currency_from_text(copy_share_name)
if matching_type == "share":
if text_currency is not None and len(text_currency) > 0 and \
copy_name_currency is not None and len(copy_name_currency) > 0:
if text_currency != copy_name_currency:
@@ -257,12 +291,18 @@ def get_most_similar_name(text: str,
if matching_type == "share":
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
copy_name_short_name_list is not None and len(copy_name_short_name_list) > 0:
raw_short_not_in_compare = False
short_name_invalid = False
for short in text_share_short_name_list:
if short not in copy_name_short_name_list:
raw_short_not_in_compare = True
short_name_invalid = True
break
if raw_short_not_in_compare:
for compare_short in copy_name_short_name_list:
if compare_short not in text_share_short_name_list:
# some short words come from the fund name and do not belong to the share name
if compare_short.upper() not in raw_fund_name_split:
short_name_invalid = True
break
if short_name_invalid:
continue
max_similarity = similarity
max_similarity_full_name = full_name
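
As the added lines read, a candidate is now only skipped when the mismatch is two-sided: the raw text contains a short name the candidate lacks, and the candidate contains a short name that is neither in the raw text nor explained by the fund name. A self-contained sketch of that rule (the function name and values below are illustrative):

def share_short_names_conflict(text_short_names: list,
                               candidate_short_names: list,
                               raw_fund_name_split: list) -> bool:
    # Mirrors the short_name_invalid check above as a standalone predicate.
    raw_short_not_in_compare = any(s not in candidate_short_names for s in text_short_names)
    if not raw_short_not_in_compare:
        return False
    for compare_short in candidate_short_names:
        if compare_short not in text_short_names:
            # A leftover short word is tolerated when it simply comes from the fund name.
            if compare_short.upper() not in raw_fund_name_split:
                return True
    return False

fund_split = ["GLOBAL", "ESG", "FUND"]
print(share_short_names_conflict(["Z", "QD"], ["QD", "ESG"], fund_split))  # False: "ESG" is in the fund name
print(share_short_names_conflict(["Z", "QD"], ["QD", "X"], fund_split))    # True: "X" is unexplained
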
@@ -289,6 +329,43 @@ def get_most_similar_name(text: str,
return None, 0.0
def compare_both_short_name(text_short_name_list: list, compare_short_name_list: list):
copy_text_short_name_list = deepcopy(text_short_name_list)
copy_compare_short_name_list = deepcopy(compare_short_name_list)
copy_text_short_name_list = verify_short_name_container(copy_text_short_name_list,
copy_compare_short_name_list)
copy_compare_short_name_list = verify_short_name_container(copy_compare_short_name_list,
copy_text_short_name_list)
return copy_text_short_name_list, copy_compare_short_name_list
def verify_short_name_container(left_short_name_list: list, right_short_name_list: list):
length_1_over_1 = False
length_1_count = 0
length_1_list = []
for short_name in left_short_name_list:
if len(short_name) == 1:
length_1_count += 1
length_1_list.append(short_name)
if length_1_count > 1:
length_1_over_1 = True
if length_1_over_1:
for compare_short_name in right_short_name_list:
if len(compare_short_name) == length_1_count:
all_in = True
for short_name in length_1_list:
if short_name not in compare_short_name:
all_in = False
break
if all_in:
for short_name in length_1_list:
if short_name in left_short_name_list:
left_short_name_list.remove(short_name)
left_short_name_list.append(compare_short_name)
return left_short_name_list
def get_share_part_list(text_list: list):
share_part_list = []
for text in text_list:
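
verify_short_name_container, added above, handles the case where one side split a short name into single letters while the other kept it joined: when a list holds more than one single-character token and the other list has a token whose length equals the number of those letters and which contains all of them, the single letters are replaced by the joined token. A usage sketch, assuming compare_both_short_name from this commit is in scope (the lists are illustrative):

text_short_names = ["D", "Q", "USD"]     # the raw text split "QD" into single letters
compare_short_names = ["QD", "USD"]      # the database candidate kept it joined

updated_text, updated_compare = compare_both_short_name(text_short_names,
                                                        compare_short_names)
print(updated_text)      # ['USD', 'QD'] -> 'D' and 'Q' collapsed into 'QD'
print(updated_compare)   # ['QD', 'USD'] -> unchanged
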
@@ -312,7 +389,7 @@ def get_share_part_list(text_list: list):
return share_part_list
def get_share_short_name_from_text(text: str):
def get_share_short_name_from_text(text: str, confirm_text_share: bool = False):
if text is None or len(text.strip()) == 0:
return None
text = remove_special_characters(text.strip())
@@ -321,15 +398,18 @@ def get_share_short_name_from_text(text: str):
count = 0
share_short_name_list = []
if confirm_text_share:
count_threshold = 6
else:
count_threshold = 4
for split in text_split[::-1]:
if count == 4:
if count == count_threshold:
break
if split.lower() not in temp_share_features and \
split.upper() not in total_currency_list:
if len(split) <= 3:
share_short_name_list.append(split.upper())
count += 1
count += 1
if len(share_short_name_list) > 1:
remove_number = []
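
get_share_short_name_from_text now takes a confirm_text_share flag; when the caller already knows the text is a share name, the reverse scan inspects up to 6 trailing words instead of 4 before stopping (and, as the dedented count += 1 reads, every scanned word counts toward that limit). A simplified, self-contained sketch of the windowing behaviour, with small placeholder sets standing in for temp_share_features and total_currency_list:

temp_share_features_stub = {"acc", "dist", "gross", "inc"}   # placeholder, not the real list
total_currency_list_stub = {"USD", "EUR", "GBP", "CHF"}      # placeholder, not the real list

def share_short_names_sketch(text: str, confirm_text_share: bool = False) -> list:
    count_threshold = 6 if confirm_text_share else 4   # wider window for confirmed share names
    share_short_name_list = []
    count = 0
    for split in text.split()[::-1]:                    # walk the words from the end
        if count == count_threshold:
            break
        if split.lower() not in temp_share_features_stub and \
                split.upper() not in total_currency_list_stub:
            if len(split) <= 3:
                share_short_name_list.append(split.upper())
        count += 1
    return share_short_name_list

name = "Sparinvest Ethical Value QD Hedged EUR I Acc USD"
print(share_short_names_sketch(name))                           # ['I']
print(share_short_names_sketch(name, confirm_text_share=True))  # ['I', 'QD']
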