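Optimize share feature judgment logic:

- treat "capitalisation" and "institutional" in the raw name as "accumulation" when the candidate name's feature is accumulation
- treat "distribution" in the raw name as "income" when the candidate name's feature is income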

Document: 337293427
This commit is contained in:
Blade He 2024-12-02 13:11:49 -06:00
parent 352886ade2
commit c146497052
2 changed files with 48 additions and 14 deletions

17
main.py
View File

@ -697,8 +697,10 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "337293427" doc_id = "337293427"
raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares" # KBC Bonds Inflation-Linked Bonds Distribution Shares
raw_share_name = "Institutional F Shares" # KBC Bonds Inflation-Linked Bonds Institutional B Shares
raw_name = "KBC Bonds Inflation-Linked Bonds Institutional B Shares"
raw_share_name = "Institutional B Shares"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -711,7 +713,7 @@ def test_mapping_raw_name():
mapping_info = data_mapping.matching_with_database( mapping_info = data_mapping.matching_with_database(
raw_name=raw_name, raw_name=raw_name,
raw_share_name=raw_share_name, raw_share_name=raw_share_name,
parent_id="FSGBR0536J", parent_id="FSGBR051XK",
matching_type="share", matching_type="share",
process_cache=process_cache process_cache=process_cache
) )
@ -862,6 +864,7 @@ if __name__ == "__main__":
# test_calculate_metrics() # test_calculate_metrics()
# test_replace_abbrevation() # test_replace_abbrevation()
# test_translate_pdf() # test_translate_pdf()
# test_mapping_raw_name()
pdf_folder = r"/data/emea_ar/pdf/" pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -1194,12 +1197,12 @@ if __name__ == "__main__":
"534535767" "534535767"
] ]
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["334584772"] # special_doc_id_list = ["337293427"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = True
calculate_metrics = False calculate_metrics = False
extract_ways = ["text"] extract_ways = ["text"]
@ -1222,4 +1225,4 @@ if __name__ == "__main__":
) )
# test_data_extraction_metrics() # test_data_extraction_metrics()
# test_mapping_raw_name()

View File

@ -268,8 +268,38 @@ def get_most_similar_name(text: str,
except Exception as e: except Exception as e:
print(e) print(e)
compare_text = text
try: try:
similarity = get_jacard_similarity(text, text_split = text.split()
text_split_lower = text.lower().split()
copy_name_split_lower = copy_name.lower().split()
if copy_name_feature == "accumulation" and \
(text_feature is None or len(text_feature) == 0 or
text_feature in ["capitalisation", "institutional"]
or "capitalisation" in text_split_lower or "institutional" in text_split_lower):
if "capitalisation" not in copy_name_split_lower:
compare_text = " ".join([split for split in text_split
if split.lower() not in ["cap", "cap.", "capitalisation"]])
text_split = compare_text.split()
if "institutional" not in copy_name_split_lower:
compare_text = " ".join([split for split in text_split
if split.lower() not in ["inst", "inst.", "institutional"]])
text_split = compare_text.split()
if text_feature is not None and len(text_feature) > 0:
compare_text = " ".join([split for split in text_split
if split.lower() != text_feature])
compare_text += " accumulation"
text_feature = "accumulation"
elif copy_name_feature == "income" and \
(text_feature is None or len(text_feature) == 0 or text_feature == "distribution"):
if "dist" in text_split_lower or "dist." in text_split_lower or "distribution" in text_split_lower:
compare_text = " ".join([split for split in text_split
if split.lower() not in ["dist", "dist.", "distribution"]])
compare_text += " income"
text_feature = "income"
else:
pass
similarity = get_jacard_similarity(compare_text,
copy_name, copy_name,
need_remove_numeric_characters=False) need_remove_numeric_characters=False)
except Exception as e: except Exception as e:
@ -280,7 +310,7 @@ def get_most_similar_name(text: str,
return full_name, similarity return full_name, similarity
copy_name_2 = replace_abbrevation(copy_name) copy_name_2 = replace_abbrevation(copy_name)
if copy_name != copy_name_2: if copy_name != copy_name_2:
similarity_2 = get_jacard_similarity(text, similarity_2 = get_jacard_similarity(compare_text,
copy_name_2, copy_name_2,
need_remove_numeric_characters=False) need_remove_numeric_characters=False)
if similarity_2 > similarity: if similarity_2 > similarity:
@ -296,7 +326,7 @@ def get_most_similar_name(text: str,
if text_feature != copy_name_feature: if text_feature != copy_name_feature:
if text_feature.lower() not in copy_name.lower().split() and \ if text_feature.lower() not in copy_name.lower().split() and \
copy_name_feature.lower() != "accmulation" and \ copy_name_feature.lower() != "accmulation" and \
copy_name_feature.lower() not in text.lower().split(): copy_name_feature.lower() not in compare_text.lower().split():
continue continue
if matching_type == "share": if matching_type == "share":
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \ if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
@ -309,6 +339,7 @@ def get_most_similar_name(text: str,
for compare_short in copy_name_short_name_list: for compare_short in copy_name_short_name_list:
if compare_short not in text_share_short_name_list: if compare_short not in text_share_short_name_list:
# some short word is in fund name, but not belong to share name # some short word is in fund name, but not belong to share name
if compare_short.upper() not in raw_fund_name_split: if compare_short.upper() not in raw_fund_name_split:
short_name_invalid = True short_name_invalid = True
break break