Optimize share feature judgment logic:
treat "accumulation" as matching "capitalisation" and "institutional", and "income" as matching "distribution". Document: 337293427
This commit is contained in:
parent
352886ade2
commit
c146497052
17
main.py
17
main.py
|
|
@ -697,8 +697,10 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "337293427"
|
doc_id = "337293427"
|
||||||
raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares"
|
# KBC Bonds Inflation-Linked Bonds Distribution Shares
|
||||||
raw_share_name = "Institutional F Shares"
|
# KBC Bonds Inflation-Linked Bonds Institutional B Shares
|
||||||
|
raw_name = "KBC Bonds Inflation-Linked Bonds Institutional B Shares"
|
||||||
|
raw_share_name = "Institutional B Shares"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -711,7 +713,7 @@ def test_mapping_raw_name():
|
||||||
mapping_info = data_mapping.matching_with_database(
|
mapping_info = data_mapping.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
raw_share_name=raw_share_name,
|
raw_share_name=raw_share_name,
|
||||||
parent_id="FSGBR0536J",
|
parent_id="FSGBR051XK",
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
)
|
)
|
||||||
|
|
@ -862,6 +864,7 @@ if __name__ == "__main__":
|
||||||
# test_calculate_metrics()
|
# test_calculate_metrics()
|
||||||
# test_replace_abbrevation()
|
# test_replace_abbrevation()
|
||||||
# test_translate_pdf()
|
# test_translate_pdf()
|
||||||
|
# test_mapping_raw_name()
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
|
|
@ -1194,12 +1197,12 @@ if __name__ == "__main__":
|
||||||
"534535767"
|
"534535767"
|
||||||
]
|
]
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["334584772"]
|
# special_doc_id_list = ["337293427"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
@ -1222,4 +1225,4 @@ if __name__ == "__main__":
|
||||||
)
|
)
|
||||||
|
|
||||||
# test_data_extraction_metrics()
|
# test_data_extraction_metrics()
|
||||||
# test_mapping_raw_name()
|
|
||||||
|
|
|
||||||
|
|
@ -268,10 +268,40 @@ def get_most_similar_name(text: str,
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
|
compare_text = text
|
||||||
try:
|
try:
|
||||||
similarity = get_jacard_similarity(text,
|
text_split = text.split()
|
||||||
copy_name,
|
text_split_lower = text.lower().split()
|
||||||
need_remove_numeric_characters=False)
|
copy_name_split_lower = copy_name.lower().split()
|
||||||
|
if copy_name_feature == "accumulation" and \
|
||||||
|
(text_feature is None or len(text_feature) == 0 or
|
||||||
|
text_feature in ["capitalisation", "institutional"]
|
||||||
|
or "capitalisation" in text_split_lower or "institutional" in text_split_lower):
|
||||||
|
if "capitalisation" not in copy_name_split_lower:
|
||||||
|
compare_text = " ".join([split for split in text_split
|
||||||
|
if split.lower() not in ["cap", "cap.", "capitalisation"]])
|
||||||
|
text_split = compare_text.split()
|
||||||
|
if "institutional" not in copy_name_split_lower:
|
||||||
|
compare_text = " ".join([split for split in text_split
|
||||||
|
if split.lower() not in ["inst", "inst.", "institutional"]])
|
||||||
|
text_split = compare_text.split()
|
||||||
|
if text_feature is not None and len(text_feature) > 0:
|
||||||
|
compare_text = " ".join([split for split in text_split
|
||||||
|
if split.lower() != text_feature])
|
||||||
|
compare_text += " accumulation"
|
||||||
|
text_feature = "accumulation"
|
||||||
|
elif copy_name_feature == "income" and \
|
||||||
|
(text_feature is None or len(text_feature) == 0 or text_feature == "distribution"):
|
||||||
|
if "dist" in text_split_lower or "dist." in text_split_lower or "distribution" in text_split_lower:
|
||||||
|
compare_text = " ".join([split for split in text_split
|
||||||
|
if split.lower() not in ["dist", "dist.", "distribution"]])
|
||||||
|
compare_text += " income"
|
||||||
|
text_feature = "income"
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
similarity = get_jacard_similarity(compare_text,
|
||||||
|
copy_name,
|
||||||
|
need_remove_numeric_characters=False)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
print_exc()
|
print_exc()
|
||||||
|
|
@ -280,9 +310,9 @@ def get_most_similar_name(text: str,
|
||||||
return full_name, similarity
|
return full_name, similarity
|
||||||
copy_name_2 = replace_abbrevation(copy_name)
|
copy_name_2 = replace_abbrevation(copy_name)
|
||||||
if copy_name != copy_name_2:
|
if copy_name != copy_name_2:
|
||||||
similarity_2 = get_jacard_similarity(text,
|
similarity_2 = get_jacard_similarity(compare_text,
|
||||||
copy_name_2,
|
copy_name_2,
|
||||||
need_remove_numeric_characters=False)
|
need_remove_numeric_characters=False)
|
||||||
if similarity_2 > similarity:
|
if similarity_2 > similarity:
|
||||||
similarity = similarity_2
|
similarity = similarity_2
|
||||||
if similarity > max_similarity:
|
if similarity > max_similarity:
|
||||||
|
|
@ -296,7 +326,7 @@ def get_most_similar_name(text: str,
|
||||||
if text_feature != copy_name_feature:
|
if text_feature != copy_name_feature:
|
||||||
if text_feature.lower() not in copy_name.lower().split() and \
|
if text_feature.lower() not in copy_name.lower().split() and \
|
||||||
copy_name_feature.lower() != "accmulation" and \
|
copy_name_feature.lower() != "accmulation" and \
|
||||||
copy_name_feature.lower() not in text.lower().split():
|
copy_name_feature.lower() not in compare_text.lower().split():
|
||||||
continue
|
continue
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||||
|
|
@ -309,6 +339,7 @@ def get_most_similar_name(text: str,
|
||||||
for compare_short in copy_name_short_name_list:
|
for compare_short in copy_name_short_name_list:
|
||||||
if compare_short not in text_share_short_name_list:
|
if compare_short not in text_share_short_name_list:
|
||||||
# some short word is in fund name, but not belong to share name
|
# some short word is in fund name, but not belong to share name
|
||||||
|
|
||||||
if compare_short.upper() not in raw_fund_name_split:
|
if compare_short.upper() not in raw_fund_name_split:
|
||||||
short_name_invalid = True
|
short_name_invalid = True
|
||||||
break
|
break
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue