Optimize investment mapping: share feature logic

This commit is contained in:
Blade He 2024-10-09 14:07:07 -05:00
parent 04a2409c58
commit 17284c74f0
4 changed files with 13 additions and 10 deletions

View File

@ -41,7 +41,7 @@ class DataExtraction:
else: else:
self.page_text_dict = page_text_dict self.page_text_dict = page_text_dict
if document_mapping_info_df is None or len(document_mapping_info_df) == 0: if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
self.document_mapping_info_df = query_document_fund_mapping(doc_id) self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
else: else:
self.document_mapping_info_df = document_mapping_info_df self.document_mapping_info_df = document_mapping_info_df
self.provider_mapping_df = self.get_provider_mapping() self.provider_mapping_df = self.get_provider_mapping()

View File

@ -718,7 +718,7 @@ class Metrics:
if incorrect mapping in document mapping: if incorrect mapping in document mapping:
true 0 pred 1 --- hurt precision true 0 pred 1 --- hurt precision
""" """
document_mapping_data = query_document_fund_mapping(doc_id) document_mapping_data = query_document_fund_mapping(doc_id, rerun=False)
if len(document_mapping_data) == 0: if len(document_mapping_data) == 0:
return [1], [1], [] return [1], [1], []
fund_id_list = document_mapping_data["FundId"].unique().tolist() fund_id_list = document_mapping_data["FundId"].unique().tolist()

15
main.py
View File

@ -384,11 +384,12 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
fund_id_list = document_mapping["FundId"].unique().tolist() fund_id_list = document_mapping["FundId"].unique().tolist()
sec_id_list = document_mapping["SecId"].unique().tolist() sec_id_list = document_mapping["SecId"].unique().tolist()
id_list = fund_id_list + sec_id_list id_list = fund_id_list + sec_id_list
# filter doc_mapping_data by id_list # filter doc_mapping_data by id_list or empty id
filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)] filter_doc_mapping_data = doc_mapping_data[(doc_mapping_data["investment_id"].isin(id_list)) | (doc_mapping_data["investment_id"] == "")]
data_in_mapping_df_list.append(filter_doc_mapping_data) data_in_mapping_df_list.append(filter_doc_mapping_data)
result_mapping_data_df = pd.concat(data_in_mapping_df_list) result_mapping_data_df = pd.concat(data_in_mapping_df_list)
result_mapping_data_df.reset_index(drop=True, inplace=True) result_mapping_data_df.reset_index(drop=True, inplace=True)
return result_mapping_data_df return result_mapping_data_df
@ -600,9 +601,9 @@ def test_data_extraction_metrics():
def test_mapping_raw_name(): def test_mapping_raw_name():
doc_id = "382366116" doc_id = "337293427"
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I" raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares"
raw_share_name = "EUR I" raw_share_name = "Institutional F Shares"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping( data_mapping = DataMapping(
doc_id, doc_id,
@ -615,7 +616,7 @@ def test_mapping_raw_name():
mapping_info = data_mapping.matching_with_database( mapping_info = data_mapping.matching_with_database(
raw_name=raw_name, raw_name=raw_name,
raw_share_name=raw_share_name, raw_share_name=raw_share_name,
parent_id=None, parent_id="FSGBR0536J",
matching_type="share", matching_type="share",
process_cache=process_cache process_cache=process_cache
) )
@ -782,7 +783,7 @@ if __name__ == "__main__":
# ] # ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["394778487"] # special_doc_id_list = ["382366116"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = False

View File

@ -286,7 +286,9 @@ def get_most_similar_name(text: str,
if text_feature is not None and len(text_feature) > 0 and \ if text_feature is not None and len(text_feature) > 0 and \
copy_name_feature is not None and len(copy_name_feature) > 0: copy_name_feature is not None and len(copy_name_feature) > 0:
if text_feature != copy_name_feature: if text_feature != copy_name_feature:
if copy_name_feature.lower() not in text.lower().split(): if text_feature.lower() not in copy_name.lower().split() and \
copy_name_feature.lower() != "accmulation" and \
copy_name_feature.lower() not in text.lower().split():
continue continue
if matching_type == "share": if matching_type == "share":
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \ if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \