Optimize investment mapping: share-feature logic

This commit is contained in:
Blade He 2024-10-09 14:07:07 -05:00
parent 04a2409c58
commit 17284c74f0
4 changed files with 13 additions and 10 deletions

View File

@ -41,7 +41,7 @@ class DataExtraction:
else:
self.page_text_dict = page_text_dict
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
self.document_mapping_info_df = query_document_fund_mapping(doc_id)
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
else:
self.document_mapping_info_df = document_mapping_info_df
self.provider_mapping_df = self.get_provider_mapping()

View File

@ -718,7 +718,7 @@ class Metrics:
if incorrect mapping in document mapping:
true 0 pred 1 --- hurt precision
"""
document_mapping_data = query_document_fund_mapping(doc_id)
document_mapping_data = query_document_fund_mapping(doc_id, rerun=False)
if len(document_mapping_data) == 0:
return [1], [1], []
fund_id_list = document_mapping_data["FundId"].unique().tolist()

15
main.py
View File

@ -384,11 +384,12 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
fund_id_list = document_mapping["FundId"].unique().tolist()
sec_id_list = document_mapping["SecId"].unique().tolist()
id_list = fund_id_list + sec_id_list
# filter doc_mapping_data by id_list
filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)]
# filter doc_mapping_data by id_list or empty id
filter_doc_mapping_data = doc_mapping_data[(doc_mapping_data["investment_id"].isin(id_list)) | (doc_mapping_data["investment_id"] == "")]
data_in_mapping_df_list.append(filter_doc_mapping_data)
result_mapping_data_df = pd.concat(data_in_mapping_df_list)
result_mapping_data_df.reset_index(drop=True, inplace=True)
return result_mapping_data_df
@ -600,9 +601,9 @@ def test_data_extraction_metrics():
def test_mapping_raw_name():
doc_id = "382366116"
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
raw_share_name = "EUR I"
doc_id = "337293427"
raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares"
raw_share_name = "Institutional F Shares"
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
data_mapping = DataMapping(
doc_id,
@ -615,7 +616,7 @@ def test_mapping_raw_name():
mapping_info = data_mapping.matching_with_database(
raw_name=raw_name,
raw_share_name=raw_share_name,
parent_id=None,
parent_id="FSGBR0536J",
matching_type="share",
process_cache=process_cache
)
@ -782,7 +783,7 @@ if __name__ == "__main__":
# ]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["394778487"]
# special_doc_id_list = ["382366116"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False

View File

@ -286,7 +286,9 @@ def get_most_similar_name(text: str,
if text_feature is not None and len(text_feature) > 0 and \
copy_name_feature is not None and len(copy_name_feature) > 0:
if text_feature != copy_name_feature:
if copy_name_feature.lower() not in text.lower().split():
if text_feature.lower() not in copy_name.lower().split() and \
copy_name_feature.lower() != "accmulation" and \
copy_name_feature.lower() not in text.lower().split():
continue
if matching_type == "share":
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \