Optimize investment mapping: share feature logic
This commit is contained in:
parent
04a2409c58
commit
17284c74f0
|
|
@ -41,7 +41,7 @@ class DataExtraction:
|
|||
else:
|
||||
self.page_text_dict = page_text_dict
|
||||
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
|
||||
self.document_mapping_info_df = query_document_fund_mapping(doc_id)
|
||||
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
|
||||
else:
|
||||
self.document_mapping_info_df = document_mapping_info_df
|
||||
self.provider_mapping_df = self.get_provider_mapping()
|
||||
|
|
|
|||
|
|
@ -718,7 +718,7 @@ class Metrics:
|
|||
if there is an incorrect mapping in the document mapping:
|
||||
true 0, pred 1 --- hurts precision
|
||||
"""
|
||||
document_mapping_data = query_document_fund_mapping(doc_id)
|
||||
document_mapping_data = query_document_fund_mapping(doc_id, rerun=False)
|
||||
if len(document_mapping_data) == 0:
|
||||
return [1], [1], []
|
||||
fund_id_list = document_mapping_data["FundId"].unique().tolist()
|
||||
|
|
|
|||
15
main.py
15
main.py
|
|
@ -384,11 +384,12 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
|
|||
fund_id_list = document_mapping["FundId"].unique().tolist()
|
||||
sec_id_list = document_mapping["SecId"].unique().tolist()
|
||||
id_list = fund_id_list + sec_id_list
|
||||
# filter doc_mapping_data by id_list
|
||||
filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)]
|
||||
# filter doc_mapping_data by id_list or empty id
|
||||
filter_doc_mapping_data = doc_mapping_data[(doc_mapping_data["investment_id"].isin(id_list)) | (doc_mapping_data["investment_id"] == "")]
|
||||
data_in_mapping_df_list.append(filter_doc_mapping_data)
|
||||
result_mapping_data_df = pd.concat(data_in_mapping_df_list)
|
||||
result_mapping_data_df.reset_index(drop=True, inplace=True)
|
||||
|
||||
return result_mapping_data_df
|
||||
|
||||
|
||||
|
|
@ -600,9 +601,9 @@ def test_data_extraction_metrics():
|
|||
|
||||
|
||||
def test_mapping_raw_name():
|
||||
doc_id = "382366116"
|
||||
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
|
||||
raw_share_name = "EUR I"
|
||||
doc_id = "337293427"
|
||||
raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares"
|
||||
raw_share_name = "Institutional F Shares"
|
||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||
data_mapping = DataMapping(
|
||||
doc_id,
|
||||
|
|
@ -615,7 +616,7 @@ def test_mapping_raw_name():
|
|||
mapping_info = data_mapping.matching_with_database(
|
||||
raw_name=raw_name,
|
||||
raw_share_name=raw_share_name,
|
||||
parent_id=None,
|
||||
parent_id="FSGBR0536J",
|
||||
matching_type="share",
|
||||
process_cache=process_cache
|
||||
)
|
||||
|
|
@ -782,7 +783,7 @@ if __name__ == "__main__":
|
|||
# ]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
# special_doc_id_list = ["394778487"]
|
||||
# special_doc_id_list = ["382366116"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
|
|
|
|||
|
|
@ -286,7 +286,9 @@ def get_most_similar_name(text: str,
|
|||
if text_feature is not None and len(text_feature) > 0 and \
|
||||
copy_name_feature is not None and len(copy_name_feature) > 0:
|
||||
if text_feature != copy_name_feature:
|
||||
if copy_name_feature.lower() not in text.lower().split():
|
||||
if text_feature.lower() not in copy_name.lower().split() and \
|
||||
copy_name_feature.lower() != "accmulation" and \
|
||||
copy_name_feature.lower() not in text.lower().split():
|
||||
continue
|
||||
if matching_type == "share":
|
||||
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||
|
|
|
|||
Loading…
Reference in New Issue