optimize for investment mapping: share feature logic
This commit is contained in:
parent
04a2409c58
commit
17284c74f0
|
|
@ -41,7 +41,7 @@ class DataExtraction:
|
||||||
else:
|
else:
|
||||||
self.page_text_dict = page_text_dict
|
self.page_text_dict = page_text_dict
|
||||||
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
|
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
|
||||||
self.document_mapping_info_df = query_document_fund_mapping(doc_id)
|
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
|
||||||
else:
|
else:
|
||||||
self.document_mapping_info_df = document_mapping_info_df
|
self.document_mapping_info_df = document_mapping_info_df
|
||||||
self.provider_mapping_df = self.get_provider_mapping()
|
self.provider_mapping_df = self.get_provider_mapping()
|
||||||
|
|
|
||||||
|
|
@ -718,7 +718,7 @@ class Metrics:
|
||||||
if incorrect mapping in document mapping:
|
if incorrect mapping in document mapping:
|
||||||
true 0 pred 1 --- hurt precision
|
true 0 pred 1 --- hurt precision
|
||||||
"""
|
"""
|
||||||
document_mapping_data = query_document_fund_mapping(doc_id)
|
document_mapping_data = query_document_fund_mapping(doc_id, rerun=False)
|
||||||
if len(document_mapping_data) == 0:
|
if len(document_mapping_data) == 0:
|
||||||
return [1], [1], []
|
return [1], [1], []
|
||||||
fund_id_list = document_mapping_data["FundId"].unique().tolist()
|
fund_id_list = document_mapping_data["FundId"].unique().tolist()
|
||||||
|
|
|
||||||
15
main.py
15
main.py
|
|
@ -384,11 +384,12 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
|
||||||
fund_id_list = document_mapping["FundId"].unique().tolist()
|
fund_id_list = document_mapping["FundId"].unique().tolist()
|
||||||
sec_id_list = document_mapping["SecId"].unique().tolist()
|
sec_id_list = document_mapping["SecId"].unique().tolist()
|
||||||
id_list = fund_id_list + sec_id_list
|
id_list = fund_id_list + sec_id_list
|
||||||
# filter doc_mapping_data by id_list
|
# keep rows of doc_mapping_data whose investment_id is in id_list, plus rows whose investment_id is an empty string
|
||||||
filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)]
|
filter_doc_mapping_data = doc_mapping_data[(doc_mapping_data["investment_id"].isin(id_list)) | (doc_mapping_data["investment_id"] == "")]
|
||||||
data_in_mapping_df_list.append(filter_doc_mapping_data)
|
data_in_mapping_df_list.append(filter_doc_mapping_data)
|
||||||
result_mapping_data_df = pd.concat(data_in_mapping_df_list)
|
result_mapping_data_df = pd.concat(data_in_mapping_df_list)
|
||||||
result_mapping_data_df.reset_index(drop=True, inplace=True)
|
result_mapping_data_df.reset_index(drop=True, inplace=True)
|
||||||
|
|
||||||
return result_mapping_data_df
|
return result_mapping_data_df
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -600,9 +601,9 @@ def test_data_extraction_metrics():
|
||||||
|
|
||||||
|
|
||||||
def test_mapping_raw_name():
|
def test_mapping_raw_name():
|
||||||
doc_id = "382366116"
|
doc_id = "337293427"
|
||||||
raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
|
raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares"
|
||||||
raw_share_name = "EUR I"
|
raw_share_name = "Institutional F Shares"
|
||||||
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
|
||||||
data_mapping = DataMapping(
|
data_mapping = DataMapping(
|
||||||
doc_id,
|
doc_id,
|
||||||
|
|
@ -615,7 +616,7 @@ def test_mapping_raw_name():
|
||||||
mapping_info = data_mapping.matching_with_database(
|
mapping_info = data_mapping.matching_with_database(
|
||||||
raw_name=raw_name,
|
raw_name=raw_name,
|
||||||
raw_share_name=raw_share_name,
|
raw_share_name=raw_share_name,
|
||||||
parent_id=None,
|
parent_id="FSGBR0536J",
|
||||||
matching_type="share",
|
matching_type="share",
|
||||||
process_cache=process_cache
|
process_cache=process_cache
|
||||||
)
|
)
|
||||||
|
|
@ -782,7 +783,7 @@ if __name__ == "__main__":
|
||||||
# ]
|
# ]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["394778487"]
|
# special_doc_id_list = ["382366116"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
|
|
|
||||||
|
|
@ -286,7 +286,9 @@ def get_most_similar_name(text: str,
|
||||||
if text_feature is not None and len(text_feature) > 0 and \
|
if text_feature is not None and len(text_feature) > 0 and \
|
||||||
copy_name_feature is not None and len(copy_name_feature) > 0:
|
copy_name_feature is not None and len(copy_name_feature) > 0:
|
||||||
if text_feature != copy_name_feature:
|
if text_feature != copy_name_feature:
|
||||||
if copy_name_feature.lower() not in text.lower().split():
|
if text_feature.lower() not in copy_name.lower().split() and \
|
||||||
|
copy_name_feature.lower() != "accmulation" and \
|
||||||
|
copy_name_feature.lower() not in text.lower().split():
|
||||||
continue
|
continue
|
||||||
if matching_type == "share":
|
if matching_type == "share":
|
||||||
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue