From 17284c74f0f2dec94ecbbd852b0845ef270fe28e Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 9 Oct 2024 14:07:07 -0500 Subject: [PATCH] optimize for investment mapping: share feature logic --- core/data_extraction.py | 2 +- core/metrics.py | 2 +- main.py | 15 ++++++++------- utils/biz_utils.py | 4 +++- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index d9990bf..2d002a1 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -41,7 +41,7 @@ class DataExtraction: else: self.page_text_dict = page_text_dict if document_mapping_info_df is None or len(document_mapping_info_df) == 0: - self.document_mapping_info_df = query_document_fund_mapping(doc_id) + self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) else: self.document_mapping_info_df = document_mapping_info_df self.provider_mapping_df = self.get_provider_mapping() diff --git a/core/metrics.py b/core/metrics.py index 43707d0..03a113a 100644 --- a/core/metrics.py +++ b/core/metrics.py @@ -718,7 +718,7 @@ class Metrics: if incorrect mapping in document mapping: true 0 pred 1 --- hurt precision """ - document_mapping_data = query_document_fund_mapping(doc_id) + document_mapping_data = query_document_fund_mapping(doc_id, rerun=False) if len(document_mapping_data) == 0: return [1], [1], [] fund_id_list = document_mapping_data["FundId"].unique().tolist() diff --git a/main.py b/main.py index f50f738..b9c0a49 100644 --- a/main.py +++ b/main.py @@ -384,11 +384,12 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None: fund_id_list = document_mapping["FundId"].unique().tolist() sec_id_list = document_mapping["SecId"].unique().tolist() id_list = fund_id_list + sec_id_list - # filter doc_mapping_data by id_list - filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)] + # filter doc_mapping_data by id_list or empty id + filter_doc_mapping_data = doc_mapping_data[(doc_mapping_data["investment_id"].isin(id_list)) | (doc_mapping_data["investment_id"] == "")] data_in_mapping_df_list.append(filter_doc_mapping_data) result_mapping_data_df = pd.concat(data_in_mapping_df_list) result_mapping_data_df.reset_index(drop=True, inplace=True) + return result_mapping_data_df @@ -600,9 +601,9 @@ def test_data_extraction_metrics(): def test_mapping_raw_name(): - doc_id = "382366116" - raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I" - raw_share_name = "EUR I" + doc_id = "337293427" + raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares" + raw_share_name = "Institutional F Shares" output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/" data_mapping = DataMapping( doc_id, @@ -615,7 +616,7 @@ def test_mapping_raw_name(): mapping_info = data_mapping.matching_with_database( raw_name=raw_name, raw_share_name=raw_share_name, - parent_id=None, + parent_id="FSGBR0536J", matching_type="share", process_cache=process_cache ) @@ -782,7 +783,7 @@ if __name__ == "__main__": # ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["394778487"] + # special_doc_id_list = ["382366116"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index f9ef772..587520c 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -286,7 +286,9 @@ def get_most_similar_name(text: str, if text_feature is not None and len(text_feature) > 0 and \ copy_name_feature is not None and len(copy_name_feature) > 0: if text_feature != copy_name_feature: - if copy_name_feature.lower() not in text.lower().split(): + if text_feature.lower() not in copy_name.lower().split() and \ + copy_name_feature.lower() != "accmulation" and \ + copy_name_feature.lower() not in text.lower().split(): continue if matching_type == "share": if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \