optimize for investment mapping: share feature logic

2024-10-09 14:07:07 -05:00 · 2024-10-09 14:07:07 -05:00 · 17284c74f0
parent 04a2409c58
commit 17284c74f0
4 changed files with 13 additions and 10 deletions
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -41,7 +41,7 @@ class DataExtraction:
        else:
            self.page_text_dict = page_text_dict
        if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(doc_id)
+            self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
        else:
            self.document_mapping_info_df = document_mapping_info_df
        self.provider_mapping_df = self.get_provider_mapping()
--- a/core/metrics.py
+++ b/core/metrics.py
@@ -718,7 +718,7 @@ class Metrics:
                                if incorrect mapping in document mapping:
                                true 0 pred 1 --- hurt precision
        """
-        document_mapping_data = query_document_fund_mapping(doc_id)
+        document_mapping_data = query_document_fund_mapping(doc_id, rerun=False)
        if len(document_mapping_data) == 0:
            return [1], [1], []
        fund_id_list = document_mapping_data["FundId"].unique().tolist()
--- a/main.py
+++ b/main.py
@@ -384,11 +384,12 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
        fund_id_list = document_mapping["FundId"].unique().tolist()
        sec_id_list = document_mapping["SecId"].unique().tolist()
        id_list = fund_id_list + sec_id_list
-        # filter doc_mapping_data by id_list
+        # filter doc_mapping_data by id_list or empty id
-        filter_doc_mapping_data = doc_mapping_data[doc_mapping_data["investment_id"].isin(id_list)]
+        filter_doc_mapping_data = doc_mapping_data[(doc_mapping_data["investment_id"].isin(id_list)) | (doc_mapping_data["investment_id"] == "")]
        data_in_mapping_df_list.append(filter_doc_mapping_data)
    result_mapping_data_df = pd.concat(data_in_mapping_df_list)
    result_mapping_data_df.reset_index(drop=True, inplace=True)
    return result_mapping_data_df
@@ -600,9 +601,9 @@ def test_data_extraction_metrics():
 def test_mapping_raw_name():
-    doc_id = "382366116"
+    doc_id = "337293427"
-    raw_name = "SPARINVEST SICAV - ETHICAL EMERGING MARKETS VALUE EUR I"
+    raw_name = "KBC BONDS CAPITAL FUND Institutional F Shares"
-    raw_share_name = "EUR I"
+    raw_share_name = "Institutional F Shares"
    output_folder = r"/data/emea_ar/output/mapping_data/docs/by_text/"
    data_mapping = DataMapping(
        doc_id,
@@ -615,7 +616,7 @@ def test_mapping_raw_name():
    mapping_info = data_mapping.matching_with_database(
        raw_name=raw_name,
        raw_share_name=raw_share_name,
-        parent_id=None, 
+        parent_id="FSGBR0536J", 
        matching_type="share",
        process_cache=process_cache
    )
@@ -782,7 +783,7 @@ if __name__ == "__main__":
    # ]
    # special_doc_id_list = check_mapping_doc_id_list
    special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["394778487"]
+    # special_doc_id_list = ["382366116"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -286,7 +286,9 @@ def get_most_similar_name(text: str,
                    if text_feature is not None and len(text_feature) > 0 and \
                        copy_name_feature is not None and len(copy_name_feature) > 0:
                        if text_feature != copy_name_feature:
-                            if copy_name_feature.lower() not in text.lower().split():
+                            if text_feature.lower() not in copy_name.lower().split() and \
                                copy_name_feature.lower() != "accmulation" and \
                                    copy_name_feature.lower() not in text.lower().split():
                                continue
                    if matching_type == "share":
                        if text_share_short_name_list is not None and len(text_share_short_name_list) > 0 and \