Merge branch 'aus_prospectus_ravi'

2025-01-27 12:32:42 -06:00 · 2025-01-27 12:32:42 -06:00 · 6f831e241c
parent 41f8c307ff 47c41e492f
commit 6f831e241c
4 changed files with 1293 additions and 116 deletions
--- a/app_emea_ar.py
+++ b/app_emea_ar.py
@ -69,7 +69,8 @@ def emea_ar_data_extract():
                                          output_extract_data_folder=output_extract_data_folder,
                                          output_mapping_data_folder=output_mapping_data_folder,
                                          extract_way=extract_way,
-                                          drilldown_folder=drilldown_folder)
+                                          drilldown_folder=drilldown_folder,
+                                          compare_with_provider=False)
        doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
--- a/configuration/emea_ar/abbreviation_records.json
+++ b/configuration/emea_ar/abbreviation_records.json
--- a/core/auz_nz/hybrid_solution_script.py
+++ b/core/auz_nz/hybrid_solution_script.py
@ -32,24 +32,24 @@ from openai import AzureOpenAI

 ABB_JSON = dict()

-def get_abb_json():
+def get_abb_json(doc_source: str = "aus_prospectus"):
    global ABB_JSON
    if len(ABB_JSON.keys()) == 0:
-        with open("./configuration/aus_prospectus/abbreviation_records.json", "r") as file:
+        with open(f"./configuration/{doc_source}/abbreviation_records.json", "r") as file:
            # Load the JSON and convert keys to lowercase
            ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}

-def get_abbre_format_str(fundname):
+def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
    """Replaces abbreviations in a fund name with their expanded forms."""
    # Convert fund name to lowercase while matching
    f_list = fundname.lower().split()
-    get_abb_json()
+    get_abb_json(doc_source)
    updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
    return " ".join(updated_doc_fname_words)

-def replace_abbrevs_in_fundnames(fund_names_list):
+def replace_abbrevs_in_fundnames(fund_names_list, doc_source: str = "aus_prospectus"):
    """Replaces abbreviations in a list of fund names."""
-    return [get_abbre_format_str(fund_name) for fund_name in fund_names_list]
+    return [get_abbre_format_str(fund_name, doc_source) for fund_name in fund_names_list]


 ### STEP 2 - Remove Stopwords
@ -440,7 +440,7 @@ def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name,
    return dt


-def final_function_to_match(doc_id, pred_list, db_list, provider_name):
+def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_source: str = "aus_prospectus"):
    final_result = {}
    df_data = []
    unmatched_pred_list = pred_list.copy()
@ -466,8 +466,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                # unmatched_pred_list.remove(pred_list[index])
            else:
                ### STEP-1 Abbreviation replacement
-                cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund])[0]
-                cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list)
+                cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund], doc_source)[0]
+                cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list, doc_source)
                # print("--> ",cleaned_db_list1, cleaned_pred_name1)
                step1_result, matched_index, all_scores1_, all_matched_fund_names1_ = get_fund_match_final_score(cleaned_db_list1, cleaned_pred_name1)
                # print(f"\nStep 1 - Abbreviation Replacement Result: {step1_result}")
@ -617,11 +617,11 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
    # print("==>>> DB LIST: ",unmatched_db_list)
    # print("==>>> PRED LIST: ",unmatched_pred_list)
    if len(unmatched_pred_list)!=0:
-        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list)
+        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source)
        cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list)
        cleaned_unmatched_pred_list = remove_special_characters(cleaned_unmatched_pred_list)
        
-        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list)
+        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list, doc_source)
        cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list)
        cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list)
        prompt_context = f"""
--- a/core/data_mapping.py
+++ b/core/data_mapping.py
@ -1,6 +1,7 @@
 import os
 import json
 import pandas as pd
+from copy import deepcopy
 from utils.biz_utils import get_most_similar_name, remove_common_word
 from utils.sql_query_util import (
    query_document_fund_mapping,
@ -18,14 +19,18 @@ class DataMapping:
        raw_document_data_list: list,
        document_mapping_info_df: pd.DataFrame,
        output_data_folder: str,
-        doc_source: str = "emea_ar"
+        doc_source: str = "emea_ar",
+        compare_with_provider: bool = True
    ):
        self.doc_id = doc_id
        self.datapoints = datapoints
        self.doc_source = doc_source
+        self.compare_with_provider = compare_with_provider
        self.raw_document_data_list = raw_document_data_list
        if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
+            self.document_mapping_info_df = query_document_fund_mapping(
+                doc_id, rerun=False
+            )
        else:
            self.document_mapping_info_df = document_mapping_info_df

@ -44,7 +49,9 @@ class DataMapping:
    def set_mapping_data_by_db(self, document_mapping_info_df: pd.DataFrame):
        logger.info("Setting document mapping data")
        if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(self.doc_id, rerun=False)
+            self.document_mapping_info_df = query_document_fund_mapping(
+                self.doc_id, rerun=False
+            )
        else:
            self.document_mapping_info_df = document_mapping_info_df
        if len(self.document_mapping_info_df) == 0:
@ -92,26 +99,27 @@ class DataMapping:
    def get_provider_mapping(self):
        if len(self.document_mapping_info_df) == 0:
            return pd.DataFrame()
-        provider_id_list = (
-            self.document_mapping_info_df["ProviderId"].unique().tolist()
-        )
+        provider_id_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
        provider_mapping_list = []
        for provider_id in provider_id_list:
-            provider_mapping_list.append(query_investment_by_provider(provider_id, rerun=False))
+            provider_mapping_list.append(
+                query_investment_by_provider(provider_id, rerun=False)
+            )
        provider_mapping_df = pd.concat(provider_mapping_list)
        provider_mapping_df = provider_mapping_df.drop_duplicates()
        provider_mapping_df.reset_index(drop=True, inplace=True)
        return provider_mapping_df
-    
+
    def mapping_raw_data_entrance(self):
-        if self.doc_source == "emear_ar":
+        if self.doc_source == "emea_ar":
            return self.mapping_raw_data()
        elif self.doc_source == "aus_prospectus":
-            return self.mapping_raw_data_aus()
+            return self.mapping_raw_data_generic()
        else:
            return self.mapping_raw_data()
-        
-    def mapping_raw_data_aus(self):
+        # return self.mapping_raw_data_generic()
+
+    def mapping_raw_data_generic(self):
        logger.info(f"Mapping raw data for AUS Prospectus document {self.doc_id}")
        mapped_data_list = []
        # Generate raw name based on fund name and share name by integrate_share_name
@ -128,7 +136,9 @@ class DataMapping:
                raw_share_name = raw_data.get("share_name", "")
                raw_data_keys = list(raw_data.keys())
                if len(raw_share_name) > 0:
-                    integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                    integrated_share_name = self.integrate_share_name(
+                        raw_fund_name, raw_share_name
+                    )
                    if integrated_share_name not in share_raw_name_list:
                        share_raw_name_list.append(integrated_share_name)
                    for datapoint in self.datapoints:
@ -144,7 +154,7 @@ class DataMapping:
                                "investment_type": 1,
                                "investment_id": "",
                                "investment_name": "",
-                                "similarity": 0
+                                "similarity": 0,
                            }
                            mapped_data_list.append(mapped_data)
                else:
@ -162,29 +172,38 @@ class DataMapping:
                                "value": raw_data[datapoint],
                                "investment_type": 33,
                                "investment_id": "",
-                                "investment_name": ""
+                                "investment_name": "",
                            }
                            mapped_data_list.append(mapped_data)
        # Mapping raw data with database
-        iter_count = 30
+        iter_count = 60
        fund_match_result = {}
        if len(fund_raw_name_list) > 0:
-            fund_match_result = self.get_raw_name_db_match_result(fund_raw_name_list, "fund", iter_count)
-            logger.info(f"Fund match result: \n{fund_match_result}")
+            fund_match_result = self.get_raw_name_db_match_result(
+                fund_raw_name_list, "fund", iter_count
+            )
+            # logger.info(f"Fund match result: \n{fund_match_result}")
        share_match_result = {}
        if len(share_raw_name_list) > 0:
-            share_match_result = self.get_raw_name_db_match_result(share_raw_name_list, "share", iter_count)
-            logger.info(f"Share match result: \n{share_match_result}")
-        
+            share_match_result = self.get_raw_name_db_match_result(
+                share_raw_name_list, "share", iter_count
+            )
+            # logger.info(f"Share match result: \n{share_match_result}")
+
        for mapped_data in mapped_data_list:
            investment_type = mapped_data["investment_type"]
            raw_name = mapped_data["raw_name"]
            if investment_type == 33:
                if fund_match_result.get(raw_name) is not None:
                    matched_db_fund_name = fund_match_result[raw_name]
-                    if matched_db_fund_name is not None and len(matched_db_fund_name) > 0:
+                    if (
+                        matched_db_fund_name is not None
+                        and len(matched_db_fund_name) > 0
+                    ):
                        # get FundId from self.doc_fund_mapping
-                        find_fund_df = self.doc_fund_mapping[self.doc_fund_mapping["FundName"] == matched_db_fund_name]
+                        find_fund_df = self.doc_fund_mapping[
+                            self.doc_fund_mapping["FundName"] == matched_db_fund_name
+                        ]
                        if find_fund_df is not None and len(find_fund_df) > 0:
                            fund_id = find_fund_df["FundId"].values[0]
                            mapped_data["investment_id"] = fund_id
@ -193,38 +212,82 @@ class DataMapping:
            if investment_type == 1:
                if share_match_result.get(raw_name) is not None:
                    matched_db_share_name = share_match_result[raw_name]
-                    if matched_db_share_name is not None and len(matched_db_share_name) > 0:
+                    if (
+                        matched_db_share_name is not None
+                        and len(matched_db_share_name) > 0
+                    ):
                        # get SecId from self.doc_fund_class_mapping
-                        find_share_df = self.doc_fund_class_mapping[self.doc_fund_class_mapping["ShareClassName"] == matched_db_share_name]
+                        find_share_df = self.doc_fund_class_mapping[
+                            self.doc_fund_class_mapping["ShareClassName"]
+                            == matched_db_share_name
+                        ]
                        if find_share_df is not None and len(find_share_df) > 0:
                            share_id = find_share_df["SecId"].values[0]
                            mapped_data["investment_id"] = share_id
                            mapped_data["investment_name"] = matched_db_share_name
                            mapped_data["similarity"] = 1
-                        
+
        self.output_mapping_file(mapped_data_list)
        return mapped_data_list
-        
-    def get_raw_name_db_match_result(self, raw_name_list, investment_type: str, iter_count: int = 30):
+
+    def get_raw_name_db_match_result(
+        self, raw_name_list, investment_type: str, iter_count: int = 30
+    ):
        # Split raw_name_list into chunks of at most iter_count elements each.
        # The reason to split is to avoid hitting ChatGPT token limits.
-        raw_name_list_parts = [raw_name_list[i:i + iter_count] 
-                               for i in range(0, len(raw_name_list), iter_count)]
+        raw_name_list_parts = [
+            raw_name_list[i : i + iter_count]
+            for i in range(0, len(raw_name_list), iter_count)
+        ]
        all_match_result = {}
+        doc_fund_name_list = deepcopy(self.doc_fund_name_list)
+        doc_share_name_list = deepcopy(self.doc_share_name_list)
        for raw_name_list in raw_name_list_parts:
            if investment_type == "fund":
-                match_result = final_function_to_match(doc_id=self.doc_id,
-                                                       pred_list=raw_name_list,
-                                                       db_list=self.doc_fund_name_list,
-                                                       provider_name=self.provider_name)
+                match_result, doc_fund_name_list = self.get_final_function_to_match(
+                    raw_name_list, doc_fund_name_list
+                )
            else:
-                match_result = final_function_to_match(doc_id=self.doc_id,
-                                                       pred_list=raw_name_list,
-                                                       db_list=self.doc_share_name_list,
-                                                       provider_name=self.provider_name)
+                match_result, doc_share_name_list = self.get_final_function_to_match(
+                    raw_name_list, doc_share_name_list
+                )
            all_match_result.update(match_result)
        return all_match_result
+
+    def get_final_function_to_match(self, raw_name_list, db_name_list):
+        if len(db_name_list) == 0:
+            match_result = {}
+            for raw_name in raw_name_list:
+                match_result[raw_name] = ""
+        else:
+            match_result = final_function_to_match(
+                doc_id=self.doc_id,
+                pred_list=raw_name_list,
+                db_list=db_name_list,
+                provider_name=self.provider_name,
+                doc_source=self.doc_source
+            )
+            matched_name_list = list(match_result.values())
+            db_name_list = self.remove_matched_names(db_name_list, matched_name_list)
+        return match_result, db_name_list
+
+    def remove_matched_names(self, target_name_list: list, matched_name_list: list):
+        if len(matched_name_list) == 0:
+            return target_name_list
        
+        matched_name_list = list(set(matched_name_list))
+        matched_name_list = [
+            value for value in matched_name_list if value is not None and len(value) > 0
+        ]
+        for matched_name in matched_name_list:
+            if (
+                matched_name is not None
+                and len(matched_name) > 0
+                and matched_name in target_name_list
+            ):
+                target_name_list.remove(matched_name)
+        return target_name_list
+
    def mapping_raw_data(self):
        """
        doc_id, page_index, datapoint, value,
@ -245,9 +308,14 @@ class DataMapping:
                if raw_fund_name is None or len(raw_fund_name) == 0:
                    continue
                raw_share_name = raw_data.get("share_name", "")
-                if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
+                if (
+                    len(self.doc_fund_name_list) == 0
+                    and len(self.provider_fund_name_list) == 0
+                ):
                    if len(raw_share_name) > 0:
-                        integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                        integrated_share_name = self.integrate_share_name(
+                            raw_fund_name, raw_share_name
+                        )
                        raw_data_keys = list(raw_data.keys())
                        for datapoint in self.datapoints:
                            if datapoint in raw_data_keys:
@ -262,7 +330,7 @@ class DataMapping:
                                    "investment_type": 1,
                                    "investment_id": "",
                                    "investment_name": "",
-                                    "similarity": 0
+                                    "similarity": 0,
                                }
                                mapped_data_list.append(mapped_data)
                    else:
@ -279,13 +347,15 @@ class DataMapping:
                                    "value": raw_data[datapoint],
                                    "investment_type": 33,
                                    "investment_id": "",
-                                    "investment_name": ""
+                                    "investment_name": "",
                                }
                                mapped_data_list.append(mapped_data)
                else:
                    raw_name = ""
                    if raw_share_name is not None and len(raw_share_name) > 0:
-                        raw_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                        raw_name = self.integrate_share_name(
+                            raw_fund_name, raw_share_name
+                        )
                        if mapped_share_cache.get(raw_name) is not None:
                            investment_info = mapped_share_cache[raw_name]
                        else:
@ -298,14 +368,20 @@ class DataMapping:
                                )
                                fund_id = fund_info["id"]
                                mapped_fund_cache[raw_fund_name] = fund_info
-                            investment_info = self.matching_with_database(
-                                raw_name=raw_name, 
-                                raw_share_name=raw_share_name, 
-                                raw_fund_name=raw_fund_name,
-                                parent_id=fund_id, 
-                                matching_type="share",
-                                process_cache=process_cache
-                            )
+                            investment_info = {}
+                            if len(fund_id) > 0:
+                                investment_info = self.mapping_unique_raw_data(fund_id=fund_id,
+                                                                               raw_fund_name=raw_fund_name,
+                                                                               raw_data_list=raw_data_list)
+                            if investment_info.get("id", None) is None or len(investment_info.get("id", "")) == 0:
+                                investment_info = self.matching_with_database(
+                                    raw_name=raw_name,
+                                    raw_share_name=raw_share_name,
+                                    raw_fund_name=raw_fund_name,
+                                    parent_id=fund_id,
+                                    matching_type="share",
+                                    process_cache=process_cache,
+                                )
                            mapped_share_cache[raw_name] = investment_info
                    elif raw_fund_name is not None and len(raw_fund_name) > 0:
                        raw_name = raw_fund_name
@ -322,7 +398,7 @@ class DataMapping:
                            "id": "",
                            "legal_name": "",
                            "investment_type": -1,
-                            "similarity": 0
+                            "similarity": 0,
                        }

                    raw_data_keys = list(raw_data.keys())
@ -339,13 +415,35 @@ class DataMapping:
                                "investment_type": investment_info["investment_type"],
                                "investment_id": investment_info["id"],
                                "investment_name": investment_info["legal_name"],
-                                "similarity": investment_info["similarity"]
+                                "similarity": investment_info["similarity"],
                            }
                            mapped_data_list.append(mapped_data)
-        
+
        self.output_mapping_file(mapped_data_list)
        return mapped_data_list
    
+    def mapping_unique_raw_data(self, fund_id: str, raw_fund_name: str, raw_data_list: list):
+        share_count = 0
+        for raw_data in raw_data_list:
+            fund_name = raw_data.get("fund_name", "")
+            share_name = raw_data.get("share_name", "")
+            if fund_name == raw_fund_name and  share_name is not None and len(share_name) > 0:
+                share_count += 1
+                if share_count > 1:
+                    break
+        data_info = {}
+        if share_count == 1:
+            doc_compare_mapping = self.doc_fund_class_mapping[
+                    self.doc_fund_class_mapping["FundId"] == fund_id
+                ]
+            if len(doc_compare_mapping) == 1:
+                data_info["id"] = doc_compare_mapping["SecId"].values[0]
+                data_info["legal_name"] = doc_compare_mapping["ShareClassName"].values[0]
+                data_info["investment_type"] = 1
+                data_info["similarity"] = 1
+        return data_info
+        
+
    def output_mapping_file(self, mapped_data_list: list):
        json_data_file = os.path.join(
            self.output_data_json_folder, f"{self.doc_id}.json"
@ -355,10 +453,10 @@ class DataMapping:

        extract_data_df = pd.DataFrame(self.raw_document_data_list)
        extract_data_df.reset_index(drop=True, inplace=True)
-        
+
        mapping_data_df = pd.DataFrame(mapped_data_list)
        mapping_data_df.reset_index(drop=True, inplace=True)
-        
+
        excel_data_file = os.path.join(
            self.output_data_excel_folder, f"{self.doc_id}.xlsx"
        )
@ -373,7 +471,7 @@ class DataMapping:
        raw_name = ""
        if raw_share_name is not None and len(raw_share_name) > 0:
            raw_name = raw_share_name
-            # some share names are very short, 
+            # some share names are very short,
            # so we need to combine with fund name
            raw_name_splits = raw_name.split()
            raw_fund_name_splits = raw_fund_name.split()
@ -384,13 +482,13 @@ class DataMapping:
        return raw_name

    def matching_with_database(
-        self, 
-        raw_name: str, 
-        raw_share_name: str = None, 
+        self,
+        raw_name: str,
+        raw_share_name: str = None,
        raw_fund_name: str = None,
-        parent_id: str = None, 
+        parent_id: str = None,
        matching_type: str = "fund",
-        process_cache: dict = {}
+        process_cache: dict = {},
    ):
        if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
            data_info["id"] = ""
@ -402,7 +500,7 @@ class DataMapping:
            data_info["investment_type"] = investment_type
            data_info["similarity"] = 0
            return data_info
-        
+
        if matching_type == "fund":
            doc_compare_name_list = self.doc_fund_name_list
            doc_compare_mapping = self.doc_fund_mapping
@ -417,8 +515,9 @@ class DataMapping:
                doc_compare_mapping = self.doc_fund_class_mapping[
                    self.doc_fund_class_mapping["FundId"] == parent_id
                ]
-                provider_compare_mapping = self.provider_fund_class_mapping\
-                        [self.provider_fund_class_mapping["FundId"] == parent_id]
+                provider_compare_mapping = self.provider_fund_class_mapping[
+                    self.provider_fund_class_mapping["FundId"] == parent_id
+                ]
                if len(doc_compare_mapping) == 0:
                    if len(provider_compare_mapping) == 0:
                        doc_compare_name_list = self.doc_share_name_list
@ -435,9 +534,10 @@ class DataMapping:
                    doc_compare_name_list = (
                        doc_compare_mapping["ShareClassName"].unique().tolist()
                    )
-                    
-                if len(provider_compare_mapping) == 0 or \
-                    len(provider_compare_mapping) < len(doc_compare_mapping):
+
+                if len(provider_compare_mapping) == 0 or len(
+                    provider_compare_mapping
+                ) < len(doc_compare_mapping):
                    provider_compare_name_list = doc_compare_name_list
                    provider_compare_mapping = doc_compare_mapping
                else:
@ -459,58 +559,68 @@ class DataMapping:
            if doc_compare_name_list is not None and len(doc_compare_name_list) > 0:
                _, pre_common_word_list = remove_common_word(doc_compare_name_list)
                max_similarity_name, max_similarity = get_most_similar_name(
-                    raw_name, 
-                    doc_compare_name_list, 
-                    share_name=raw_share_name, 
+                    raw_name,
+                    doc_compare_name_list,
+                    share_name=raw_share_name,
                    fund_name=raw_fund_name,
                    matching_type=matching_type,
-                    process_cache=process_cache)
+                    process_cache=process_cache,
+                )
                if matching_type == "fund":
                    threshold = 0.7
                else:
-                    threshold = 0.9
+                    if self.compare_with_provider:
+                        threshold = 0.9
+                    else:
+                        threshold = 0.6
                if max_similarity is not None and max_similarity >= threshold:
                    data_info["id"] = doc_compare_mapping[
                        doc_compare_mapping[compare_name_dp] == max_similarity_name
                    ][compare_id_dp].values[0]
                    data_info["legal_name"] = max_similarity_name
                    data_info["similarity"] = max_similarity
-                    
+
            if data_info.get("id", None) is None or len(data_info.get("id", "")) == 0:
                # Set pre_common_word_list. Reason: the document mapping for the same fund may differ from the provider mapping.
                # The purpose is to reuse the most common word list to improve the similarity.
-                max_similarity_name, max_similarity = get_most_similar_name(
-                    raw_name, 
-                    provider_compare_name_list, 
-                    share_name=raw_share_name,
-                    fund_name=raw_fund_name,
-                    matching_type=matching_type, 
-                    pre_common_word_list=pre_common_word_list,
-                    process_cache=process_cache
-                )
-                threshold = 0.7
-                if matching_type == "share":
-                    threshold = 0.5
-                round_similarity = 0
-                if max_similarity is not None and isinstance(max_similarity, float):
-                    round_similarity = round(max_similarity, 1)
-                if round_similarity is not None and round_similarity >= threshold:
-                    data_info["id"] = provider_compare_mapping[
-                        provider_compare_mapping[compare_name_dp] == max_similarity_name
-                    ][compare_id_dp].values[0]
-                    data_info["legal_name"] = max_similarity_name
-                    data_info["similarity"] = max_similarity
-                else:
-                    if len(doc_compare_name_list) == 1:
-                        data_info["id"] = doc_compare_mapping[
-                            doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0]
+                if self.compare_with_provider:
+                    max_similarity_name, max_similarity = get_most_similar_name(
+                        raw_name,
+                        provider_compare_name_list,
+                        share_name=raw_share_name,
+                        fund_name=raw_fund_name,
+                        matching_type=matching_type,
+                        pre_common_word_list=pre_common_word_list,
+                        process_cache=process_cache,
+                    )
+                    threshold = 0.7
+                    if matching_type == "share":
+                        threshold = 0.5
+                    round_similarity = 0
+                    if max_similarity is not None and isinstance(max_similarity, float):
+                        round_similarity = round(max_similarity, 1)
+                    if round_similarity is not None and round_similarity >= threshold:
+                        data_info["id"] = provider_compare_mapping[
+                            provider_compare_mapping[compare_name_dp] == max_similarity_name
                        ][compare_id_dp].values[0]
-                        data_info["legal_name"] = doc_compare_name_list[0]
-                        data_info["similarity"] = 1
+                        data_info["legal_name"] = max_similarity_name
+                        data_info["similarity"] = max_similarity
                    else:
-                        data_info["id"] = ""
-                        data_info["legal_name"] = ""
-                        data_info["similarity"] = 0
+                        if len(doc_compare_name_list) == 1:
+                            data_info["id"] = doc_compare_mapping[
+                                doc_compare_mapping[compare_name_dp]
+                                == doc_compare_name_list[0]
+                            ][compare_id_dp].values[0]
+                            data_info["legal_name"] = doc_compare_name_list[0]
+                            data_info["similarity"] = 1
+                        else:
+                            data_info["id"] = ""
+                            data_info["legal_name"] = ""
+                            data_info["similarity"] = 0
+                else:
+                    data_info["id"] = ""
+                    data_info["legal_name"] = ""
+                    data_info["similarity"] = 0
            data_info["investment_type"] = investment_type
        else:
            data_info["id"] = ""