Compare commits


10 Commits

Author SHA1 Message Date
Blade He f7d53acdde support get sqlpass api by configuration 2025-02-19 14:37:21 -06:00
Blade He a8810519f8 optimize instructions configuration; optimize drilldown part logic 2025-02-04 15:29:24 -06:00
Blade He f9ef4cec96 update sql_query cache file store location (at most cache 5 days, then clean from local disk) 2025-01-31 10:59:54 -06:00
Blade He 7f37f3532f switch example document 2025-01-27 14:59:26 -06:00
Blade He 6f831e241c Merge branch 'aus_prospectus_ravi' 2025-01-27 12:32:42 -06:00
Blade He 41f8c307ff a little change 2025-01-27 12:32:36 -06:00
Blade He 47c41e492f 1. only get name mapping data from document mapping; 2. compare name mapping metrics between Ravi's and mine 2025-01-27 12:29:49 -06:00
Blade He d9b0bed39a a little change 2025-01-22 09:57:42 -06:00
Blade He 350550d1b0 fix issue for removing item from list 2025-01-21 17:24:05 -06:00
Blade He e2b9bcbdbc initial abbreviation configurations 2025-01-21 17:09:45 -06:00
11 changed files with 1518 additions and 202 deletions


@@ -44,6 +44,8 @@ def emea_ar_data_extract():
     output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
     output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
     drilldown_folder = r"./data/emea_ar/output/drilldown/"
+    db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
+    db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/"
     extract_way = "text"
     os.makedirs(pdf_folder, exist_ok=True)
@@ -51,12 +53,16 @@ def emea_ar_data_extract():
     os.makedirs(output_extract_data_folder, exist_ok=True)
     os.makedirs(output_mapping_data_folder, exist_ok=True)
     os.makedirs(drilldown_folder, exist_ok=True)
+    os.makedirs(db_mapping_document_folder, exist_ok=True)
+    os.makedirs(db_mapping_provider_folder, exist_ok=True)
     clean_folder(pdf_folder)
     clean_folder(output_pdf_text_folder)
     clean_folder(output_extract_data_folder)
     clean_folder(output_mapping_data_folder)
     clean_folder(drilldown_folder)
+    clean_folder(db_mapping_document_folder)
+    clean_folder(db_mapping_provider_folder)
     re_run_extract_data = False
     re_run_mapping_data = False
@@ -69,7 +75,8 @@ def emea_ar_data_extract():
         output_extract_data_folder=output_extract_data_folder,
         output_mapping_data_folder=output_mapping_data_folder,
         extract_way=extract_way,
-        drilldown_folder=drilldown_folder)
+        drilldown_folder=drilldown_folder,
+        compare_with_provider=False)
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
     doc_mapping_data = emea_ar_parsing.mapping_data(
         data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
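The driver now provisions and resets two extra db_mapping output folders. For readers outside the repo, the create-then-clean pattern used here is roughly equivalent to the sketch below; clean_folder is a real helper in this codebase, but its exact behavior is an assumption on my part:

    import os
    import shutil

    def clean_folder(folder: str) -> None:
        # Assumed behavior: drop whatever the previous run left behind,
        # then recreate the (now empty) folder.
        shutil.rmtree(folder, ignore_errors=True)
        os.makedirs(folder, exist_ok=True)

    db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
    os.makedirs(db_mapping_document_folder, exist_ok=True)
    clean_folder(db_mapping_document_folder)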


@@ -1,6 +1,6 @@
 {
 "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
-"management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]},
+"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
 "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
 "performance_fee": {"english": ["performance fee", "performance fees"]},
 "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},

File diff suppressed because it is too large.


@@ -32,22 +32,24 @@ from openai import AzureOpenAI
 ABB_JSON = dict()
 
-def get_abb_json():
+def get_abb_json(doc_source: str = "aus_prospectus"):
     global ABB_JSON
-    with open("abbreviation_records.json", "r") as file:
-        # Load the JSON and convert keys to lowercase
-        ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
+    if len(ABB_JSON.keys()) == 0:
+        with open(f"./configuration/{doc_source}/abbreviation_records.json", "r") as file:
+            # Load the JSON and convert keys to lowercase
+            ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
 
-def get_abbre_format_str(fundname):
+def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
     """Replaces abbreviations in a fund name with their expanded forms."""
     # Convert fund name to lowercase while matching
     f_list = fundname.lower().split()
+    get_abb_json(doc_source)
     updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
     return " ".join(updated_doc_fname_words)
 
-def replace_abbrevs_in_fundnames(fund_names_list):
+def replace_abbrevs_in_fundnames(fund_names_list, doc_source: str = "aus_prospectus"):
     """Replaces abbreviations in a list of fund names."""
-    return [get_abbre_format_str(fund_name) for fund_name in fund_names_list]
+    return [get_abbre_format_str(fund_name, doc_source) for fund_name in fund_names_list]
 
 ### STEP 2 - Remove Stopwords
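The loader is now lazy and doc-source aware: the abbreviation table is read once per process and cached in the module-level ABB_JSON. A minimal self-contained sketch of the same pattern (the path comes from the diff; the example record is invented for illustration):

    import json

    ABB_JSON = {}

    def get_abb_json(doc_source: str = "aus_prospectus"):
        global ABB_JSON
        if not ABB_JSON:  # load once, then reuse the cached table
            with open(f"./configuration/{doc_source}/abbreviation_records.json") as f:
                ABB_JSON = {k.lower(): v for k, v in json.load(f).items()}
        # Note: the first doc_source wins; later calls with a different source
        # reuse the already-cached table rather than reloading.

    # With a record such as {"intl": "international", "eq": "equity"},
    # "AGA Intl Eq Fund" would normalize to "aga international equity fund".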
@@ -438,7 +440,7 @@ def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name,
     return dt
 
-def final_function_to_match(doc_id, pred_list, db_list, provider_name):
+def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_source: str = "aus_prospectus"):
     final_result = {}
     df_data = []
     unmatched_pred_list = pred_list.copy()
@@ -456,12 +458,16 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                 step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2],
                 step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2],
                 llm_flag=False))
-            unmatched_db_list.remove(db_list[matched_index])
-            unmatched_pred_list.remove(pred_list[index])
+            if db_list[matched_index] in unmatched_db_list:
+                unmatched_db_list.remove(db_list[matched_index])
+            # unmatched_db_list.remove(db_list[matched_index])
+            if pred_list[index] in unmatched_pred_list:
+                unmatched_pred_list.remove(pred_list[index])
+            # unmatched_pred_list.remove(pred_list[index])
         else:
             ### STEP-1 Abbreviation replacement
-            cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund])[0]
-            cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list)
+            cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund], doc_source)[0]
+            cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list, doc_source)
             # print("--> ",cleaned_db_list1, cleaned_pred_name1)
             step1_result, matched_index, all_scores1_, all_matched_fund_names1_ = get_fund_match_final_score(cleaned_db_list1, cleaned_pred_name1)
             # print(f"\nStep 1 - Abbreviation Replacement Result: {step1_result}")
@@ -477,8 +483,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                     step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1,
                     step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2],
                     step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False))
-                unmatched_db_list.remove(db_list[matched_index])
-                unmatched_pred_list.remove(pred_list[index])
+                if db_list[matched_index] in unmatched_db_list:
+                    unmatched_db_list.remove(db_list[matched_index])
+                # unmatched_db_list.remove(db_list[matched_index])
+                if pred_list[index] in unmatched_pred_list:
+                    unmatched_pred_list.remove(pred_list[index])
+                # unmatched_pred_list.remove(pred_list[index])
             else:
                 ### STEP-2 Remove Stopwords
                 cleaned_pred_name2 = remove_stopwords_nltk([cleaned_pred_name1])[0]
@@ -501,8 +511,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                         step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2,
                         step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2],
                         step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False))
-                    unmatched_db_list.remove(db_list[matched_index])
-                    unmatched_pred_list.remove(pred_list[index])
+                    if db_list[matched_index] in unmatched_db_list:
+                        unmatched_db_list.remove(db_list[matched_index])
+                    # unmatched_db_list.remove(db_list[matched_index])
+                    if pred_list[index] in unmatched_pred_list:
+                        unmatched_pred_list.remove(pred_list[index])
+                    # unmatched_pred_list.remove(pred_list[index])
                 else:
                     ### STEP-3 Special Character Removal
                     cleaned_pred_name3 = remove_special_characters([cleaned_pred_name2])[0]
@@ -527,8 +541,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                         step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3,
                         step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2],
                         step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False))
-                    unmatched_db_list.remove(db_list[matched_index])
-                    unmatched_pred_list.remove(pred_list[index])
+                    if db_list[matched_index] in unmatched_db_list:
+                        unmatched_db_list.remove(db_list[matched_index])
+                    # unmatched_db_list.remove(db_list[matched_index])
+                    if pred_list[index] in unmatched_pred_list:
+                        unmatched_pred_list.remove(pred_list[index])
+                    # unmatched_pred_list.remove(pred_list[index])
                 else:
                     ### STEP-4 Common Words Removal
                     cleaned_db_list4, _ = remove_common_words(cleaned_db_list3)
@@ -565,8 +583,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                         # print("unmatched_pred_list: ",unmatched_pred_list)
                         # print("db_list[matched_index]: ",db_list[matched_index])
                         # print("pred_list[index]: ",pred_list[index])
-                        unmatched_db_list.remove(db_list[matched_index])
-                        unmatched_pred_list.remove(pred_list[index])
+                        if db_list[matched_index] in unmatched_db_list:
+                            unmatched_db_list.remove(db_list[matched_index])
+                        # unmatched_db_list.remove(db_list[matched_index])
+                        if pred_list[index] in unmatched_pred_list:
+                            unmatched_pred_list.remove(pred_list[index])
+                        # unmatched_pred_list.remove(pred_list[index])
                     else:
                         df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name4,
                                                        db_list[matched_index],
@@ -595,11 +617,11 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
     # print("==>>> DB LIST: ",unmatched_db_list)
     # print("==>>> PRED LIST: ",unmatched_pred_list)
     if len(unmatched_pred_list)!=0:
-        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list)
+        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source)
         cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list)
         cleaned_unmatched_pred_list = remove_special_characters(cleaned_unmatched_pred_list)
-        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list)
+        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list, doc_source)
         cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list)
         cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list)
     prompt_context = f"""
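A pattern repeated throughout this commit (and matching the "fix issue for removing item from list" commit message) replaces bare list.remove() calls with a membership-guarded version. The point, as a standalone sketch: remove() raises ValueError when the element is absent, which can happen here when two predicted names resolve to the same database name:

    unmatched = ["Fund A", "Fund B"]

    def safe_remove(items: list, value) -> None:
        # Guarding with `in` makes a second removal of the same match a
        # no-op instead of a ValueError.
        if value in items:
            items.remove(value)

    safe_remove(unmatched, "Fund A")
    safe_remove(unmatched, "Fund A")  # silently ignored on the second call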


@@ -969,7 +969,9 @@ class DataExtraction:
             if datapoint_name == "performance_fee":
                 datapoint_name = "performance fees"
             else:
-                datapoint_name = datapoint_name.upper()
+                datapoint_name = self.datapoint_name_config.get(datapoint_name, "")
+                if len(datapoint_name) == 0:
+                    datapoint_name = datapoint.upper()
             reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
             instructions.append(reported_name)
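Instead of upper-casing the raw datapoint key, the instruction builder now prefers a display name from datapoint_name_config and falls back to upper case only when the key is unmapped. The same lookup-then-fallback in isolation (the config contents here are illustrative):

    datapoint_name_config = {"management_fee_and_costs": "Management fees and costs"}

    def display_name(datapoint: str) -> str:
        # Prefer the configured human-readable name; fall back to the raw key.
        name = datapoint_name_config.get(datapoint, "")
        return name if name else datapoint.upper()

    print(display_name("management_fee_and_costs"))  # Management fees and costs
    print(display_name("buy_spread"))                # BUY_SPREAD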


@@ -1,6 +1,7 @@
 import os
 import json
 import pandas as pd
+from copy import deepcopy
 from utils.biz_utils import get_most_similar_name, remove_common_word
 from utils.sql_query_util import (
     query_document_fund_mapping,
@@ -18,14 +19,18 @@ class DataMapping:
         raw_document_data_list: list,
         document_mapping_info_df: pd.DataFrame,
         output_data_folder: str,
-        doc_source: str = "emea_ar"
+        doc_source: str = "emea_ar",
+        compare_with_provider: bool = True
     ):
         self.doc_id = doc_id
         self.datapoints = datapoints
         self.doc_source = doc_source
+        self.compare_with_provider = compare_with_provider
         self.raw_document_data_list = raw_document_data_list
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
+            self.document_mapping_info_df = query_document_fund_mapping(
+                doc_id, rerun=False
+            )
         else:
             self.document_mapping_info_df = document_mapping_info_df
@@ -44,7 +49,9 @@ class DataMapping:
     def set_mapping_data_by_db(self, document_mapping_info_df: pd.DataFrame):
         logger.info("Setting document mapping data")
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(self.doc_id, rerun=False)
+            self.document_mapping_info_df = query_document_fund_mapping(
+                self.doc_id, rerun=False
+            )
         else:
             self.document_mapping_info_df = document_mapping_info_df
         if len(self.document_mapping_info_df) == 0:
@@ -92,26 +99,27 @@ class DataMapping:
     def get_provider_mapping(self):
         if len(self.document_mapping_info_df) == 0:
             return pd.DataFrame()
-        provider_id_list = (
-            self.document_mapping_info_df["ProviderId"].unique().tolist()
-        )
+        provider_id_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
         provider_mapping_list = []
         for provider_id in provider_id_list:
-            provider_mapping_list.append(query_investment_by_provider(provider_id, rerun=False))
+            provider_mapping_list.append(
+                query_investment_by_provider(provider_id, rerun=False)
+            )
         provider_mapping_df = pd.concat(provider_mapping_list)
         provider_mapping_df = provider_mapping_df.drop_duplicates()
         provider_mapping_df.reset_index(drop=True, inplace=True)
         return provider_mapping_df
 
     def mapping_raw_data_entrance(self):
-        if self.doc_source == "emear_ar":
+        if self.doc_source == "emea_ar":
             return self.mapping_raw_data()
         elif self.doc_source == "aus_prospectus":
-            return self.mapping_raw_data_aus()
+            return self.mapping_raw_data_generic()
         else:
             return self.mapping_raw_data()
+            # return self.mapping_raw_data_generic()
 
-    def mapping_raw_data_aus(self):
+    def mapping_raw_data_generic(self):
         logger.info(f"Mapping raw data for AUS Prospectus document {self.doc_id}")
         mapped_data_list = []
         # Generate raw name based on fund name and share name by integrate_share_name
@@ -128,7 +136,9 @@ class DataMapping:
             raw_share_name = raw_data.get("share_name", "")
             raw_data_keys = list(raw_data.keys())
             if len(raw_share_name) > 0:
-                integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                integrated_share_name = self.integrate_share_name(
+                    raw_fund_name, raw_share_name
+                )
                 if integrated_share_name not in share_raw_name_list:
                     share_raw_name_list.append(integrated_share_name)
                 for datapoint in self.datapoints:
@@ -144,7 +154,7 @@ class DataMapping:
                             "investment_type": 1,
                             "investment_id": "",
                             "investment_name": "",
-                            "similarity": 0
+                            "similarity": 0,
                         }
                         mapped_data_list.append(mapped_data)
             else:
@@ -162,29 +172,38 @@ class DataMapping:
                             "value": raw_data[datapoint],
                             "investment_type": 33,
                             "investment_id": "",
-                            "investment_name": ""
+                            "investment_name": "",
                         }
                         mapped_data_list.append(mapped_data)
         # Mapping raw data with database
-        iter_count = 30
+        iter_count = 60
         fund_match_result = {}
         if len(fund_raw_name_list) > 0:
-            fund_match_result = self.get_raw_name_db_match_result(fund_raw_name_list, "fund", iter_count)
-            logger.info(f"Fund match result: \n{fund_match_result}")
+            fund_match_result = self.get_raw_name_db_match_result(
+                fund_raw_name_list, "fund", iter_count
+            )
+            # logger.info(f"Fund match result: \n{fund_match_result}")
         share_match_result = {}
         if len(share_raw_name_list) > 0:
-            share_match_result = self.get_raw_name_db_match_result(share_raw_name_list, "share", iter_count)
-            logger.info(f"Share match result: \n{share_match_result}")
+            share_match_result = self.get_raw_name_db_match_result(
+                share_raw_name_list, "share", iter_count
+            )
+            # logger.info(f"Share match result: \n{share_match_result}")
         for mapped_data in mapped_data_list:
             investment_type = mapped_data["investment_type"]
             raw_name = mapped_data["raw_name"]
             if investment_type == 33:
                 if fund_match_result.get(raw_name) is not None:
                     matched_db_fund_name = fund_match_result[raw_name]
-                    if matched_db_fund_name is not None and len(matched_db_fund_name) > 0:
+                    if (
+                        matched_db_fund_name is not None
+                        and len(matched_db_fund_name) > 0
+                    ):
                         # get FundId from self.doc_fund_mapping
-                        find_fund_df = self.doc_fund_mapping[self.doc_fund_mapping["FundName"] == matched_db_fund_name]
+                        find_fund_df = self.doc_fund_mapping[
+                            self.doc_fund_mapping["FundName"] == matched_db_fund_name
+                        ]
                         if find_fund_df is not None and len(find_fund_df) > 0:
                             fund_id = find_fund_df["FundId"].values[0]
                             mapped_data["investment_id"] = fund_id
@@ -193,38 +212,82 @@ class DataMapping:
             if investment_type == 1:
                 if share_match_result.get(raw_name) is not None:
                     matched_db_share_name = share_match_result[raw_name]
-                    if matched_db_share_name is not None and len(matched_db_share_name) > 0:
+                    if (
+                        matched_db_share_name is not None
+                        and len(matched_db_share_name) > 0
+                    ):
                         # get SecId from self.doc_fund_class_mapping
-                        find_share_df = self.doc_fund_class_mapping[self.doc_fund_class_mapping["ShareClassName"] == matched_db_share_name]
+                        find_share_df = self.doc_fund_class_mapping[
+                            self.doc_fund_class_mapping["ShareClassName"]
+                            == matched_db_share_name
+                        ]
                         if find_share_df is not None and len(find_share_df) > 0:
                             share_id = find_share_df["SecId"].values[0]
                             mapped_data["investment_id"] = share_id
                             mapped_data["investment_name"] = matched_db_share_name
                             mapped_data["similarity"] = 1
         self.output_mapping_file(mapped_data_list)
         return mapped_data_list
 
-    def get_raw_name_db_match_result(self, raw_name_list, investment_type: str, iter_count: int = 30):
+    def get_raw_name_db_match_result(
+        self, raw_name_list, investment_type: str, iter_count: int = 30
+    ):
         # split raw_name_list into several parts which each part is with 30 elements
         # The reason to split is to avoid invoke token limitation issues from CahtGPT
-        raw_name_list_parts = [raw_name_list[i:i + iter_count]
-                               for i in range(0, len(raw_name_list), iter_count)]
+        raw_name_list_parts = [
+            raw_name_list[i : i + iter_count]
+            for i in range(0, len(raw_name_list), iter_count)
+        ]
         all_match_result = {}
+        doc_fund_name_list = deepcopy(self.doc_fund_name_list)
+        doc_share_name_list = deepcopy(self.doc_share_name_list)
         for raw_name_list in raw_name_list_parts:
             if investment_type == "fund":
-                match_result = final_function_to_match(doc_id=self.doc_id,
-                                                       pred_list=raw_name_list,
-                                                       db_list=self.doc_fund_name_list,
-                                                       provider_name=self.provider_name)
+                match_result, doc_fund_name_list = self.get_final_function_to_match(
+                    raw_name_list, doc_fund_name_list
+                )
             else:
-                match_result = final_function_to_match(doc_id=self.doc_id,
-                                                       pred_list=raw_name_list,
-                                                       db_list=self.doc_share_name_list,
-                                                       provider_name=self.provider_name)
+                match_result, doc_share_name_list = self.get_final_function_to_match(
+                    raw_name_list, doc_share_name_list
+                )
             all_match_result.update(match_result)
         return all_match_result
 
+    def get_final_function_to_match(self, raw_name_list, db_name_list):
+        if len(db_name_list) == 0:
+            match_result = {}
+            for raw_name in raw_name_list:
+                match_result[raw_name] = ""
+        else:
+            match_result = final_function_to_match(
+                doc_id=self.doc_id,
+                pred_list=raw_name_list,
+                db_list=db_name_list,
+                provider_name=self.provider_name,
+                doc_source=self.doc_source
+            )
+            matched_name_list = list(match_result.values())
+            db_name_list = self.remove_matched_names(db_name_list, matched_name_list)
+        return match_result, db_name_list
 
+    def remove_matched_names(self, target_name_list: list, matched_name_list: list):
+        if len(matched_name_list) == 0:
+            return target_name_list
+        matched_name_list = list(set(matched_name_list))
+        matched_name_list = [
+            value for value in matched_name_list if value is not None and len(value) > 0
+        ]
+        for matched_name in matched_name_list:
+            if (
+                matched_name is not None
+                and len(matched_name) > 0
+                and matched_name in target_name_list
+            ):
+                target_name_list.remove(matched_name)
+        return target_name_list
 
     def mapping_raw_data(self):
         """
         doc_id, page_index, datapoint, value,
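get_raw_name_db_match_result batches candidate names before calling the LLM-backed matcher and, new in this commit, shrinks the database-name pool after each batch (via remove_matched_names on a deepcopy) so an already-claimed name cannot be matched twice. The batching itself is ordinary list chunking; per the code comment, each part is kept small (30, now 60, names) so a single matching prompt stays under the model's token limit:

    def chunk(items: list, size: int) -> list:
        # e.g. chunk(list(range(7)), 3) -> [[0, 1, 2], [3, 4, 5], [6]]
        return [items[i : i + size] for i in range(0, len(items), size)]

    parts = chunk(["fund a", "fund b", "fund c", "fund d"], 2)
    print(parts)  # [['fund a', 'fund b'], ['fund c', 'fund d']]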
@@ -245,9 +308,14 @@ class DataMapping:
             if raw_fund_name is None or len(raw_fund_name) == 0:
                 continue
             raw_share_name = raw_data.get("share_name", "")
-            if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
+            if (
+                len(self.doc_fund_name_list) == 0
+                and len(self.provider_fund_name_list) == 0
+            ):
                 if len(raw_share_name) > 0:
-                    integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                    integrated_share_name = self.integrate_share_name(
+                        raw_fund_name, raw_share_name
+                    )
                     raw_data_keys = list(raw_data.keys())
                     for datapoint in self.datapoints:
                         if datapoint in raw_data_keys:
@@ -262,7 +330,7 @@ class DataMapping:
                                 "investment_type": 1,
                                 "investment_id": "",
                                 "investment_name": "",
-                                "similarity": 0
+                                "similarity": 0,
                             }
                             mapped_data_list.append(mapped_data)
                 else:
@@ -279,13 +347,15 @@ class DataMapping:
                                 "value": raw_data[datapoint],
                                 "investment_type": 33,
                                 "investment_id": "",
-                                "investment_name": ""
+                                "investment_name": "",
                             }
                             mapped_data_list.append(mapped_data)
             else:
                 raw_name = ""
                 if raw_share_name is not None and len(raw_share_name) > 0:
-                    raw_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                    raw_name = self.integrate_share_name(
+                        raw_fund_name, raw_share_name
+                    )
                     if mapped_share_cache.get(raw_name) is not None:
                         investment_info = mapped_share_cache[raw_name]
                     else:
@@ -298,14 +368,20 @@ class DataMapping:
                         )
                         fund_id = fund_info["id"]
                         mapped_fund_cache[raw_fund_name] = fund_info
-                        investment_info = self.matching_with_database(
-                            raw_name=raw_name,
-                            raw_share_name=raw_share_name,
-                            raw_fund_name=raw_fund_name,
-                            parent_id=fund_id,
-                            matching_type="share",
-                            process_cache=process_cache
-                        )
+                        investment_info = {}
+                        if len(fund_id) > 0:
+                            investment_info = self.mapping_unique_raw_data(fund_id=fund_id,
+                                                                           raw_fund_name=raw_fund_name,
+                                                                           raw_data_list=raw_data_list)
+                        if investment_info.get("id", None) is None or len(investment_info.get("id", "")) == 0:
+                            investment_info = self.matching_with_database(
+                                raw_name=raw_name,
+                                raw_share_name=raw_share_name,
+                                raw_fund_name=raw_fund_name,
+                                parent_id=fund_id,
+                                matching_type="share",
+                                process_cache=process_cache,
+                            )
                         mapped_share_cache[raw_name] = investment_info
                 elif raw_fund_name is not None and len(raw_fund_name) > 0:
                     raw_name = raw_fund_name
@@ -322,7 +398,7 @@ class DataMapping:
                         "id": "",
                         "legal_name": "",
                         "investment_type": -1,
-                        "similarity": 0
+                        "similarity": 0,
                     }
                 raw_data_keys = list(raw_data.keys())
@@ -339,13 +415,35 @@ class DataMapping:
                             "investment_type": investment_info["investment_type"],
                             "investment_id": investment_info["id"],
                             "investment_name": investment_info["legal_name"],
-                            "similarity": investment_info["similarity"]
+                            "similarity": investment_info["similarity"],
                         }
                         mapped_data_list.append(mapped_data)
         self.output_mapping_file(mapped_data_list)
         return mapped_data_list
 
+    def mapping_unique_raw_data(self, fund_id: str, raw_fund_name: str, raw_data_list: list):
+        share_count = 0
+        for raw_data in raw_data_list:
+            fund_name = raw_data.get("fund_name", "")
+            share_name = raw_data.get("share_name", "")
+            if fund_name == raw_fund_name and share_name is not None and len(share_name) > 0:
+                share_count += 1
+                if share_count > 1:
+                    break
+        data_info = {}
+        if share_count == 1:
+            doc_compare_mapping = self.doc_fund_class_mapping[
+                self.doc_fund_class_mapping["FundId"] == fund_id
+            ]
+            if len(doc_compare_mapping) == 1:
+                data_info["id"] = doc_compare_mapping["SecId"].values[0]
+                data_info["legal_name"] = doc_compare_mapping["ShareClassName"].values[0]
+                data_info["investment_type"] = 1
+                data_info["similarity"] = 1
+        return data_info
 
     def output_mapping_file(self, mapped_data_list: list):
         json_data_file = os.path.join(
             self.output_data_json_folder, f"{self.doc_id}.json"
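The new mapping_unique_raw_data short-circuits the fuzzy matcher: when the document reports exactly one share class for a fund and the database also holds exactly one share class for that FundId, the pair is mapped directly with similarity 1. The decision logic in isolation (toy frames; only the column names are taken from the diff):

    import pandas as pd

    doc_fund_class_mapping = pd.DataFrame(
        {"FundId": ["F1"], "SecId": ["S1"], "ShareClassName": ["Class A"]}
    )
    raw_data_list = [{"fund_name": "My Fund", "share_name": "A"}]

    shares_in_doc = sum(
        1 for r in raw_data_list
        if r.get("fund_name") == "My Fund" and r.get("share_name")
    )
    candidates = doc_fund_class_mapping[doc_fund_class_mapping["FundId"] == "F1"]
    if shares_in_doc == 1 and len(candidates) == 1:
        # Unambiguous on both sides: map directly, skip similarity scoring.
        print(candidates["SecId"].values[0])  # S1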
@@ -355,10 +453,10 @@ class DataMapping:
         extract_data_df = pd.DataFrame(self.raw_document_data_list)
         extract_data_df.reset_index(drop=True, inplace=True)
         mapping_data_df = pd.DataFrame(mapped_data_list)
         mapping_data_df.reset_index(drop=True, inplace=True)
         excel_data_file = os.path.join(
             self.output_data_excel_folder, f"{self.doc_id}.xlsx"
         )
@@ -373,7 +471,7 @@ class DataMapping:
         raw_name = ""
         if raw_share_name is not None and len(raw_share_name) > 0:
             raw_name = raw_share_name
             # some share names are very short,
             # so we need to combine with fund name
             raw_name_splits = raw_name.split()
             raw_fund_name_splits = raw_fund_name.split()
@@ -384,13 +482,13 @@ class DataMapping:
         return raw_name
 
     def matching_with_database(
         self,
         raw_name: str,
         raw_share_name: str = None,
         raw_fund_name: str = None,
         parent_id: str = None,
         matching_type: str = "fund",
-        process_cache: dict = {}
+        process_cache: dict = {},
     ):
         if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
             data_info["id"] = ""
@@ -402,7 +500,7 @@ class DataMapping:
             data_info["investment_type"] = investment_type
             data_info["similarity"] = 0
             return data_info
         if matching_type == "fund":
             doc_compare_name_list = self.doc_fund_name_list
             doc_compare_mapping = self.doc_fund_mapping
@@ -417,8 +515,9 @@ class DataMapping:
             doc_compare_mapping = self.doc_fund_class_mapping[
                 self.doc_fund_class_mapping["FundId"] == parent_id
             ]
-            provider_compare_mapping = self.provider_fund_class_mapping\
-                [self.provider_fund_class_mapping["FundId"] == parent_id]
+            provider_compare_mapping = self.provider_fund_class_mapping[
+                self.provider_fund_class_mapping["FundId"] == parent_id
+            ]
             if len(doc_compare_mapping) == 0:
                 if len(provider_compare_mapping) == 0:
                     doc_compare_name_list = self.doc_share_name_list
@@ -435,9 +534,10 @@ class DataMapping:
                 doc_compare_name_list = (
                     doc_compare_mapping["ShareClassName"].unique().tolist()
                 )
-                if len(provider_compare_mapping) == 0 or \
-                        len(provider_compare_mapping) < len(doc_compare_mapping):
+                if len(provider_compare_mapping) == 0 or len(
+                    provider_compare_mapping
+                ) < len(doc_compare_mapping):
                     provider_compare_name_list = doc_compare_name_list
                     provider_compare_mapping = doc_compare_mapping
                 else:
@@ -459,58 +559,68 @@ class DataMapping:
         if doc_compare_name_list is not None and len(doc_compare_name_list) > 0:
             _, pre_common_word_list = remove_common_word(doc_compare_name_list)
             max_similarity_name, max_similarity = get_most_similar_name(
                 raw_name,
                 doc_compare_name_list,
                 share_name=raw_share_name,
                 fund_name=raw_fund_name,
                 matching_type=matching_type,
-                process_cache=process_cache)
+                process_cache=process_cache,
+            )
             if matching_type == "fund":
                 threshold = 0.7
             else:
-                threshold = 0.9
+                if self.compare_with_provider:
+                    threshold = 0.9
+                else:
+                    threshold = 0.6
             if max_similarity is not None and max_similarity >= threshold:
                 data_info["id"] = doc_compare_mapping[
                     doc_compare_mapping[compare_name_dp] == max_similarity_name
                 ][compare_id_dp].values[0]
                 data_info["legal_name"] = max_similarity_name
                 data_info["similarity"] = max_similarity
             if data_info.get("id", None) is None or len(data_info.get("id", "")) == 0:
                 # set pre_common_word_list, reason: the document mapping for same fund maybe different with provider mapping
                 # the purpose is to get the most common word list, to improve the similarity.
-                max_similarity_name, max_similarity = get_most_similar_name(
-                    raw_name,
-                    provider_compare_name_list,
-                    share_name=raw_share_name,
-                    fund_name=raw_fund_name,
-                    matching_type=matching_type,
-                    pre_common_word_list=pre_common_word_list,
-                    process_cache=process_cache
-                )
-                threshold = 0.7
-                if matching_type == "share":
-                    threshold = 0.5
-                round_similarity = 0
-                if max_similarity is not None and isinstance(max_similarity, float):
-                    round_similarity = round(max_similarity, 1)
-                if round_similarity is not None and round_similarity >= threshold:
-                    data_info["id"] = provider_compare_mapping[
-                        provider_compare_mapping[compare_name_dp] == max_similarity_name
-                    ][compare_id_dp].values[0]
-                    data_info["legal_name"] = max_similarity_name
-                    data_info["similarity"] = max_similarity
-                else:
-                    if len(doc_compare_name_list) == 1:
-                        data_info["id"] = doc_compare_mapping[
-                            doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0]
-                        ][compare_id_dp].values[0]
-                        data_info["legal_name"] = doc_compare_name_list[0]
-                        data_info["similarity"] = 1
-                    else:
-                        data_info["id"] = ""
-                        data_info["legal_name"] = ""
-                        data_info["similarity"] = 0
+                if self.compare_with_provider:
+                    max_similarity_name, max_similarity = get_most_similar_name(
+                        raw_name,
+                        provider_compare_name_list,
+                        share_name=raw_share_name,
+                        fund_name=raw_fund_name,
+                        matching_type=matching_type,
+                        pre_common_word_list=pre_common_word_list,
+                        process_cache=process_cache,
+                    )
+                    threshold = 0.7
+                    if matching_type == "share":
+                        threshold = 0.5
+                    round_similarity = 0
+                    if max_similarity is not None and isinstance(max_similarity, float):
+                        round_similarity = round(max_similarity, 1)
+                    if round_similarity is not None and round_similarity >= threshold:
+                        data_info["id"] = provider_compare_mapping[
+                            provider_compare_mapping[compare_name_dp] == max_similarity_name
+                        ][compare_id_dp].values[0]
+                        data_info["legal_name"] = max_similarity_name
+                        data_info["similarity"] = max_similarity
+                    else:
+                        if len(doc_compare_name_list) == 1:
+                            data_info["id"] = doc_compare_mapping[
+                                doc_compare_mapping[compare_name_dp]
+                                == doc_compare_name_list[0]
+                            ][compare_id_dp].values[0]
+                            data_info["legal_name"] = doc_compare_name_list[0]
+                            data_info["similarity"] = 1
+                        else:
+                            data_info["id"] = ""
+                            data_info["legal_name"] = ""
+                            data_info["similarity"] = 0
+                else:
+                    data_info["id"] = ""
+                    data_info["legal_name"] = ""
+                    data_info["similarity"] = 0
             data_info["investment_type"] = investment_type
         else:
             data_info["id"] = ""


@@ -61,23 +61,8 @@
         "---Example End---",
         "The output should be:",
         "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
-        "- 5. Reverse order of data columns from table text in PDF:",
-        "For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.",
-        "---Example 1 Start---",
-        "Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced",
-        "---Example 1 End---",
-        "For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts",
-        "The output should be: ",
-        "{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]",
-        "\n",
-        "---Example 2 Start---",
-        "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
-        "---Example 2 End---",
-        "For this case, Management fees and costs = Management fees + Indirect Fee.",
-        "The output should be:",
-        "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
         "- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
         "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
     ],
     "investment_level": {
         "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
@@ -136,7 +121,7 @@
     "special_rule": {
         "management_fee_and_costs": [
             "If there are multiple Management fee and costs reported names, here is the priority rule:",
-            "- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
+            "A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
             "---Example Start---",
             "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
             "---Example End---",
@@ -144,19 +129,24 @@
             "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
             "\n",
             "If there are multiple Management fee and costs sub-columns, here is the rule:",
-            "- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
+            "B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
             "---Example Start---",
-            "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
+            "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
             "---Example End---",
             "The output should be:",
             "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
             "\n",
-            "- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".",
-            "---Example Start---",
-            "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n",
-            "---Example End---",
-            "The output should be:",
-            "{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]"
+            "C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
+            "---Example 1 Start---",
+            "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
+            "---Example 1 End---",
+            "The output should be:",
+            "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
+            "---Example 2 Start---",
+            "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
+            "---Example 2 End---",
+            "The output should be:",
+            "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
         ],
         "buy_spread": [
             "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
@@ -263,7 +253,7 @@
     "date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
     "date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
     "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
-    "minimum_initial_investment_value": [0, 5, 12],
+    "minimum_initial_investment_value": [0, 5000, 10000],
     "recoverable_expenses_value": [0.12, 0.05, 0.06],
     "indirect_costs_value": [0.12, 0.16, 0.02]
 },
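These special_rule arrays hold prompt fragments that are joined into the extraction instructions at runtime. A minimal sketch of that assembly, with the strings taken from the config above but the join logic assumed rather than read from the repo:

    special_rule = {
        "management_fee_and_costs": [
            "If there are multiple Management fee and costs reported names, here is the priority rule:",
            "A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", "
            "pick up the values from \"Total Management fees and costs (net)\".",
        ]
    }

    # Rules such as A/B/C above become one newline-separated block in the prompt.
    prompt_rules = "\n".join(special_rule["management_fee_and_costs"])
    print(prompt_rules.splitlines()[0])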

main.py (168 lines changed)

@ -31,11 +31,14 @@ class EMEA_AR_Parsing:
output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/", output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
extract_way: str = "text", extract_way: str = "text",
drilldown_folder: str = r"/data/emea_ar/output/drilldown/", drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
compare_with_provider: bool = True
) -> None: ) -> None:
self.doc_id = doc_id self.doc_id = doc_id
self.doc_source = doc_source self.doc_source = doc_source
self.pdf_folder = pdf_folder self.pdf_folder = pdf_folder
os.makedirs(self.pdf_folder, exist_ok=True) os.makedirs(self.pdf_folder, exist_ok=True)
self.compare_with_provider = compare_with_provider
self.pdf_file = self.download_pdf() self.pdf_file = self.download_pdf()
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
@ -72,11 +75,11 @@ class EMEA_AR_Parsing:
os.makedirs(self.output_mapping_data_folder, exist_ok=True) os.makedirs(self.output_mapping_data_folder, exist_ok=True)
self.filter_pages = FilterPages( self.filter_pages = FilterPages(
self.doc_id, self.doc_id,
self.pdf_file, self.pdf_file,
self.document_mapping_info_df, self.document_mapping_info_df,
self.doc_source, self.doc_source,
output_pdf_text_folder output_pdf_text_folder,
) )
self.page_text_dict = self.filter_pages.page_text_dict self.page_text_dict = self.filter_pages.page_text_dict
@ -87,7 +90,9 @@ class EMEA_AR_Parsing:
drilldown_folder = r"/data/emea_ar/output/drilldown/" drilldown_folder = r"/data/emea_ar/output/drilldown/"
os.makedirs(drilldown_folder, exist_ok=True) os.makedirs(drilldown_folder, exist_ok=True)
self.drilldown_folder = drilldown_folder self.drilldown_folder = drilldown_folder
misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json") misc_config_file = os.path.join(
f"./configuration/{doc_source}/", "misc_config.json"
)
if os.path.exists(misc_config_file): if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as f: with open(misc_config_file, "r", encoding="utf-8") as f:
misc_config = json.load(f) misc_config = json.load(f)
@ -249,6 +254,14 @@ class EMEA_AR_Parsing:
except Exception as e: except Exception as e:
logger.error(f"Error: {e}") logger.error(f"Error: {e}")
annotation_list = annotation_list_df.to_dict(orient="records") annotation_list = annotation_list_df.to_dict(orient="records")
try:
drilldown_json_file = os.path.join(
drilldown_data_folder, f"{doc_id}_drilldown.json"
)
with open(drilldown_json_file, "w", encoding="utf-8") as f:
json.dump(annotation_list, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"Error: {e}")
return annotation_list return annotation_list
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
@ -278,7 +291,8 @@ class EMEA_AR_Parsing:
data_from_gpt, data_from_gpt,
self.document_mapping_info_df, self.document_mapping_info_df,
self.output_mapping_data_folder, self.output_mapping_data_folder,
self.doc_source self.doc_source,
compare_with_provider=self.compare_with_provider
) )
return data_mapping.mapping_raw_data_entrance() return data_mapping.mapping_raw_data_entrance()
@ -334,6 +348,7 @@ def mapping_data(
output_mapping_data_folder=output_mapping_folder, output_mapping_data_folder=output_mapping_folder,
extract_way=extract_way, extract_way=extract_way,
drilldown_folder=drilldown_folder, drilldown_folder=drilldown_folder,
compare_with_provider=False
) )
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data( doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
re_run=re_run_extract_data re_run=re_run_extract_data
@ -501,19 +516,30 @@ def batch_start_job(
result_extract_data_df.to_excel( result_extract_data_df.to_excel(
writer, index=False, sheet_name="extract_data" writer, index=False, sheet_name="extract_data"
) )
if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file): if (
doc_source == "aus_prospectus"
and document_mapping_file is not None
and len(document_mapping_file) > 0
and os.path.exists(document_mapping_file)
):
try: try:
merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/") merged_total_data_folder = os.path.join(
output_mapping_total_folder, "merged/"
)
os.makedirs(merged_total_data_folder, exist_ok=True) os.makedirs(merged_total_data_folder, exist_ok=True)
data_file_base_name = os.path.basename(output_file) data_file_base_name = os.path.basename(output_file)
output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) output_merged_data_file_path = os.path.join(
merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path) merged_total_data_folder, "merged_" + data_file_base_name
)
merge_output_data_aus_prospectus(
output_file, document_mapping_file, output_merged_data_file_path
)
except Exception as e: except Exception as e:
logger.error(f"Error: {e}") logger.error(f"Error: {e}")
if calculate_metrics: if calculate_metrics:
prediction_sheet_name = "total_mapping_data" prediction_sheet_name = "data_in_doc_mapping"
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
ground_truth_sheet_name = "mapping_data" ground_truth_sheet_name = "mapping_data"
metrics_output_folder = r"/data/emea_ar/output/metrics/" metrics_output_folder = r"/data/emea_ar/output/metrics/"
@@ -770,11 +796,11 @@ def test_auto_generate_instructions():
def test_data_extraction_metrics():
-    data_type = "data_extraction"
+    data_type = "document_mapping_in_db"
    # prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
-    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240922152517.xlsx"
+    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_51_documents_by_text_20250127104008.xlsx"
    # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx"
-    prediction_sheet_name = "mapping_data"
+    prediction_sheet_name = "data_in_doc_mapping"
    ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
    ground_truth_sheet_name = "mapping_data"
    metrics_output_folder = r"/data/emea_ar/output/metrics/"
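This is the same sheet rename applied in batch_start_job above: the metrics test now reads predictions from the data_in_doc_mapping sheet of the newer 51-document run, while the ground-truth workbook keeps its original mapping_data sheet.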
@@ -1015,9 +1041,9 @@ def batch_run_documents(
    page_filter_ground_truth_file = (
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
    )
-    re_run_extract_data = False
-    re_run_mapping_data = False
-    force_save_total_data = True
+    re_run_extract_data = True
+    re_run_mapping_data = True
+    force_save_total_data = False
    calculate_metrics = False
    extract_way = "text"
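Flipping re_run_extract_data and re_run_mapping_data to True forces fresh extraction and mapping passes instead of reusing per-document outputs cached on disk, and force_save_total_data = False skips rewriting the aggregated workbook. A sketch of the caching contract the re_run flags appear to follow (load_or_extract and the cache layout are illustrative stand-ins, not the repo's actual helper):

    import os
    import pandas as pd

    def load_or_extract(doc_id: str, cache_folder: str, re_run: bool = False) -> pd.DataFrame:
        cache_path = os.path.join(cache_folder, f"{doc_id}.xlsx")
        if not re_run and os.path.exists(cache_path):
            return pd.read_excel(cache_path)  # reuse the cached result
        df = pd.DataFrame()  # placeholder for the real extraction step
        df.to_excel(cache_path, index=False)  # refresh the cache
        return df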
@@ -1194,13 +1220,17 @@ def merge_output_data_aus_prospectus(
):
    # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
    data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
-    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
+    document_mapping_df = pd.read_excel(
+        document_mapping_file, sheet_name="document_mapping"
+    )
    # set doc_id to be string type
    data_df["doc_id"] = data_df["doc_id"].astype(str)
    document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
    doc_id_list = data_df["doc_id"].unique().tolist()
-    datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json"
+    datapoint_keyword_config_file = (
+        r"./configuration/aus_prospectus/datapoint_name.json"
+    )
    with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
        datapoint_keyword_config = json.load(f)
    datapoint_name_list = list(datapoint_keyword_config.keys())
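Only the keys of datapoint_name.json are consumed here, so each top-level key must be a datapoint name. A plausible shape, assuming keyword lists as values (the example entries are assumptions, not the real config):

    # datapoint_name.json is read with json.load; only its keys matter here.
    datapoint_keyword_config = {
        "management_fee": ["management fee", "management costs"],
        "performance_fee": ["performance fee"],
    }
    datapoint_name_list = list(datapoint_keyword_config.keys())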
@@ -1212,7 +1242,9 @@ def merge_output_data_aus_prospectus(
                "EffectiveDate"
            ].values[0]
        )[0:10]
-        share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
+        share_doc_data_df = data_df[
+            (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)
+        ]
        exist_raw_name_list = []
        for index, row in share_doc_data_df.iterrows():
            doc_id = str(row["doc_id"])
@@ -1228,7 +1260,9 @@ def merge_output_data_aus_prospectus(
            fund_id = ""
            fund_legal_name = ""
            if share_class_id != "":
-                record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
+                record_row = document_mapping_df[
+                    document_mapping_df["FundClassId"] == share_class_id
+                ]
                if len(record_row) > 0:
                    fund_id = record_row["FundId"].values[0]
                    fund_legal_name = record_row["FundLegalName"].values[0]
@@ -1265,16 +1299,16 @@ def merge_output_data_aus_prospectus(
                doc_data_list.append(data)
            # find data from total_data_list by raw_name
            for data in doc_data_list:
-                if (
-                    data["raw_name"] == raw_name
-                ):
+                if data["raw_name"] == raw_name:
                    update_key = datapoint
                    data[update_key] = value
                    if page_index not in data["page_index"]:
                        data["page_index"].append(page_index)
                    break
-        fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
+        fund_doc_data_df = data_df[
+            (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
+        ]
        for index, row in fund_doc_data_df.iterrows():
            doc_id = str(row["doc_id"])
            page_index = int(row["page_index"])
@@ -1285,12 +1319,13 @@ def merge_output_data_aus_prospectus(
            value = row["value"]
            fund_id = row["investment_id"]
            fund_legal_name = row["investment_name"]
            exist = False
            if fund_id != "":
                for data in doc_data_list:
-                    if (fund_id != "" and data["fund_id"] == fund_id) or \
-                        (data["raw_fund_name"] == raw_fund_name):
+                    if (fund_id != "" and data["fund_id"] == fund_id) or (
+                        data["raw_fund_name"] == raw_fund_name
+                    ):
                        update_key = datapoint
                        data[update_key] = value
                        if page_index not in data["page_index"]:
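Fund-level rows are matched back into doc_data_list by database id first, then by the raw fund name scraped from the document; the rewritten condition is equivalent to this predicate (matches is a hypothetical extraction of the inline test):

    def matches(data: dict, fund_id: str, raw_fund_name: str) -> bool:
        # Prefer the database id; fall back to the raw document name.
        return (fund_id != "" and data["fund_id"] == fund_id) or (
            data["raw_fund_name"] == raw_fund_name
        )

Note that the enclosing if fund_id != "": already guarantees a non-empty id, so the inner fund_id != "" check is redundant and could be dropped.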
@@ -1323,6 +1358,7 @@ def merge_output_data_aus_prospectus(
if __name__ == "__main__":
+    # test_data_extraction_metrics()
    # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
    # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
    # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
@@ -1347,13 +1383,19 @@ if __name__ == "__main__":
    # output_mapping_child_folder=output_mapping_child_folder)
    # special_doc_id_list = ["553242411"]
    doc_source = "aus_prospectus"
    if doc_source == "aus_prospectus":
-        document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
+        # document_sample_file = (
+        #     r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
+        # )
+        document_sample_file = (
+            r"./sample_documents/aus_prospectus_17_documents_sample.txt"
+        )
        with open(document_sample_file, "r", encoding="utf-8") as f:
            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
-        document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
+        # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
+        document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
        # special_doc_id_list: list = [
        #     "539790009",
        #     "542300403",
@@ -1367,7 +1409,7 @@ if __name__ == "__main__":
        #     "555377021",
        #     "555654388",
        # ]
-        # special_doc_id_list: list = ["534287518"]
+        special_doc_id_list: list = ["377377369"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
        output_extract_data_child_folder: str = (
@@ -1383,7 +1425,7 @@ if __name__ == "__main__":
            r"/data/aus_prospectus/output/mapping_data/total/"
        )
        drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
        batch_run_documents(
            doc_source=doc_source,
            special_doc_id_list=special_doc_id_list,
@@ -1397,7 +1439,61 @@ if __name__ == "__main__":
            drilldown_folder=drilldown_folder,
        )
    elif doc_source == "emea_ar":
-        special_doc_id_list = ["553242408"]
+        special_doc_id_list = [
+            "292989214",
+            "316237292",
+            "321733631",
+            "323390570",
+            "327956364",
+            "333207452",
+            "334718372",
+            "344636875",
+            "362246081",
+            "366179419",
+            "380945052",
+            "382366116",
+            "387202452",
+            "389171486",
+            "391456740",
+            "391736837",
+            "394778487",
+            "401684600",
+            "402113224",
+            "402181770",
+            "402397014",
+            "405803396",
+            "445102363",
+            "445256897",
+            "448265376",
+            "449555622",
+            "449623976",
+            "458291624",
+            "458359181",
+            "463081566",
+            "469138353",
+            "471641628",
+            "476492237",
+            "478585901",
+            "478586066",
+            "479042264",
+            "479793787",
+            "481475385",
+            "483617247",
+            "486378555",
+            "486383912",
+            "492121213",
+            "497497599",
+            "502693599",
+            "502821436",
+            "503194284",
+            "506559375",
+            "507967525",
+            "508854243",
+            "509845549",
+            "520879048",
+            "529925114",
+        ]
+        # special_doc_id_list = ["532438210"]
        batch_run_documents(
            doc_source=doc_source, special_doc_id_list=special_doc_id_list
        )
View File
@@ -0,0 +1,17 @@
+377377369
+397107472
+401212184
+409723592
+411062815
+412778803
+414751292
+462770987
+471206458
+391080133
+391080140
+410899007
+420339794
+441280757
+446324179
+454036250
+384508026
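These seventeen ids appear to be the new aus_prospectus_17_documents_sample.txt referenced by document_sample_file in the __main__ block above; the first id, 377377369, is also the one pinned in special_doc_id_list.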
View File
@@ -543,7 +543,7 @@ class PDFUtil:
        matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
        if len(matching_val_area) == 0:
            matching_val_area = page.search_for(text_block.replace('-\n', ''))
-        if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
+        if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3:
            new_matching_val_area = []
            for area in matching_val_area:
                # get text by text_bbox
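page.search_for comes from PyMuPDF and returns the rectangles where the needle occurs on the page; the change widens the extra bounding-box verification from single-word blocks to anything under three words, since short needles match too easily. A self-contained sketch of the lookup, with the re-check pass elided (find_block_areas is a hypothetical name):

    import fitz  # PyMuPDF

    def find_block_areas(page: fitz.Page, text_block: str) -> list:
        areas = page.search_for(text_block.replace("\n", "").replace("-", ""))
        if len(areas) == 0:
            areas = page.search_for(text_block.replace("-\n", ""))
        if len(areas) > 0 and len(text_block.strip().split()) < 3:
            # short blocks produce false positives, so each rectangle would
            # be re-checked against the text actually inside it (omitted)
            pass
        return areas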
View File
@@ -8,7 +8,7 @@ import dotenv
dotenv.load_dotenv()
-def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_ar/output/mapping/document/"):
+def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/document/"):
    count = 1
    while True:
        try:
@@ -27,10 +27,13 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
                by=["FundName", "ShareClassName"]
            ).reset_index(drop=True)
            if output_folder is not None and len(output_folder) > 0:
-                os.makedirs(output_folder, exist_ok=True)
-                output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
-                with pd.ExcelWriter(output_file) as writer:
-                    document_mapping_info_df.to_excel(writer, index=False)
+                try:
+                    os.makedirs(output_folder, exist_ok=True)
+                    output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
+                    with pd.ExcelWriter(output_file) as writer:
+                        document_mapping_info_df.to_excel(writer, index=False)
+                except:
+                    pass
            return document_mapping_info_df
        except Exception as e:
            print(e)
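Persisting the mapping to Excel is now best-effort: a locked file or missing permissions no longer aborts the query, and the DataFrame is returned regardless. The same pattern sketched as a helper (cache_mapping is hypothetical); note that except Exception: would be the safer spelling, since a bare except also swallows KeyboardInterrupt and SystemExit:

    import os
    import pandas as pd

    def cache_mapping(df: pd.DataFrame, output_folder: str, doc_id: str) -> None:
        try:
            os.makedirs(output_folder, exist_ok=True)
            output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
            with pd.ExcelWriter(output_file) as writer:
                df.to_excel(writer, index=False)
        except Exception:
            pass  # caching is an optimization; callers still get the DataFrame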
@@ -40,7 +43,7 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
            count += 1
-def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/data/emea_ar/output/mapping/provider/"):
+def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/provider/"):
    count = 1
    while True:
        try:
@@ -59,10 +62,13 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
                .sort_values(by=['FundName', 'ShareClassName']) \
                .reset_index(drop=True)
            if output_folder is not None and len(output_folder) > 0:
-                os.makedirs(output_folder, exist_ok=True)
-                output_file = os.path.join(output_folder, f"{company_id}.xlsx")
-                with pd.ExcelWriter(output_file) as writer:
-                    investment_by_provider_df.to_excel(writer, index=False)
+                try:
+                    os.makedirs(output_folder, exist_ok=True)
+                    output_file = os.path.join(output_folder, f"{company_id}.xlsx")
+                    with pd.ExcelWriter(output_file) as writer:
+                        investment_by_provider_df.to_excel(writer, index=False)
+                except:
+                    pass
            return investment_by_provider_df
        except Exception as e:
            print(e)
@@ -73,7 +79,7 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
def query_data_by_biz_type(biztype: str, para, return_df: bool):
-    sqlpass_url = "https://api.morningstar.com/sqlpassapi/v1/sql"
+    sqlpass_url = os.getenv("SQL_PASS_URL")
    url = sqlpass_url + "?sqlName={0}&params={1}".format(biztype, str(para))
    headers = {"ApiKey": os.getenv("SQL_PASS_KEY")}
    if return_df:
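With this change both the SQLPass endpoint and the API key come from the environment, which dotenv.load_dotenv() populates from a local .env file. A minimal sketch of the configuration and a JSON call path, assuming a plain GET as above (query_sqlpass is a hypothetical name and the URL value is a placeholder, not the real endpoint):

    # .env (placeholder values):
    #   SQL_PASS_URL=https://example.invalid/sqlpassapi/v1/sql
    #   SQL_PASS_KEY=<your key>
    import os
    import dotenv
    import requests

    dotenv.load_dotenv()

    def query_sqlpass(biztype: str, para) -> dict:
        url = os.getenv("SQL_PASS_URL") + "?sqlName={0}&params={1}".format(biztype, str(para))
        response = requests.get(url, headers={"ApiKey": os.getenv("SQL_PASS_KEY")})
        response.raise_for_status()  # surface HTTP errors instead of silent failures
        return response.json()

Keeping the URL out of the source also means test and production endpoints can be swapped per environment without a code change.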