add configuration for datapoints data types

update configuration for minimum initial investment
support applying the value to all funds for minimum initial investment
This commit is contained in:
Blade He 2025-02-05 12:08:12 -06:00
parent a8810519f8
commit 01e2a0e38d
9 changed files with 135 additions and 15 deletions

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]}, "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]}, "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
} }

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": "share_level", "date_of_last_hwm_reset": "share_level",
"date_of_last_performance_fee_restructure": "share_level", "date_of_last_performance_fee_restructure": "share_level",
"high_water_mark_type": "share_level", "high_water_mark_type": "share_level",
"minimum_initial_investment": "share_level", "minimum_initial_investment": "fund_level",
"recoverable_expenses": "share_level", "recoverable_expenses": "share_level",
"indirect_costs": "share_level" "indirect_costs": "share_level"
} }

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]}, "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]}, "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
} }

View File

@ -0,0 +1,27 @@
{
"total_annual_dollar_based_charges": "float",
"management_fee_and_costs": "float",
"management_fee": "float",
"performance_fee": "float",
"performance_fee_costs": "float",
"buy_spread": "float",
"sell_spread": "float",
"establishment_fee": "float",
"contribution_fee": "float",
"withdrawal_fee": "float",
"switching_fee": "float",
"activity_fee": "float",
"exit_fee": "float",
"administration_fees": "float",
"interposed_vehicle_performance_fee_cost": "float",
"additional_hurdle": "text",
"benchmark_name": "text",
"reference_rate": "float",
"crystallisation_frequency": "text",
"date_of_last_hwm_reset": "text",
"date_of_last_performance_fee_restructure": "text",
"high_water_mark_type": "text",
"minimum_initial_investment": "integer",
"recoverable_expenses": "float",
"indirect_costs": "float"
}

View File

@ -0,0 +1,6 @@
{
"tor": "float",
"ogc": "float",
"ter": "float",
"performance_fee": "float"
}

View File

@ -185,9 +185,64 @@ class DataExtraction:
data_list = self.extract_data_by_image() data_list = self.extract_data_by_image()
else: else:
data_list = self.extract_data_by_text() data_list = self.extract_data_by_text()
if self.doc_source == "aus_prospectus":
data_list = self.post_supplement_data(data_list)
# data_list = remove_abundant_data(data_list) # data_list = remove_abundant_data(data_list)
self.output_data_to_file(data_list) self.output_data_to_file(data_list)
return data_list return data_list
def post_supplement_data(self, data_list: list) -> list:
    """Propagate a fund-level ``minimum_initial_investment`` to every fund.

    In an AUS prospectus the minimum initial investment is fund-level data
    that is usually stated once, so the first extracted value is applied to
    every fund name seen anywhere in *data_list*.

    Each element of *data_list* is a page-level result dict shaped like::

        {
            "doc_id": ..., "page_index": ..., "datapoints": ...,
            "extract_data": {"data": [{"fund_name": ..., <datapoint>: ...}]},
            ...
        }

    :param data_list: page-level extraction results for one document.
    :return: the same list; the page that contained the value is rewritten
        so that every fund carries the same ``minimum_initial_investment``.
    """
    mii_value = -1
    mii_fund_name = ""
    mii_dict = None
    # Locate the first extracted minimum_initial_investment; one occurrence
    # is enough because the datapoint is fund-level.
    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            if "minimum_initial_investment" in data_item:
                mii_value = data_item.get("minimum_initial_investment", -1)
                mii_fund_name = data_item.get("fund_name", "")
                mii_dict = data_dict
                break
        if mii_dict is not None:
            break
    if mii_dict is None or mii_value == -1:
        return data_list
    # Collect every distinct, non-empty fund name in the document
    # (excluding the fund the value was found on, which is re-added first).
    fund_name_list = []
    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            fund_name = data_item.get("fund_name", "")
            if (
                len(fund_name) > 0
                and fund_name != mii_fund_name
                and fund_name not in fund_name_list
            ):
                fund_name_list.append(fund_name)
    # Rewrite the source page so each fund carries the same value.
    # Bug fix: keep the originating fund's entry as well — the previous
    # code overwrote the page data with only the *other* funds, losing the
    # datapoint for the very fund it was extracted from.
    new_mii_data_list = [
        {"fund_name": mii_fund_name, "minimum_initial_investment": mii_value}
    ]
    for fund_name in fund_name_list:
        new_mii_data_list.append(
            {"fund_name": fund_name, "minimum_initial_investment": mii_value}
        )
    # setdefault guards against a page dict that lacks "extract_data";
    # the reads above already tolerated that case via .get().
    mii_dict.setdefault("extract_data", {})["data"] = new_mii_data_list
    return data_list
def extract_data_by_text(self) -> dict: def extract_data_by_text(self) -> dict:
""" """

View File

@ -23,6 +23,7 @@ class FilterPages:
self.pdf_file = pdf_file self.pdf_file = pdf_file
self.output_pdf_text_folder = output_pdf_text_folder self.output_pdf_text_folder = output_pdf_text_folder
self.configuration_folder = f"./configuration/{doc_source}/" self.configuration_folder = f"./configuration/{doc_source}/"
self.doc_source = doc_source
misc_config_file = os.path.join(self.configuration_folder, "misc_config.json") misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
if os.path.exists(misc_config_file): if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as file: with open(misc_config_file, "r", encoding="utf-8") as file:
@ -119,6 +120,7 @@ class FilterPages:
domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json") domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json")
datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json") datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json")
datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json") datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json")
datapoint_type_config_file = os.path.join(self.configuration_folder, "datapoint_type.json")
with open(language_config_file, "r", encoding="utf-8") as file: with open(language_config_file, "r", encoding="utf-8") as file:
self.language_config = json.load(file) self.language_config = json.load(file)
@ -130,6 +132,10 @@ class FilterPages:
datapoint_exclude_keywords_config_file, "r", encoding="utf-8" datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
) as file: ) as file:
self.datapoint_exclude_keywords_config = json.load(file) self.datapoint_exclude_keywords_config = json.load(file)
with open(
datapoint_type_config_file, "r", encoding="utf-8"
) as file:
self.datapoint_type_config = json.load(file)
def get_doc_info(self) -> dict: def get_doc_info(self) -> dict:
if len(self.document_mapping_info_df) == 0: if len(self.document_mapping_info_df) == 0:
@ -224,7 +230,8 @@ class FilterPages:
if page_index < 2: if page_index < 2:
continue continue
page_num = page_index + 1 page_num = page_index + 1
if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and page_num not in self.document_dp_pages: if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and \
page_num not in self.document_dp_pages:
continue continue
page_text = clean_text(page_text) page_text = clean_text(page_text)
@ -237,7 +244,8 @@ class FilterPages:
language = self.doc_info.get("language", None) language = self.doc_info.get("language", None)
if language is None: if language is None:
language = "english" language = "english"
if language == "english" and re.search(self.percentage_regex, text) is None: if self.doc_source == "emea_ar" and language == "english" and \
re.search(self.percentage_regex, text) is None:
continue continue
for datapoint, keywords in self.datapoint_config.items(): for datapoint, keywords in self.datapoint_config.items():
find_datapoint = False find_datapoint = False
@ -257,10 +265,12 @@ class FilterPages:
break break
if need_exclude: if need_exclude:
continue continue
is_valid = True
is_valid = self.search_in_sentence_is_valid(search_text, text) data_type = self.datapoint_type_config.get(datapoint, "float")
if not is_valid: if data_type == "float":
continue is_valid = self.search_in_sentence_is_valid(search_text, text)
if not is_valid:
continue
result[datapoint].append(page_index) result[datapoint].append(page_index)
detail = { detail = {
"doc_id": self.doc_id, "doc_id": self.doc_id,

View File

@ -136,21 +136,35 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
"\n", "\n",
"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---", "---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
"---Example 1 End---", "---Example 1 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]", "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]",
"---Example 2 Start---", "---Example 2 Start---",
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n", "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
"---Example 2 End---", "---Example 2 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]" "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]"
], ],
"buy_spread": [ "buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
],
"minimum_initial_investment": [
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
"---Example 1 Start---",
"The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"share name\": \"\", \"minimum_initial_investment\": 5000}]",
"\n",
"---Example 2 Start---",
"Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Prime Super\", \"share name\": \"\", \"minimum_initial_investment\": 10000}]"
] ]
} }
}, },

14
main.py
View File

@ -1,5 +1,6 @@
import os import os
import json import json
import numpy as np
import pandas as pd import pandas as pd
from glob import glob from glob import glob
from tqdm import tqdm from tqdm import tqdm
@ -1043,7 +1044,7 @@ def batch_run_documents(
) )
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = True
calculate_metrics = False calculate_metrics = False
extract_way = "text" extract_way = "text"
@ -1309,6 +1310,7 @@ def merge_output_data_aus_prospectus(
fund_doc_data_df = data_df[ fund_doc_data_df = data_df[
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33) (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
] ]
fund_doc_data_df.fillna("", inplace=True)
for index, row in fund_doc_data_df.iterrows(): for index, row in fund_doc_data_df.iterrows():
doc_id = str(row["doc_id"]) doc_id = str(row["doc_id"])
page_index = int(row["page_index"]) page_index = int(row["page_index"])
@ -1319,7 +1321,6 @@ def merge_output_data_aus_prospectus(
value = row["value"] value = row["value"]
fund_id = row["investment_id"] fund_id = row["investment_id"]
fund_legal_name = row["investment_name"] fund_legal_name = row["investment_name"]
exist = False exist = False
if fund_id != "": if fund_id != "":
for data in doc_data_list: for data in doc_data_list:
@ -1331,7 +1332,14 @@ def merge_output_data_aus_prospectus(
if page_index not in data["page_index"]: if page_index not in data["page_index"]:
data["page_index"].append(page_index) data["page_index"].append(page_index)
exist = True exist = True
else:
for data in doc_data_list:
if data["raw_name"] == raw_name:
update_key = datapoint
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
exist = True
if not exist: if not exist:
data = { data = {
"DocumentId": doc_id, "DocumentId": doc_id,