diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index f8773c8..696a982 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -21,7 +21,7 @@ "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]}, "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]}, "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, - "minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, + "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_level.json b/configuration/aus_prospectus/datapoint_level.json index 9847066..5453133 100644 --- a/configuration/aus_prospectus/datapoint_level.json +++ b/configuration/aus_prospectus/datapoint_level.json @@ -21,7 +21,7 @@ "date_of_last_hwm_reset": "share_level", "date_of_last_performance_fee_restructure": "share_level", "high_water_mark_type": "share_level", - "minimum_initial_investment": "share_level", + "minimum_initial_investment": "fund_level", "recoverable_expenses": "share_level", "indirect_costs": "share_level" } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index c119485..115122a 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ b/configuration/aus_prospectus/datapoint_reported_name.json @@ -21,7 +21,7 @@ "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]}, 
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]}, "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, - "minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, + "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_type.json b/configuration/aus_prospectus/datapoint_type.json new file mode 100644 index 0000000..fa841b9 --- /dev/null +++ b/configuration/aus_prospectus/datapoint_type.json @@ -0,0 +1,27 @@ +{ + "total_annual_dollar_based_charges": "float", + "management_fee_and_costs": "float", + "management_fee": "float", + "performance_fee": "float", + "performance_fee_costs": "float", + "buy_spread": "float", + "sell_spread": "float", + "establishment_fee": "float", + "contribution_fee": "float", + "withdrawal_fee": "float", + "switching_fee": "float", + "activity_fee": "float", + "exit_fee": "float", + "administration_fees": "float", + "interposed_vehicle_performance_fee_cost": "float", + "additional_hurdle": "text", + "benchmark_name": "text", + "reference_rate": "float", + "crystallisation_frequency": "text", + "date_of_last_hwm_reset": "text", + "date_of_last_performance_fee_restructure": "text", + "high_water_mark_type": "text", + "minimum_initial_investment": "integer", + "recoverable_expenses": "float", + "indirect_costs": "float" +} \ No newline at end of file diff --git a/configuration/emea_ar/datapoint_type.json b/configuration/emea_ar/datapoint_type.json new file mode 100644 index 0000000..2f729c8 --- /dev/null +++ b/configuration/emea_ar/datapoint_type.json @@ -0,0 +1,6 @@ +{ + 
def post_supplement_data(self, data_list: list) -> list:
    """Copy the first minimum initial investment onto every other fund.

    Each element of ``data_list`` is a per-page dict.  According to the
    notes kept with the original implementation its keys are ``doc_id``,
    ``page_index``, ``datapoints``, ``page_text``, ``instructions``,
    ``raw_answer``, ``extract_data``, ``extract_way``, ``prompt_token``,
    ``completion_token`` and ``total_token``.  Only
    ``data_dict["extract_data"]["data"]`` (a list of item dicts) is read
    or written here.

    The first item, in list order, that carries a
    ``"minimum_initial_investment"`` key supplies the value and the name
    it was reported against.  Every other distinct, non-empty
    ``"fund_name"`` found anywhere in ``data_list`` then gets its own
    ``{"fund_name": ..., "minimum_initial_investment": ...}`` entry on the
    page where the value was found; those entries replace the
    minimum-initial-investment items of that page.

    Compared with blindly overwriting that page's data list:

    * items on that page without the key are kept, so unrelated data
      points extracted from the same page are not lost;
    * when no other fund name exists the page is left untouched, so the
      only extracted value is not erased;
    * a missing/``None`` ``extract_data``, ``data`` or ``fund_name`` is
      skipped instead of raising.

    Args:
        data_list: Per-page extraction results; mutated in place.

    Returns:
        The same ``data_list`` object.
    """
    mii_key = "minimum_initial_investment"

    def raw_items(page: dict) -> list:
        # Tolerate pages whose "extract_data" / "data" is absent or None.
        extract_data = page.get("extract_data")
        if not isinstance(extract_data, dict):
            return []
        items = extract_data.get("data")
        return items if isinstance(items, list) else []

    # Locate the first item that reports a minimum initial investment.
    mii_page = None
    mii_value = -1
    mii_fund_name = ""
    for page in data_list:
        source = next(
            (
                item
                for item in raw_items(page)
                if isinstance(item, dict) and mii_key in item
            ),
            None,
        )
        if source is not None:
            mii_page = page
            mii_value = source.get(mii_key, -1)
            mii_fund_name = source.get("fund_name", "")
            break

    # -1 is the "no usable value" sentinel.
    if mii_page is None or mii_value == -1:
        return data_list

    # Collect every other fund name once, keeping first-seen order
    # (dict keys preserve insertion order).
    other_funds = {}
    for page in data_list:
        for item in raw_items(page):
            if not isinstance(item, dict):
                continue
            fund_name = item.get("fund_name")
            if (
                isinstance(fund_name, str)
                and fund_name
                and fund_name != mii_fund_name
            ):
                other_funds.setdefault(fund_name, None)

    # Nothing to fan out to: keep the original entry rather than wiping it.
    if not other_funds:
        return data_list

    # Keep the unrelated items of the page, swap the reported
    # minimum-initial-investment items for one entry per fund.
    kept_items = [
        item
        for item in raw_items(mii_page)
        if not (isinstance(item, dict) and mii_key in item)
    ]
    fanned_out = [
        {"fund_name": fund_name, mii_key: mii_value}
        for fund_name in other_funds
    ]
    mii_page["extract_data"]["data"] = kept_items + fanned_out
    return data_list
self.datapoint_type_config = json.load(file) def get_doc_info(self) -> dict: if len(self.document_mapping_info_df) == 0: @@ -224,7 +230,8 @@ class FilterPages: if page_index < 2: continue page_num = page_index + 1 - if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and page_num not in self.document_dp_pages: + if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and \ + page_num not in self.document_dp_pages: continue page_text = clean_text(page_text) @@ -237,7 +244,8 @@ class FilterPages: language = self.doc_info.get("language", None) if language is None: language = "english" - if language == "english" and re.search(self.percentage_regex, text) is None: + if self.doc_source == "emea_ar" and language == "english" and \ + re.search(self.percentage_regex, text) is None: continue for datapoint, keywords in self.datapoint_config.items(): find_datapoint = False @@ -257,10 +265,12 @@ class FilterPages: break if need_exclude: continue - - is_valid = self.search_in_sentence_is_valid(search_text, text) - if not is_valid: - continue + is_valid = True + data_type = self.datapoint_type_config.get(datapoint, "float") + if data_type == "float": + is_valid = self.search_in_sentence_is_valid(search_text, text) + if not is_valid: + continue result[datapoint].append(page_index) detail = { "doc_id": self.doc_id, diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index ccb7903..12025ce 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -136,21 +136,35 @@ "The output should be:", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]", "\n", - "C. 
If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".", + "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", "---Example 1 End---", "The output should be:", - "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]", + "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]", "---Example 2 Start---", "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n", "---Example 2 End---", "The output should be:", - "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]" + "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]" ], "buy_spread": [ "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" + ], + "minimum_initial_investment": [ + "Minimum 
initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", + "---Example 1 Start---", + "The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan", + "---Example 1 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"share name\": \"\", \"minimum_initial_investment\": 5000}]", + "\n", + "---Example 2 Start---", + "Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.", + "---Example 2 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"Prime Super\", \"share name\": \"\", \"minimum_initial_investment\": 10000}]" ] } }, diff --git a/main.py b/main.py index f6dc138..4074d4c 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ import os import json +import numpy as np import pandas as pd from glob import glob from tqdm import tqdm @@ -1043,7 +1044,7 @@ def batch_run_documents( ) re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True calculate_metrics = False extract_way = "text" @@ -1309,6 +1310,7 @@ def merge_output_data_aus_prospectus( fund_doc_data_df = data_df[ (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33) ] + fund_doc_data_df.fillna("", inplace=True) for index, row in fund_doc_data_df.iterrows(): doc_id = str(row["doc_id"]) page_index = int(row["page_index"]) @@ -1319,7 +1321,6 @@ def merge_output_data_aus_prospectus( value = row["value"] fund_id = row["investment_id"] 
fund_legal_name = row["investment_name"] - exist = False if fund_id != "": for data in doc_data_list: @@ -1331,7 +1332,14 @@ def merge_output_data_aus_prospectus( if page_index not in data["page_index"]: data["page_index"].append(page_index) exist = True - + else: + for data in doc_data_list: + if data["raw_name"] == raw_name: + update_key = datapoint + data[update_key] = value + if page_index not in data["page_index"]: + data["page_index"].append(page_index) + exist = True if not exist: data = { "DocumentId": doc_id,