add configuration for datapoints data types

update configuration for minimum initial investment
support applying the value to all funds for minimum initial investment
This commit is contained in:
Blade He 2025-02-05 12:08:12 -06:00
parent a8810519f8
commit 01e2a0e38d
9 changed files with 135 additions and 15 deletions

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]}, "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]}, "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
} }

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": "share_level", "date_of_last_hwm_reset": "share_level",
"date_of_last_performance_fee_restructure": "share_level", "date_of_last_performance_fee_restructure": "share_level",
"high_water_mark_type": "share_level", "high_water_mark_type": "share_level",
"minimum_initial_investment": "share_level", "minimum_initial_investment": "fund_level",
"recoverable_expenses": "share_level", "recoverable_expenses": "share_level",
"indirect_costs": "share_level" "indirect_costs": "share_level"
} }

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]}, "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]}, "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]}, "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]}, "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
} }

View File

@ -0,0 +1,27 @@
{
"total_annual_dollar_based_charges": "float",
"management_fee_and_costs": "float",
"management_fee": "float",
"performance_fee": "float",
"performance_fee_costs": "float",
"buy_spread": "float",
"sell_spread": "float",
"establishment_fee": "float",
"contribution_fee": "float",
"withdrawal_fee": "float",
"switching_fee": "float",
"activity_fee": "float",
"exit_fee": "float",
"administration_fees": "float",
"interposed_vehicle_performance_fee_cost": "float",
"additional_hurdle": "text",
"benchmark_name": "text",
"reference_rate": "float",
"crystallisation_frequency": "text",
"date_of_last_hwm_reset": "text",
"date_of_last_performance_fee_restructure": "text",
"high_water_mark_type": "text",
"minimum_initial_investment": "integer",
"recoverable_expenses": "float",
"indirect_costs": "float"
}

View File

@ -0,0 +1,6 @@
{
"tor": "float",
"ogc": "float",
"ter": "float",
"performance_fee": "float"
}

View File

@ -185,9 +185,64 @@ class DataExtraction:
data_list = self.extract_data_by_image() data_list = self.extract_data_by_image()
else: else:
data_list = self.extract_data_by_text() data_list = self.extract_data_by_text()
if self.doc_source == "aus_prospectus":
data_list = self.post_supplement_data(data_list)
# data_list = remove_abundant_data(data_list) # data_list = remove_abundant_data(data_list)
self.output_data_to_file(data_list) self.output_data_to_file(data_list)
return data_list return data_list
def post_supplement_data(self, data_list: list) -> list:
    """Propagate a fund-level ``minimum_initial_investment`` to every fund.

    In an AUS prospectus the minimum initial investment is fund-level data
    that is usually stated once, so the first extracted value is applied to
    every fund name seen anywhere in *data_list*.

    Each element of *data_list* is a page-level result dict shaped like::

        {
            "doc_id": ..., "page_index": ..., "datapoints": ...,
            "extract_data": {"data": [{"fund_name": ..., <datapoint>: ...}]},
            ...
        }

    :param data_list: page-level extraction results for one document.
    :return: the same list; the page that contained the value is rewritten
        so that every fund carries the same ``minimum_initial_investment``.
    """
    mii_value = -1
    mii_fund_name = ""
    mii_dict = None
    # Locate the first extracted minimum_initial_investment; one occurrence
    # is enough because the datapoint is fund-level.
    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            if "minimum_initial_investment" in data_item:
                mii_value = data_item.get("minimum_initial_investment", -1)
                mii_fund_name = data_item.get("fund_name", "")
                mii_dict = data_dict
                break
        if mii_dict is not None:
            break
    if mii_dict is None or mii_value == -1:
        return data_list
    # Collect every distinct, non-empty fund name in the document
    # (excluding the fund the value was found on, which is re-added first).
    fund_name_list = []
    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            fund_name = data_item.get("fund_name", "")
            if (
                len(fund_name) > 0
                and fund_name != mii_fund_name
                and fund_name not in fund_name_list
            ):
                fund_name_list.append(fund_name)
    # Rewrite the source page so each fund carries the same value.
    # Bug fix: keep the originating fund's entry as well — the previous
    # code overwrote the page data with only the *other* funds, losing the
    # datapoint for the very fund it was extracted from.
    new_mii_data_list = [
        {"fund_name": mii_fund_name, "minimum_initial_investment": mii_value}
    ]
    for fund_name in fund_name_list:
        new_mii_data_list.append(
            {"fund_name": fund_name, "minimum_initial_investment": mii_value}
        )
    # setdefault guards against a page dict that lacks "extract_data";
    # the reads above already tolerated that case via .get().
    mii_dict.setdefault("extract_data", {})["data"] = new_mii_data_list
    return data_list
def extract_data_by_text(self) -> dict: def extract_data_by_text(self) -> dict:
""" """

View File

@ -23,6 +23,7 @@ class FilterPages:
self.pdf_file = pdf_file self.pdf_file = pdf_file
self.output_pdf_text_folder = output_pdf_text_folder self.output_pdf_text_folder = output_pdf_text_folder
self.configuration_folder = f"./configuration/{doc_source}/" self.configuration_folder = f"./configuration/{doc_source}/"
self.doc_source = doc_source
misc_config_file = os.path.join(self.configuration_folder, "misc_config.json") misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
if os.path.exists(misc_config_file): if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as file: with open(misc_config_file, "r", encoding="utf-8") as file:
@ -119,6 +120,7 @@ class FilterPages:
domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json") domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json")
datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json") datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json")
datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json") datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json")
datapoint_type_config_file = os.path.join(self.configuration_folder, "datapoint_type.json")
with open(language_config_file, "r", encoding="utf-8") as file: with open(language_config_file, "r", encoding="utf-8") as file:
self.language_config = json.load(file) self.language_config = json.load(file)
@ -130,6 +132,10 @@ class FilterPages:
datapoint_exclude_keywords_config_file, "r", encoding="utf-8" datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
) as file: ) as file:
self.datapoint_exclude_keywords_config = json.load(file) self.datapoint_exclude_keywords_config = json.load(file)
with open(
datapoint_type_config_file, "r", encoding="utf-8"
) as file:
self.datapoint_type_config = json.load(file)
def get_doc_info(self) -> dict: def get_doc_info(self) -> dict:
if len(self.document_mapping_info_df) == 0: if len(self.document_mapping_info_df) == 0:
@ -224,7 +230,8 @@ class FilterPages:
if page_index < 2: if page_index < 2:
continue continue
page_num = page_index + 1 page_num = page_index + 1
if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and page_num not in self.document_dp_pages: if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and \
page_num not in self.document_dp_pages:
continue continue
page_text = clean_text(page_text) page_text = clean_text(page_text)
@ -237,7 +244,8 @@ class FilterPages:
language = self.doc_info.get("language", None) language = self.doc_info.get("language", None)
if language is None: if language is None:
language = "english" language = "english"
if language == "english" and re.search(self.percentage_regex, text) is None: if self.doc_source == "emea_ar" and language == "english" and \
re.search(self.percentage_regex, text) is None:
continue continue
for datapoint, keywords in self.datapoint_config.items(): for datapoint, keywords in self.datapoint_config.items():
find_datapoint = False find_datapoint = False
@ -257,10 +265,12 @@ class FilterPages:
break break
if need_exclude: if need_exclude:
continue continue
is_valid = True
is_valid = self.search_in_sentence_is_valid(search_text, text) data_type = self.datapoint_type_config.get(datapoint, "float")
if not is_valid: if data_type == "float":
continue is_valid = self.search_in_sentence_is_valid(search_text, text)
if not is_valid:
continue
result[datapoint].append(page_index) result[datapoint].append(page_index)
detail = { detail = {
"doc_id": self.doc_id, "doc_id": self.doc_id,

View File

@ -136,21 +136,35 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
"\n", "\n",
"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---", "---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
"---Example 1 End---", "---Example 1 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]", "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]",
"---Example 2 Start---", "---Example 2 Start---",
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n", "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
"---Example 2 End---", "---Example 2 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]" "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]"
], ],
"buy_spread": [ "buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
],
"minimum_initial_investment": [
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
"---Example 1 Start---",
"The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"share name\": \"\", \"minimum_initial_investment\": 5000}]",
"\n",
"---Example 2 Start---",
"Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Prime Super\", \"share name\": \"\", \"minimum_initial_investment\": 10000}]"
] ]
} }
}, },

14
main.py
View File

@ -1,5 +1,6 @@
import os import os
import json import json
import numpy as np
import pandas as pd import pandas as pd
from glob import glob from glob import glob
from tqdm import tqdm from tqdm import tqdm
@ -1043,7 +1044,7 @@ def batch_run_documents(
) )
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = True
calculate_metrics = False calculate_metrics = False
extract_way = "text" extract_way = "text"
@ -1309,6 +1310,7 @@ def merge_output_data_aus_prospectus(
fund_doc_data_df = data_df[ fund_doc_data_df = data_df[
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33) (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
] ]
fund_doc_data_df.fillna("", inplace=True)
for index, row in fund_doc_data_df.iterrows(): for index, row in fund_doc_data_df.iterrows():
doc_id = str(row["doc_id"]) doc_id = str(row["doc_id"])
page_index = int(row["page_index"]) page_index = int(row["page_index"])
@ -1319,7 +1321,6 @@ def merge_output_data_aus_prospectus(
value = row["value"] value = row["value"]
fund_id = row["investment_id"] fund_id = row["investment_id"]
fund_legal_name = row["investment_name"] fund_legal_name = row["investment_name"]
exist = False exist = False
if fund_id != "": if fund_id != "":
for data in doc_data_list: for data in doc_data_list:
@ -1331,7 +1332,14 @@ def merge_output_data_aus_prospectus(
if page_index not in data["page_index"]: if page_index not in data["page_index"]:
data["page_index"].append(page_index) data["page_index"].append(page_index)
exist = True exist = True
else:
for data in doc_data_list:
if data["raw_name"] == raw_name:
update_key = datapoint
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
exist = True
if not exist: if not exist:
data = { data = {
"DocumentId": doc_id, "DocumentId": doc_id,