add configuration for datapoints data types

update configuration for minimum initial investment
support applying the extracted value to all funds for minimum initial investment
This commit is contained in:
Blade He 2025-02-05 12:08:12 -06:00
parent a8810519f8
commit 01e2a0e38d
9 changed files with 135 additions and 15 deletions

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
}

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": "share_level",
"date_of_last_performance_fee_restructure": "share_level",
"high_water_mark_type": "share_level",
"minimum_initial_investment": "share_level",
"minimum_initial_investment": "fund_level",
"recoverable_expenses": "share_level",
"indirect_costs": "share_level"
}

View File

@ -21,7 +21,7 @@
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]},
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
}

View File

@ -0,0 +1,27 @@
{
"total_annual_dollar_based_charges": "float",
"management_fee_and_costs": "float",
"management_fee": "float",
"performance_fee": "float",
"performance_fee_costs": "float",
"buy_spread": "float",
"sell_spread": "float",
"establishment_fee": "float",
"contribution_fee": "float",
"withdrawal_fee": "float",
"switching_fee": "float",
"activity_fee": "float",
"exit_fee": "float",
"administration_fees": "float",
"interposed_vehicle_performance_fee_cost": "float",
"additional_hurdle": "text",
"benchmark_name": "text",
"reference_rate": "float",
"crystallisation_frequency": "text",
"date_of_last_hwm_reset": "text",
"date_of_last_performance_fee_restructure": "text",
"high_water_mark_type": "text",
"minimum_initial_investment": "integer",
"recoverable_expenses": "float",
"indirect_costs": "float"
}

View File

@ -0,0 +1,6 @@
{
"tor": "float",
"ogc": "float",
"ter": "float",
"performance_fee": "float"
}

View File

@ -185,10 +185,65 @@ class DataExtraction:
data_list = self.extract_data_by_image()
else:
data_list = self.extract_data_by_text()
if self.doc_source == "aus_prospectus":
data_list = self.post_supplement_data(data_list)
# data_list = remove_abundant_data(data_list)
self.output_data_to_file(data_list)
return data_list
def post_supplement_data(self, data_list: list) -> list:
    """Propagate a fund-level minimum_initial_investment to every fund.

    Each element of ``data_list`` is a page-level result dict whose
    ``extract_data`` entry holds ``{"data": [{datapoint: value, ...}, ...]}``
    (other keys such as ``doc_id``, ``page_index``, token counts, etc. are
    left untouched).

    Because minimum_initial_investment is configured as fund-level data for
    aus_prospectus documents, a single extracted value is assumed to apply to
    all funds in the document: the first occurrence is located, then the page
    dict that carried it is rewritten so that every distinct fund name seen
    anywhere in the document receives that same value.

    Bug fix vs. the previous version: the rewritten data list now keeps the
    original extracted item, so the source fund's own value is no longer
    dropped (and a single-fund document no longer has its data wiped).

    Args:
        data_list: page-level extraction dicts; mutated in place.

    Returns:
        The same list, with the carrying page's data rewritten when a
        usable minimum_initial_investment value was found.
    """
    mii_value = -1
    mii_fund_name = ""
    mii_item = None
    mii_dict = None
    # Locate the first extracted minimum_initial_investment entry.
    for page_dict in data_list:
        for item in page_dict.get("extract_data", {}).get("data", []):
            if "minimum_initial_investment" in item:
                mii_value = item.get("minimum_initial_investment", -1)
                mii_fund_name = item.get("fund_name", "")
                mii_item = item
                mii_dict = page_dict
                break
        if mii_item is not None:
            break
    # Nothing to propagate: no entry found, or the sentinel default value.
    if mii_item is None or mii_value == -1:
        return data_list
    # Collect every distinct fund name in the document, other than the one
    # that already carries the value, preserving first-seen order.
    fund_names = []
    for page_dict in data_list:
        for item in page_dict.get("extract_data", {}).get("data", []):
            fund_name = item.get("fund_name", "")
            if fund_name and fund_name != mii_fund_name and fund_name not in fund_names:
                fund_names.append(fund_name)
    # Rewrite the carrying page: keep the original item, then add one entry
    # per remaining fund with the same value.
    new_data = [mii_item]
    for fund_name in fund_names:
        new_data.append(
            {"fund_name": fund_name, "minimum_initial_investment": mii_value}
        )
    mii_dict["extract_data"]["data"] = new_data
    return data_list
def extract_data_by_text(self) -> dict:
"""
keys are

View File

@ -23,6 +23,7 @@ class FilterPages:
self.pdf_file = pdf_file
self.output_pdf_text_folder = output_pdf_text_folder
self.configuration_folder = f"./configuration/{doc_source}/"
self.doc_source = doc_source
misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as file:
@ -119,6 +120,7 @@ class FilterPages:
domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json")
datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json")
datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json")
datapoint_type_config_file = os.path.join(self.configuration_folder, "datapoint_type.json")
with open(language_config_file, "r", encoding="utf-8") as file:
self.language_config = json.load(file)
@ -130,6 +132,10 @@ class FilterPages:
datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
) as file:
self.datapoint_exclude_keywords_config = json.load(file)
with open(
datapoint_type_config_file, "r", encoding="utf-8"
) as file:
self.datapoint_type_config = json.load(file)
def get_doc_info(self) -> dict:
if len(self.document_mapping_info_df) == 0:
@ -224,7 +230,8 @@ class FilterPages:
if page_index < 2:
continue
page_num = page_index + 1
if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and page_num not in self.document_dp_pages:
if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and \
page_num not in self.document_dp_pages:
continue
page_text = clean_text(page_text)
@ -237,7 +244,8 @@ class FilterPages:
language = self.doc_info.get("language", None)
if language is None:
language = "english"
if language == "english" and re.search(self.percentage_regex, text) is None:
if self.doc_source == "emea_ar" and language == "english" and \
re.search(self.percentage_regex, text) is None:
continue
for datapoint, keywords in self.datapoint_config.items():
find_datapoint = False
@ -257,10 +265,12 @@ class FilterPages:
break
if need_exclude:
continue
is_valid = self.search_in_sentence_is_valid(search_text, text)
if not is_valid:
continue
is_valid = True
data_type = self.datapoint_type_config.get(datapoint, "float")
if data_type == "float":
is_valid = self.search_in_sentence_is_valid(search_text, text)
if not is_valid:
continue
result[datapoint].append(page_index)
detail = {
"doc_id": self.doc_id,

View File

@ -136,21 +136,35 @@
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
"\n",
"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
"C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]",
"---Example 2 Start---",
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]"
],
"buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
],
"minimum_initial_investment": [
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
"---Example 1 Start---",
"The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"share name\": \"\", \"minimum_initial_investment\": 5000}]",
"\n",
"---Example 2 Start---",
"Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Prime Super\", \"share name\": \"\", \"minimum_initial_investment\": 10000}]"
]
}
},

14
main.py
View File

@ -1,5 +1,6 @@
import os
import json
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
@ -1043,7 +1044,7 @@ def batch_run_documents(
)
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = False
force_save_total_data = True
calculate_metrics = False
extract_way = "text"
@ -1309,6 +1310,7 @@ def merge_output_data_aus_prospectus(
fund_doc_data_df = data_df[
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
]
fund_doc_data_df.fillna("", inplace=True)
for index, row in fund_doc_data_df.iterrows():
doc_id = str(row["doc_id"])
page_index = int(row["page_index"])
@ -1319,7 +1321,6 @@ def merge_output_data_aus_prospectus(
value = row["value"]
fund_id = row["investment_id"]
fund_legal_name = row["investment_name"]
exist = False
if fund_id != "":
for data in doc_data_list:
@ -1331,7 +1332,14 @@ def merge_output_data_aus_prospectus(
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
exist = True
else:
for data in doc_data_list:
if data["raw_name"] == raw_name:
update_key = datapoint
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
exist = True
if not exist:
data = {
"DocumentId": doc_id,