add configuration for datapoints data types
Update the configuration for minimum initial investment, and support applying its value to all funds.
This commit is contained in:
parent
a8810519f8
commit
01e2a0e38d
|
|
@ -21,7 +21,7 @@
|
|||
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
|
||||
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
|
||||
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
|
||||
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]},
|
||||
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
|
||||
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
|
||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
|
||||
}
|
||||
|
|
@ -21,7 +21,7 @@
|
|||
"date_of_last_hwm_reset": "share_level",
|
||||
"date_of_last_performance_fee_restructure": "share_level",
|
||||
"high_water_mark_type": "share_level",
|
||||
"minimum_initial_investment": "share_level",
|
||||
"minimum_initial_investment": "fund_level",
|
||||
"recoverable_expenses": "share_level",
|
||||
"indirect_costs": "share_level"
|
||||
}
|
||||
|
|
@ -21,7 +21,7 @@
|
|||
"date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
|
||||
"date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
|
||||
"high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
|
||||
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]},
|
||||
"minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
|
||||
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
|
||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"total_annual_dollar_based_charges": "float",
|
||||
"management_fee_and_costs": "float",
|
||||
"management_fee": "float",
|
||||
"performance_fee": "float",
|
||||
"performance_fee_costs": "float",
|
||||
"buy_spread": "float",
|
||||
"sell_spread": "float",
|
||||
"establishment_fee": "float",
|
||||
"contribution_fee": "float",
|
||||
"withdrawal_fee": "float",
|
||||
"switching_fee": "float",
|
||||
"activity_fee": "float",
|
||||
"exit_fee": "float",
|
||||
"administration_fees": "float",
|
||||
"interposed_vehicle_performance_fee_cost": "float",
|
||||
"additional_hurdle": "text",
|
||||
"benchmark_name": "text",
|
||||
"reference_rate": "float",
|
||||
"crystallisation_frequency": "text",
|
||||
"date_of_last_hwm_reset": "text",
|
||||
"date_of_last_performance_fee_restructure": "text",
|
||||
"high_water_mark_type": "text",
|
||||
"minimum_initial_investment": "integer",
|
||||
"recoverable_expenses": "float",
|
||||
"indirect_costs": "float"
|
||||
}
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"tor": "float",
|
||||
"ogc": "float",
|
||||
"ter": "float",
|
||||
"performance_fee": "float"
|
||||
}
|
||||
|
|
@ -185,10 +185,65 @@ class DataExtraction:
|
|||
data_list = self.extract_data_by_image()
|
||||
else:
|
||||
data_list = self.extract_data_by_text()
|
||||
if self.doc_source == "aus_prospectus":
|
||||
data_list = self.post_supplement_data(data_list)
|
||||
# data_list = remove_abundant_data(data_list)
|
||||
self.output_data_to_file(data_list)
|
||||
return data_list
|
||||
|
||||
def post_supplement_data(self, data_list: list) -> list:
    """
    Apply the document's minimum initial investment to every other fund.

    Each element of ``data_list`` is a per-page dict; this method only reads
    and writes its ``"extract_data"`` entry, which is expected to look like
    ``{"data": [ {...}, ... ]}`` where every item may carry ``"fund_name"``
    and datapoint keys such as ``"minimum_initial_investment"``.

    The first item (in page order) that carries ``"minimum_initial_investment"``
    is taken as the source value. Every other distinct, non-empty fund name
    found anywhere in ``data_list`` then receives a record
    ``{"fund_name": <name>, "minimum_initial_investment": <value>}`` on the
    page where the source value was found; the records on that page that
    carried ``"minimum_initial_investment"`` are replaced by these new ones.

    Nothing is changed when:
      * no item carries ``"minimum_initial_investment"``;
      * the source value is the ``-1`` sentinel;
      * no fund other than the source item's fund exists (the original
        record is kept rather than being replaced by an empty list).

    Records on the source page that do not carry
    ``"minimum_initial_investment"`` are preserved, so unrelated datapoints
    extracted from the same page are not lost.

    Args:
        data_list: per-page extraction results; modified in place.

    Returns:
        The same ``data_list`` object.
    """
    mii_key = "minimum_initial_investment"

    def page_items(page: dict) -> list:
        # Tolerate pages whose "extract_data" or "data" is missing or None,
        # and ignore entries that are not dicts.
        extract_data = page.get("extract_data") or {}
        items = extract_data.get("data") or []
        return [item for item in items if isinstance(item, dict)]

    # Locate the first record carrying the datapoint, in page order.
    mii_page = None
    mii_value = -1
    mii_fund_name = ""
    for page in data_list:
        source_item = next(
            (item for item in page_items(page) if mii_key in item), None
        )
        if source_item is not None:
            mii_page = page
            mii_value = source_item.get(mii_key, -1)
            mii_fund_name = source_item.get("fund_name", "")
            break

    if mii_page is None or mii_value == -1:
        return data_list

    # Collect every other fund name, keeping first-seen order.
    fund_name_list = []
    seen = set()
    for page in data_list:
        for item in page_items(page):
            fund_name = item.get("fund_name")
            if not isinstance(fund_name, str) or not fund_name:
                continue
            if fund_name == mii_fund_name or fund_name in seen:
                continue
            seen.add(fund_name)
            fund_name_list.append(fund_name)

    # With no other fund to receive the value, rewriting the page would
    # only delete the record that was found, so leave it untouched.
    if not fund_name_list:
        return data_list

    # Keep unrelated records from the source page, then add one
    # minimum_initial_investment record per fund.
    preserved = [item for item in page_items(mii_page) if mii_key not in item]
    supplemented = [
        {"fund_name": fund_name, mii_key: mii_value}
        for fund_name in fund_name_list
    ]
    mii_page["extract_data"]["data"] = preserved + supplemented
    return data_list
|
||||
|
||||
|
||||
|
||||
def extract_data_by_text(self) -> dict:
|
||||
"""
|
||||
keys are
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ class FilterPages:
|
|||
self.pdf_file = pdf_file
|
||||
self.output_pdf_text_folder = output_pdf_text_folder
|
||||
self.configuration_folder = f"./configuration/{doc_source}/"
|
||||
self.doc_source = doc_source
|
||||
misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
|
||||
if os.path.exists(misc_config_file):
|
||||
with open(misc_config_file, "r", encoding="utf-8") as file:
|
||||
|
|
@ -119,6 +120,7 @@ class FilterPages:
|
|||
domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json")
|
||||
datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json")
|
||||
datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json")
|
||||
datapoint_type_config_file = os.path.join(self.configuration_folder, "datapoint_type.json")
|
||||
|
||||
with open(language_config_file, "r", encoding="utf-8") as file:
|
||||
self.language_config = json.load(file)
|
||||
|
|
@ -130,6 +132,10 @@ class FilterPages:
|
|||
datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
|
||||
) as file:
|
||||
self.datapoint_exclude_keywords_config = json.load(file)
|
||||
with open(
|
||||
datapoint_type_config_file, "r", encoding="utf-8"
|
||||
) as file:
|
||||
self.datapoint_type_config = json.load(file)
|
||||
|
||||
def get_doc_info(self) -> dict:
|
||||
if len(self.document_mapping_info_df) == 0:
|
||||
|
|
@ -224,7 +230,8 @@ class FilterPages:
|
|||
if page_index < 2:
|
||||
continue
|
||||
page_num = page_index + 1
|
||||
if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and page_num not in self.document_dp_pages:
|
||||
if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and \
|
||||
page_num not in self.document_dp_pages:
|
||||
continue
|
||||
|
||||
page_text = clean_text(page_text)
|
||||
|
|
@ -237,7 +244,8 @@ class FilterPages:
|
|||
language = self.doc_info.get("language", None)
|
||||
if language is None:
|
||||
language = "english"
|
||||
if language == "english" and re.search(self.percentage_regex, text) is None:
|
||||
if self.doc_source == "emea_ar" and language == "english" and \
|
||||
re.search(self.percentage_regex, text) is None:
|
||||
continue
|
||||
for datapoint, keywords in self.datapoint_config.items():
|
||||
find_datapoint = False
|
||||
|
|
@ -257,10 +265,12 @@ class FilterPages:
|
|||
break
|
||||
if need_exclude:
|
||||
continue
|
||||
|
||||
is_valid = self.search_in_sentence_is_valid(search_text, text)
|
||||
if not is_valid:
|
||||
continue
|
||||
is_valid = True
|
||||
data_type = self.datapoint_type_config.get(datapoint, "float")
|
||||
if data_type == "float":
|
||||
is_valid = self.search_in_sentence_is_valid(search_text, text)
|
||||
if not is_valid:
|
||||
continue
|
||||
result[datapoint].append(page_index)
|
||||
detail = {
|
||||
"doc_id": self.doc_id,
|
||||
|
|
|
|||
|
|
@ -136,21 +136,35 @@
|
|||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
|
||||
"\n",
|
||||
"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
|
||||
"C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||
"---Example 1 Start---",
|
||||
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
|
||||
"---Example 1 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]",
|
||||
"---Example 2 Start---",
|
||||
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
|
||||
"---Example 2 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]"
|
||||
],
|
||||
"buy_spread": [
|
||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
|
||||
],
|
||||
"minimum_initial_investment": [
|
||||
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
|
||||
"---Example 1 Start---",
|
||||
"The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan",
|
||||
"---Example 1 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"share name\": \"\", \"minimum_initial_investment\": 5000}]",
|
||||
"\n",
|
||||
"---Example 2 Start---",
|
||||
"Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.",
|
||||
"---Example 2 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Prime Super\", \"share name\": \"\", \"minimum_initial_investment\": 10000}]"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
|
|
|||
14
main.py
14
main.py
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from glob import glob
|
||||
from tqdm import tqdm
|
||||
|
|
@ -1043,7 +1044,7 @@ def batch_run_documents(
|
|||
)
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
extract_way = "text"
|
||||
|
|
@ -1309,6 +1310,7 @@ def merge_output_data_aus_prospectus(
|
|||
fund_doc_data_df = data_df[
|
||||
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
|
||||
]
|
||||
fund_doc_data_df.fillna("", inplace=True)
|
||||
for index, row in fund_doc_data_df.iterrows():
|
||||
doc_id = str(row["doc_id"])
|
||||
page_index = int(row["page_index"])
|
||||
|
|
@ -1319,7 +1321,6 @@ def merge_output_data_aus_prospectus(
|
|||
value = row["value"]
|
||||
fund_id = row["investment_id"]
|
||||
fund_legal_name = row["investment_name"]
|
||||
|
||||
exist = False
|
||||
if fund_id != "":
|
||||
for data in doc_data_list:
|
||||
|
|
@ -1331,7 +1332,14 @@ def merge_output_data_aus_prospectus(
|
|||
if page_index not in data["page_index"]:
|
||||
data["page_index"].append(page_index)
|
||||
exist = True
|
||||
|
||||
else:
|
||||
for data in doc_data_list:
|
||||
if data["raw_name"] == raw_name:
|
||||
update_key = datapoint
|
||||
data[update_key] = value
|
||||
if page_index not in data["page_index"]:
|
||||
data["page_index"].append(page_index)
|
||||
exist = True
|
||||
if not exist:
|
||||
data = {
|
||||
"DocumentId": doc_id,
|
||||
|
|
|
|||
Loading…
Reference in New Issue