Merge branches 'aus_prospectus_ravi' and 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi
This commit is contained in:
commit
76fbb7c071
|
|
@ -3,7 +3,6 @@
|
||||||
/utils/__pycache__
|
/utils/__pycache__
|
||||||
/__pycache__/*.pyc
|
/__pycache__/*.pyc
|
||||||
/core/__pycache__/*.pyc
|
/core/__pycache__/*.pyc
|
||||||
/test_calc_metrics.py
|
|
||||||
/test_metrics
|
/test_metrics
|
||||||
/data
|
/data
|
||||||
/sample_documents/japan_prospectus.txt
|
/sample_documents/japan_prospectus.txt
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,96 @@
|
||||||
|
from flask import Flask, request, jsonify, render_template
|
||||||
|
from flasgger import Swagger, swag_from
|
||||||
|
from main import EMEA_AR_Parsing
|
||||||
|
from utils.logger import logger
|
||||||
|
from utils.biz_utils import clean_folder
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pandas as pd
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# Metadata passed to Swagger as the base template for the generated API documentation.
template = {
    "info": {
        "title": "Australia Prospectus Data Extraction API",
        "description": 'Australia Prospectus Data Extraction API',
        "version": "1.0",
    },
}

app = Flask(__name__)

# Swagger exposes the API documentation in the browser,
# for example at http://127.0.0.1:8080/apidocs/
swagger = Swagger(app, template=template)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/automation/api/model/aus_prospectus', methods=['POST'])
@swag_from('yml/aus_prospectus.yml')
def aus_prospectus_data_extract():
    """
    Extract Australia Prospectus data from an Australia Prospectus PDF document.

    Expected JSON request body:
    {
        "doc_id": "412778803"
    }

    Responses:
    - 200: {"extract_data": <value returned by EMEA_AR_Parsing.mapping_data>}
    - 400: {"error": "doc_id is required"} when the body is missing, is not a
      JSON object, or carries no (or an empty) doc_id
    - 500: {"extract_data": [], "annotation_data": [], "error": "<message>"}
      when extraction or mapping raises

    :return: Flask JSON response, paired with a status code for the error cases
    """
    logger.info('Australia Prospectus data extraction begin')

    # silent=True yields None for a missing, malformed or non-JSON body instead
    # of raising, so every bad request ends up in the same 400 answer below.
    payload = request.get_json(silent=True)
    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None

    if not doc_id:
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/aus_prospectus/pdf/"
    output_pdf_text_folder = r"./data/aus_prospectus/output/pdf_text/"
    output_extract_data_folder = r"./data/aus_prospectus/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/aus_prospectus/output/mapping_data/docs/"
    drilldown_folder = r"./data/aus_prospectus/output/drilldown/"
    db_mapping_document_folder = r"./data/aus_prospectus/output/db_mapping/document/"
    db_mapping_provider_folder = r"./data/aus_prospectus/output/db_mapping/provider/"
    extract_way = "text"

    work_folders = (
        pdf_folder,
        output_pdf_text_folder,
        output_extract_data_folder,
        output_mapping_data_folder,
        drilldown_folder,
        db_mapping_document_folder,
        db_mapping_provider_folder,
    )
    # Same order as before: create every folder first, then clean every folder.
    for folder in work_folders:
        os.makedirs(folder, exist_ok=True)
    # NOTE(review): these folders are shared by all requests; clean_folder
    # presumably empties them, so concurrent calls could interfere with each
    # other - confirm against utils.biz_utils and the deployment model.
    for folder in work_folders:
        clean_folder(folder)

    re_run_extract_data = False
    re_run_mapping_data = False

    try:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id=doc_id,
            doc_source="aus_prospectus",
            pdf_folder=pdf_folder,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_folder=output_extract_data_folder,
            output_mapping_data_folder=output_mapping_data_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
            compare_with_provider=False,
        )
        # The second element (annotation list) is not part of the response.
        doc_data_from_gpt, _ = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
        )
        results = {"extract_data": doc_mapping_data}
        return jsonify(results)
    except Exception as e:
        # Top-level boundary of the endpoint: log, then answer with the same
        # payload shape as before but with a 500 status so clients can tell a
        # failed extraction from a successful one.
        logger.error(f"Error: {e}")
        results = {"extract_data": [],
                   "annotation_data": [],
                   "error": str(e)}
        return jsonify(results), 500
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # use_reloader=False avoids initializing the application twice.
    # The port is given as an int, the type documented for Flask.run().
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,8 +1,8 @@
|
||||||
{
|
{
|
||||||
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar", "administration fees and costs", "Administration fee", "Administration fees"]},
|
||||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees"]},
|
"management_fee_and_costs": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
|
||||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs", "Investment fees"]},
|
"management_fee": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
|
||||||
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
"performance_fee_costs": {"english": ["performance fee", "performance fees"]},
|
||||||
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
|
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
|
||||||
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
|
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
|
||||||
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
|
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
|
||||||
|
|
@ -11,5 +11,5 @@
|
||||||
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
|
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
|
||||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
|
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
|
||||||
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
|
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
|
||||||
"change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
"change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
||||||
}
|
}
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
"total_annual_dollar_based_charges": "share_level",
|
"total_annual_dollar_based_charges": "share_level",
|
||||||
"management_fee_and_costs": "share_level",
|
"management_fee_and_costs": "share_level",
|
||||||
"management_fee": "share_level",
|
"management_fee": "share_level",
|
||||||
"performance_fee": "share_level",
|
"performance_fee_costs": "share_level",
|
||||||
"buy_spread": "share_level",
|
"buy_spread": "share_level",
|
||||||
"sell_spread": "share_level",
|
"sell_spread": "share_level",
|
||||||
"administration_fees": "share_level",
|
"administration_fees": "share_level",
|
||||||
|
|
@ -11,5 +11,5 @@
|
||||||
"minimum_initial_investment": "fund_level",
|
"minimum_initial_investment": "fund_level",
|
||||||
"indirect_costs": "share_level",
|
"indirect_costs": "share_level",
|
||||||
"recoverable_expenses": "share_level",
|
"recoverable_expenses": "share_level",
|
||||||
"change_recoverable_expanses": "share_level"
|
"change_recoverable_expenses": "share_level"
|
||||||
}
|
}
|
||||||
|
|
@ -1,15 +1,15 @@
|
||||||
{
|
{
|
||||||
"total_annual_dollar_based_charges": "total annual dollar based charges",
|
|
||||||
"management_fee_and_costs": "management fee and costs",
|
"management_fee_and_costs": "management fee and costs",
|
||||||
"management_fee": "management fee",
|
"management_fee": "management fee",
|
||||||
"performance_fee": "performance fee",
|
"administration_fees": "administration fee",
|
||||||
|
"performance_fee_costs": "performance fee",
|
||||||
|
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
|
||||||
"buy_spread": "buy spread",
|
"buy_spread": "buy spread",
|
||||||
"sell_spread": "sell spread",
|
"sell_spread": "sell spread",
|
||||||
"administration_fees": "administration fee",
|
"total_annual_dollar_based_charges": "total annual dollar based charges",
|
||||||
"interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
|
|
||||||
"benchmark_name": "benchmark name",
|
|
||||||
"minimum_initial_investment": "minimum initial investment",
|
"minimum_initial_investment": "minimum initial investment",
|
||||||
|
"benchmark_name": "benchmark name",
|
||||||
"indirect_costs": "indirect cost",
|
"indirect_costs": "indirect cost",
|
||||||
"recoverable_expenses": "recoverable expenses",
|
"recoverable_expenses": "recoverable expenses",
|
||||||
"change_recoverable_expanses": "change recoverable expanses"
|
"change_recoverable_expenses": "change recoverable expanses"
|
||||||
}
|
}
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
{
|
{
|
||||||
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
|
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar", "administration fees and costs", "Administration fee", "Administration fees"]},
|
||||||
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
|
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
|
||||||
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
|
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
|
||||||
"performance_fee": {"english": ["performance fee", "performance fees"]},
|
"performance_fee_costs": {"english": ["performance fee", "performance fees"]},
|
||||||
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
|
"buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
|
||||||
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
|
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
|
||||||
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
|
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
|
||||||
|
|
@ -11,5 +11,5 @@
|
||||||
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
|
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
|
||||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
|
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
|
||||||
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
|
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
|
||||||
"change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
"change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
||||||
}
|
}
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
"total_annual_dollar_based_charges": "float",
|
"total_annual_dollar_based_charges": "float",
|
||||||
"management_fee_and_costs": "float",
|
"management_fee_and_costs": "float",
|
||||||
"management_fee": "float",
|
"management_fee": "float",
|
||||||
"performance_fee": "float",
|
"performance_fee_costs": "float",
|
||||||
"buy_spread": "float",
|
"buy_spread": "float",
|
||||||
"sell_spread": "float",
|
"sell_spread": "float",
|
||||||
"administration_fees": "float",
|
"administration_fees": "float",
|
||||||
|
|
@ -11,5 +11,5 @@
|
||||||
"minimum_initial_investment": "integer",
|
"minimum_initial_investment": "integer",
|
||||||
"indirect_costs": "float",
|
"indirect_costs": "float",
|
||||||
"recoverable_expenses": "float",
|
"recoverable_expenses": "float",
|
||||||
"change_recoverable_expanses": "float"
|
"change_recoverable_expenses": "float"
|
||||||
}
|
}
|
||||||
|
|
@ -24,7 +24,7 @@
|
||||||
"total_annual_dollar_based_charges",
|
"total_annual_dollar_based_charges",
|
||||||
"management_fee_and_costs",
|
"management_fee_and_costs",
|
||||||
"management_fee",
|
"management_fee",
|
||||||
"performance_fee",
|
"performance_fee_costs",
|
||||||
"buy_spread",
|
"buy_spread",
|
||||||
"sell_spread",
|
"sell_spread",
|
||||||
"administration_fees",
|
"administration_fees",
|
||||||
|
|
@ -32,7 +32,7 @@
|
||||||
"benchmark_name",
|
"benchmark_name",
|
||||||
"minimum_initial_investment",
|
"minimum_initial_investment",
|
||||||
"indirect_costs",
|
"indirect_costs",
|
||||||
"change_recoverable_expanses",
|
"change_recoverable_expenses",
|
||||||
"recoverable_expenses"
|
"recoverable_expenses"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
{
|
||||||
|
"details": [
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["\\nIndirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n",
|
||||||
|
"\\n(Investment\\s*Option|Fund)[\\s\\S]*?Management\\s*fee[\\s\\S]*?Indirect\\s*costs[\\s\\S]*?performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?indirect\\s*costs[\\s\\S]*?(interposed\\s*vehicles|managers\\s*vehicles)\\s*\\n",
|
||||||
|
"\\nOption\\s*name\\s*Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n"],
|
||||||
|
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
|
||||||
|
"comments": ["item 0: document 410899007",
|
||||||
|
"item 1: document 539266880, 539266817, 539261734",
|
||||||
|
"item 2: document 539266893"]
|
||||||
|
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Transactions\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads\\s*\\(\\%\\)\\s*\\n"],
|
||||||
|
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
|
||||||
|
"comments": ["item 0: document 410899007"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
|
||||||
|
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
|
||||||
|
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
|
||||||
|
"comments": ["item 0: document 401212184, page 17",
|
||||||
|
"item 1: document 401212184, page 18 - 20"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"],
|
||||||
|
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n",
|
||||||
|
"comments": ["item 0: document 411062815, page 17"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
{
|
||||||
|
"details": []
|
||||||
|
}
|
||||||
|
|
@ -9,7 +9,8 @@ from utils.gpt_utils import chat
|
||||||
from utils.pdf_util import PDFUtil
|
from utils.pdf_util import PDFUtil
|
||||||
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
|
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
|
||||||
from utils.logger import logger
|
from utils.logger import logger
|
||||||
from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data
|
from utils.biz_utils import add_slash_to_text_as_regex, clean_text, \
|
||||||
|
get_most_similar_name, remove_abundant_data, replace_special_table_header
|
||||||
|
|
||||||
|
|
||||||
class DataExtraction:
|
class DataExtraction:
|
||||||
|
|
@ -73,6 +74,7 @@ class DataExtraction:
|
||||||
self.datapoint_level_config = self.get_datapoint_level()
|
self.datapoint_level_config = self.get_datapoint_level()
|
||||||
self.datapoint_type_config = self.get_datapoint_type()
|
self.datapoint_type_config = self.get_datapoint_type()
|
||||||
self.datapoint_name_config = self.get_datapoint_name()
|
self.datapoint_name_config = self.get_datapoint_name()
|
||||||
|
self.replace_table_header_config = self.get_replace_table_header_config()
|
||||||
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
|
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
|
||||||
self.get_datapoint_reported_name()
|
self.get_datapoint_reported_name()
|
||||||
self.extract_way = extract_way
|
self.extract_way = extract_way
|
||||||
|
|
@ -115,6 +117,8 @@ class DataExtraction:
|
||||||
if self.doc_source == "aus_prospectus" and self.document_category.upper() == "MIS":
|
if self.doc_source == "aus_prospectus" and self.document_category.upper() == "MIS":
|
||||||
if "administration_fees" in list(datapoint_page_info.keys()):
|
if "administration_fees" in list(datapoint_page_info.keys()):
|
||||||
datapoint_page_info.pop("administration_fees")
|
datapoint_page_info.pop("administration_fees")
|
||||||
|
if "total_annual_dollar_based_charges" in list(datapoint_page_info.keys()):
|
||||||
|
datapoint_page_info.pop("total_annual_dollar_based_charges")
|
||||||
return datapoint_page_info
|
return datapoint_page_info
|
||||||
|
|
||||||
def get_investment_objective_pages(self):
|
def get_investment_objective_pages(self):
|
||||||
|
|
@ -205,6 +209,15 @@ class DataExtraction:
|
||||||
datapoint_name = json.load(f)
|
datapoint_name = json.load(f)
|
||||||
return datapoint_name
|
return datapoint_name
|
||||||
|
|
||||||
|
def get_replace_table_header_config(self) -> list:
    """
    Load the table-header replacement rules for this document source.

    Reads "replace_table_header.json" from self.configuration_folder and
    returns the value stored under its "details" key.

    :return: the "details" list, or an empty list when the file does not
        exist, has no "details" key, or stores null for it
    :rtype: list
    """
    replace_table_header_file = os.path.join(self.configuration_folder, "replace_table_header.json")
    if not os.path.exists(replace_table_header_file):
        return []
    with open(replace_table_header_file, "r", encoding="utf-8") as f:
        # "or []" keeps the result a list even if the file stores "details": null.
        return json.load(f).get("details", []) or []
|
||||||
|
|
||||||
def get_pdf_page_text_dict(self) -> dict:
|
def get_pdf_page_text_dict(self) -> dict:
|
||||||
pdf_util = PDFUtil(self.pdf_file)
|
pdf_util = PDFUtil(self.pdf_file)
|
||||||
success, text, page_text_dict = pdf_util.extract_text()
|
success, text, page_text_dict = pdf_util.extract_text()
|
||||||
|
|
@ -271,6 +284,8 @@ class DataExtraction:
|
||||||
keys = list(data_item.keys())
|
keys = list(data_item.keys())
|
||||||
if "administration_fees" in keys:
|
if "administration_fees" in keys:
|
||||||
data_item.pop("administration_fees")
|
data_item.pop("administration_fees")
|
||||||
|
if "total_annual_dollar_based_charges" in keys:
|
||||||
|
data_item.pop("total_annual_dollar_based_charges")
|
||||||
keys = [key for key in list(data_item.keys()) if key not in ["fund_name", "share_name"]]
|
keys = [key for key in list(data_item.keys()) if key not in ["fund_name", "share_name"]]
|
||||||
if len(keys) == 0:
|
if len(keys) == 0:
|
||||||
remove_items.append(data_item)
|
remove_items.append(data_item)
|
||||||
|
|
@ -301,6 +316,8 @@ class DataExtraction:
|
||||||
raw_name_dict.pop(raw_name_as_production_name)
|
raw_name_dict.pop(raw_name_as_production_name)
|
||||||
|
|
||||||
for data_dict in data_list:
|
for data_dict in data_list:
|
||||||
|
# if data_dict.get("page_index", -1) > 9:
|
||||||
|
# break
|
||||||
extract_data = data_dict.get("extract_data", {})
|
extract_data = data_dict.get("extract_data", {})
|
||||||
data = extract_data.get("data", [])
|
data = extract_data.get("data", [])
|
||||||
remove_item_list = []
|
remove_item_list = []
|
||||||
|
|
@ -311,7 +328,12 @@ class DataExtraction:
|
||||||
share_name = data_item.get("share_name", "")
|
share_name = data_item.get("share_name", "")
|
||||||
raw_name = self.get_raw_name(fund_name, share_name)
|
raw_name = self.get_raw_name(fund_name, share_name)
|
||||||
if raw_name.lower() in self.document_production.lower():
|
if raw_name.lower() in self.document_production.lower():
|
||||||
dp_keys = [key for key in keys if key not in ["fund_name", "share_name"]]
|
dp_keys = [key for key in keys if key not in ["fund_name",
|
||||||
|
"share_name",
|
||||||
|
"management_fee_and_costs",
|
||||||
|
"management_fee",
|
||||||
|
"buy_spread",
|
||||||
|
"sell_spread"]]
|
||||||
for dp_key in dp_keys:
|
for dp_key in dp_keys:
|
||||||
if dp_key not in datapoint_list_with_production_name:
|
if dp_key not in datapoint_list_with_production_name:
|
||||||
datapoint_list_with_production_name.append(dp_key)
|
datapoint_list_with_production_name.append(dp_key)
|
||||||
|
|
@ -447,8 +469,13 @@ class DataExtraction:
|
||||||
"""
|
"""
|
||||||
management_fee_costs_list = []
|
management_fee_costs_list = []
|
||||||
management_fee_list = []
|
management_fee_list = []
|
||||||
|
complex_rule_keywords = "Recoverable expenses \nEstimated other indirect costs"
|
||||||
for data_dict in data_list:
|
for data_dict in data_list:
|
||||||
extract_data = data_dict.get("extract_data", {})
|
extract_data = data_dict.get("extract_data", {})
|
||||||
|
exist_complex_rule_keywords = False
|
||||||
|
page_text = data_dict.get("page_text", "")
|
||||||
|
if complex_rule_keywords in page_text:
|
||||||
|
exist_complex_rule_keywords = True
|
||||||
data = extract_data.get("data", [])
|
data = extract_data.get("data", [])
|
||||||
for data_item in data:
|
for data_item in data:
|
||||||
keys = list(data_item.keys())
|
keys = list(data_item.keys())
|
||||||
|
|
@ -466,11 +493,17 @@ class DataExtraction:
|
||||||
if (mf_fund_name == fund_name and mf_share_name == share_name) or \
|
if (mf_fund_name == fund_name and mf_share_name == share_name) or \
|
||||||
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
|
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
|
||||||
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
|
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
|
||||||
mf_value = mf.get("management_fee", -1)
|
if exist_complex_rule_keywords and \
|
||||||
if mf_value != -1 and mf_value >= management_fee:
|
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
|
||||||
mf["management_fee"] = management_fee
|
mfc["management_fee"] = management_fee
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
mf_value = mf.get("management_fee", -1)
|
||||||
|
if mf_value != -1 and mf_value >= management_fee:
|
||||||
|
mf["management_fee"] = management_fee
|
||||||
|
found = True
|
||||||
|
break
|
||||||
if not found:
|
if not found:
|
||||||
management_fee_list.append({"fund_name": fund_name,
|
management_fee_list.append({"fund_name": fund_name,
|
||||||
"share_name": share_name,
|
"share_name": share_name,
|
||||||
|
|
@ -485,11 +518,17 @@ class DataExtraction:
|
||||||
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
|
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
|
||||||
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
|
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
|
||||||
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
|
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
|
||||||
mfc_value = mfc.get("management_fee_and_costs", -1)
|
if exist_complex_rule_keywords and \
|
||||||
if mfc_value != -1 and mfc_value <= management_fee_costs:
|
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
|
||||||
mfc["management_fee_and_costs"] = management_fee_costs
|
mfc["management_fee_and_costs"] = management_fee_costs
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
mfc_value = mfc.get("management_fee_and_costs", -1)
|
||||||
|
if mfc_value != -1 and mfc_value <= management_fee_costs:
|
||||||
|
mfc["management_fee_and_costs"] = management_fee_costs
|
||||||
|
found = True
|
||||||
|
break
|
||||||
if not found:
|
if not found:
|
||||||
management_fee_costs_list.append({"fund_name": fund_name,
|
management_fee_costs_list.append({"fund_name": fund_name,
|
||||||
"share_name": share_name,
|
"share_name": share_name,
|
||||||
|
|
@ -575,7 +614,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num != 40:
|
# if page_num not in [4, 5]:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
@ -593,6 +632,7 @@ class DataExtraction:
|
||||||
else:
|
else:
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
|
|
||||||
|
page_text = replace_special_table_header(self.replace_table_header_config, page_text)
|
||||||
extract_data = self.extract_data_by_page(
|
extract_data = self.extract_data_by_page(
|
||||||
page_num,
|
page_num,
|
||||||
page_text,
|
page_text,
|
||||||
|
|
@ -657,6 +697,8 @@ class DataExtraction:
|
||||||
)
|
)
|
||||||
if not with_same_structure_table:
|
if not with_same_structure_table:
|
||||||
break
|
break
|
||||||
|
next_page_text = replace_special_table_header(self.replace_table_header_config,
|
||||||
|
next_page_text)
|
||||||
target_text = current_text + next_page_text
|
target_text = current_text + next_page_text
|
||||||
else:
|
else:
|
||||||
target_text = ""
|
target_text = ""
|
||||||
|
|
@ -1507,6 +1549,32 @@ class DataExtraction:
|
||||||
complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "")
|
complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "")
|
||||||
with_special_rule_title = False
|
with_special_rule_title = False
|
||||||
for datapoint in datapoints:
|
for datapoint in datapoints:
|
||||||
|
find_complex_special_rule = False
|
||||||
|
if page_text is not None and len(page_text) > 0:
|
||||||
|
complex_special_rule_list = complex_special_rule.get(datapoint, [])
|
||||||
|
for complex_special_rule in complex_special_rule_list:
|
||||||
|
complex_keywords = complex_special_rule.get("keywords", [])
|
||||||
|
if len(complex_keywords) == 0:
|
||||||
|
continue
|
||||||
|
exist_keywords = False
|
||||||
|
for special_keywords in complex_keywords:
|
||||||
|
special_keywrods_regex = add_slash_to_text_as_regex(special_keywords)
|
||||||
|
if special_keywords in page_text or \
|
||||||
|
re.search(special_keywrods_regex, page_text) is not None:
|
||||||
|
exist_keywords = True
|
||||||
|
break
|
||||||
|
if exist_keywords:
|
||||||
|
complex_prompts_list = complex_special_rule.get("prompts", [])
|
||||||
|
if len(complex_prompts_list) > 0:
|
||||||
|
if not with_special_rule_title:
|
||||||
|
instructions.append("Special rule:\n")
|
||||||
|
with_special_rule_title = True
|
||||||
|
complex_prompts = "\n".join(complex_prompts_list)
|
||||||
|
instructions.append(complex_prompts)
|
||||||
|
instructions.append("\n\n")
|
||||||
|
find_complex_special_rule = True
|
||||||
|
if find_complex_special_rule:
|
||||||
|
continue
|
||||||
special_rule_list = special_rule_info.get(datapoint, [])
|
special_rule_list = special_rule_info.get(datapoint, [])
|
||||||
if len(special_rule_list) > 0:
|
if len(special_rule_list) > 0:
|
||||||
if not with_special_rule_title:
|
if not with_special_rule_title:
|
||||||
|
|
@ -1515,26 +1583,7 @@ class DataExtraction:
|
||||||
special_rule = "\n".join(special_rule_list)
|
special_rule = "\n".join(special_rule_list)
|
||||||
instructions.append(special_rule)
|
instructions.append(special_rule)
|
||||||
instructions.append("\n\n")
|
instructions.append("\n\n")
|
||||||
if page_text is None or len(page_text) == 0:
|
|
||||||
continue
|
|
||||||
complex_special_rule_list = complex_special_rule.get(datapoint, [])
|
|
||||||
for complex_special_rule in complex_special_rule_list:
|
|
||||||
complex_keywords = complex_special_rule.get("keywords", [])
|
|
||||||
if len(complex_keywords) == 0:
|
|
||||||
continue
|
|
||||||
exist_keywords = False
|
|
||||||
for special_keywords in complex_keywords:
|
|
||||||
special_keywrods_regex = add_slash_to_text_as_regex(special_keywords)
|
|
||||||
if special_keywords in page_text or \
|
|
||||||
re.search(special_keywrods_regex, page_text) is not None:
|
|
||||||
exist_keywords = True
|
|
||||||
break
|
|
||||||
if exist_keywords:
|
|
||||||
complex_prompts_list = complex_special_rule.get("prompts", [])
|
|
||||||
if len(complex_prompts_list) > 0:
|
|
||||||
complex_prompts = "\n".join(complex_prompts_list)
|
|
||||||
instructions.append(complex_prompts)
|
|
||||||
instructions.append("\n\n")
|
|
||||||
instructions.append("\n")
|
instructions.append("\n")
|
||||||
|
|
||||||
instructions.append("Special cases:\n")
|
instructions.append("Special cases:\n")
|
||||||
|
|
@ -1563,27 +1612,9 @@ class DataExtraction:
|
||||||
contents_list = special_case.get("contents", [])
|
contents_list = special_case.get("contents", [])
|
||||||
contents = "\n".join(contents_list)
|
contents = "\n".join(contents_list)
|
||||||
instructions.append(contents)
|
instructions.append(contents)
|
||||||
instructions.append("\n\n")
|
instructions.append("\n")
|
||||||
instructions.append("\n")
|
instructions.append("\n")
|
||||||
|
|
||||||
# extreme_complex_config_list = special_cases.get("extreme_complex", [])
|
|
||||||
# if len(extreme_complex_config_list) > 0:
|
|
||||||
# for extreme_complex_config in extreme_complex_config_list:
|
|
||||||
# regex = extreme_complex_config.get("regex", "")
|
|
||||||
# if len(regex) == 0:
|
|
||||||
# continue
|
|
||||||
# search = re.search(regex, page_text)
|
|
||||||
# if search is not None:
|
|
||||||
# title = extreme_complex_config.get("title", "")
|
|
||||||
# title = f"{special_cases_number}. {title} "
|
|
||||||
# special_cases_number += 1
|
|
||||||
# instructions.append(title)
|
|
||||||
# instructions.append("\n")
|
|
||||||
# contents_list = extreme_complex_config.get("contents", [])
|
|
||||||
# contents = "\n".join(contents_list)
|
|
||||||
# instructions.append(contents)
|
|
||||||
# instructions.append("\n\n")
|
|
||||||
|
|
||||||
instructions.append("Output requirement:\n")
|
instructions.append("Output requirement:\n")
|
||||||
output_requirement = self.instructions_config.get("output_requirement", {})
|
output_requirement = self.instructions_config.get("output_requirement", {})
|
||||||
output_requirement_common_list = output_requirement.get("common", [])
|
output_requirement_common_list = output_requirement.get("common", [])
|
||||||
|
|
|
||||||
|
|
@ -228,7 +228,178 @@ class DataMapping:
|
||||||
mapped_data["similarity"] = 1
|
mapped_data["similarity"] = 1
|
||||||
|
|
||||||
self.output_mapping_file(mapped_data_list)
|
self.output_mapping_file(mapped_data_list)
|
||||||
return mapped_data_list
|
|
||||||
|
if self.doc_source == "aus_prospectus":
|
||||||
|
output_data_folder_splits = self.output_data_excel_folder.split("output")
|
||||||
|
if len(output_data_folder_splits) == 2:
|
||||||
|
merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/'
|
||||||
|
os.makedirs(merged_data_folder, exist_ok=True)
|
||||||
|
|
||||||
|
merged_data_json_folder = os.path.join(merged_data_folder, "json/")
|
||||||
|
os.makedirs(merged_data_json_folder, exist_ok=True)
|
||||||
|
|
||||||
|
merged_data_excel_folder = os.path.join(merged_data_folder, "excel/")
|
||||||
|
os.makedirs(merged_data_excel_folder, exist_ok=True)
|
||||||
|
merged_data_list = self.merge_output_data_aus_prospectus(mapped_data_list,
|
||||||
|
merged_data_json_folder,
|
||||||
|
merged_data_excel_folder)
|
||||||
|
return merged_data_list
|
||||||
|
else:
|
||||||
|
return mapped_data_list
|
||||||
|
|
||||||
|
def merge_output_data_aus_prospectus(self,
                                     mapped_data_list: list,
                                     merged_data_json_folder: str,
                                     merged_data_excel_folder: str):
    """Pivot per-datapoint mapping rows into one merged record per investment.

    Each input row carries a single ``datapoint``/``value`` pair. Rows whose
    ``investment_type`` is 1 are handled as share-class rows and rows whose
    ``investment_type`` is 33 as fund rows (per the original variable names).
    The result holds one dict per share class (or per fund when no share
    class record exists) with one key per datapoint name listed in
    ``./configuration/<doc_source>/datapoint_name.json``.

    The merged records are written to ``merged_<doc_id>.xlsx`` and
    ``merged_<doc_id>.json`` in the given folders.

    :param mapped_data_list: list of dicts with at least the keys doc_id,
        page_index, raw_fund_name, raw_share_name, raw_name, datapoint,
        value, investment_type, investment_id and investment_name.
    :param merged_data_json_folder: existing folder for the JSON output.
    :param merged_data_excel_folder: existing folder for the Excel output.
    :return: the list of merged record dicts, or ``None`` when any argument
        is empty/``None`` (nothing is read or written in that case).
    """
    if not mapped_data_list or not merged_data_json_folder or not merged_data_excel_folder:
        return None

    mapping_data_df = pd.DataFrame(mapped_data_list)
    mapping_data_df.reset_index(drop=True, inplace=True)
    mapping_data_df.fillna("", inplace=True)

    # Work on a filled copy: the original filled self.document_mapping_info_df
    # in place, silently changing instance state shared with other methods.
    document_mapping_df = self.document_mapping_info_df.fillna("")

    datapoint_keyword_config_file = (
        f"./configuration/{self.doc_source}/datapoint_name.json"
    )
    with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
        datapoint_keyword_config = json.load(f)
    datapoint_name_list = list(datapoint_keyword_config.keys())

    # An empty document mapping used to raise IndexError on values[0].
    doc_date = ""
    if len(document_mapping_df) > 0:
        doc_date = str(document_mapping_df["EffectiveDate"].values[0])[0:10]

    def new_record(doc_id, raw_fund_name, raw_share_name, raw_name,
                   fund_id, fund_name, sec_id, sec_name):
        """Build an empty merged record with a blank slot per datapoint."""
        record = {
            "doc_id": doc_id,
            "effective_date": doc_date,
            "raw_fund_name": raw_fund_name,
            "raw_share_name": raw_share_name,
            "raw_name": raw_name,
            "fund_id": fund_id,
            "fund_name": fund_name,
            "sec_id": sec_id,
            "sec_name": sec_name,
            "page_index": [],
        }
        record.update(dict.fromkeys(datapoint_name_list, ""))
        return record

    def record_value(record, datapoint, value, page_index, overwrite=True):
        """Store value under datapoint and remember the page it came from."""
        # .get() tolerates a datapoint that is absent from the config file.
        if overwrite or len(str(record.get(datapoint, ""))) == 0:
            record[datapoint] = value
        if page_index not in record["page_index"]:
            record["page_index"].append(page_index)

    total_data_list = []

    # ---- share-class level rows -------------------------------------
    share_doc_data_df = mapping_data_df[mapping_data_df["investment_type"] == 1]
    # Every row in this frame has the same investment_type, so remembering
    # raw names and ids is equivalent to the former list-of-dicts scan.
    seen_raw_names = set()
    seen_sec_ids = set()
    for _, row in share_doc_data_df.iterrows():
        doc_id = str(row["doc_id"])
        page_index = int(row["page_index"])
        raw_fund_name = str(row["raw_fund_name"])
        raw_share_name = str(row["raw_share_name"])
        raw_name = str(row["raw_name"])
        datapoint = str(row["datapoint"])
        value = row["value"]
        share_class_id = row["investment_id"]
        share_class_legal_name = row["investment_name"]
        has_sec_id = share_class_id != ""

        # Resolve the parent fund through the document mapping, if known.
        fund_id = ""
        fund_legal_name = ""
        if has_sec_id:
            record_row = document_mapping_df[document_mapping_df["SecId"] == share_class_id]
            if len(record_row) > 0:
                fund_id = record_row["FundId"].values[0]
                fund_legal_name = record_row["FundName"].values[0]

        already_known = raw_name in seen_raw_names or (
            has_sec_id and share_class_id in seen_sec_ids
        )
        if not already_known:
            seen_raw_names.add(raw_name)
            seen_sec_ids.add(share_class_id)
            total_data_list.append(
                new_record(doc_id, raw_fund_name, raw_share_name, raw_name,
                           fund_id, fund_legal_name,
                           share_class_id, share_class_legal_name)
            )

        # Update the first record that matches by raw name (value wins) or
        # by share class id (value only fills an empty slot).
        for record in total_data_list:
            if record["raw_name"] == raw_name:
                record_value(record, datapoint, value, page_index)
                break
            if has_sec_id and record["sec_id"] == share_class_id:
                record_value(record, datapoint, value, page_index, overwrite=False)
                break

    # ---- fund level rows --------------------------------------------
    # mapping_data_df is already NaN-free, so the slice needs no extra
    # fillna (the in-place call on a slice only produced a pandas warning).
    fund_doc_data_df = mapping_data_df[mapping_data_df["investment_type"] == 33]
    for _, row in fund_doc_data_df.iterrows():
        doc_id = str(row["doc_id"])
        page_index = int(row["page_index"])
        raw_fund_name = str(row["raw_fund_name"])
        raw_name = str(row["raw_name"])
        datapoint = str(row["datapoint"])
        value = row["value"]
        fund_id = row["investment_id"]
        fund_legal_name = row["investment_name"]

        # A fund value is copied onto every record of that fund; without a
        # fund id only records with the same raw name are touched.
        if fund_id != "":
            matches = [
                record for record in total_data_list
                if record["fund_id"] == fund_id
                or record["raw_fund_name"] == raw_fund_name
            ]
        else:
            matches = [
                record for record in total_data_list
                if record["raw_name"] == raw_name
            ]

        if not matches:
            fund_record = new_record(doc_id, raw_fund_name, "", raw_name,
                                     fund_id, fund_legal_name, "", "")
            total_data_list.append(fund_record)
            matches = [fund_record]

        for record in matches:
            record_value(record, datapoint, value, page_index)

    total_data_df = pd.DataFrame(total_data_list)
    total_data_df.fillna("", inplace=True)

    merged_data_excel_file = os.path.join(merged_data_excel_folder, f"merged_{self.doc_id}.xlsx")
    with pd.ExcelWriter(merged_data_excel_file) as writer:
        total_data_df.to_excel(writer, index=False, sheet_name="merged_data")

    merged_data_json_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json")
    with open(merged_data_json_file, "w", encoding="utf-8") as f:
        json.dump(total_data_list, f, ensure_ascii=False, indent=4)
    return total_data_list
|
||||||
|
|
||||||
def get_raw_name_db_match_result(
|
def get_raw_name_db_match_result(
|
||||||
self, raw_name_list, investment_type: str, iter_count: int = 30
|
self, raw_name_list, investment_type: str, iter_count: int = 30
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,6 @@
|
||||||
"management_fee_and_costs": "Management fee and costs is share level data.",
|
"management_fee_and_costs": "Management fee and costs is share level data.",
|
||||||
"management_fee": "Management fee is share level data.",
|
"management_fee": "Management fee is share level data.",
|
||||||
"performance_fee_costs": "Performance fee costs is share class level data.",
|
"performance_fee_costs": "Performance fee costs is share class level data.",
|
||||||
"performance_fee": "Performance fees is share class level data.",
|
|
||||||
"buy_spread": "Buy spread is share class level data.",
|
"buy_spread": "Buy spread is share class level data.",
|
||||||
"sell_spread": "Sell spread is share class level data.",
|
"sell_spread": "Sell spread is share class level data.",
|
||||||
"establishment_fee": "Establishment fee is share class level data.",
|
"establishment_fee": "Establishment fee is share class level data.",
|
||||||
|
|
@ -97,7 +96,6 @@
|
||||||
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
|
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
|
||||||
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
|
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
|
||||||
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
|
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
|
||||||
"performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.",
|
|
||||||
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
|
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
|
||||||
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
|
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
|
||||||
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
|
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
|
||||||
|
|
@ -118,7 +116,7 @@
|
||||||
"minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00",
|
"minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00",
|
||||||
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.",
|
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.",
|
||||||
"recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.",
|
"recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.",
|
||||||
"change_recoverable_expanses": "Change recoverable expanses is belong to percentage number, the value should be less than 100."
|
"change_recoverable_expenses": "Change recoverable expenses is belong to percentage number, the value should be less than 100."
|
||||||
},
|
},
|
||||||
"special_rule": {
|
"special_rule": {
|
||||||
"management_fee_and_costs": [
|
"management_fee_and_costs": [
|
||||||
|
|
@ -145,24 +143,41 @@
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": []}",
|
"{\"data\": []}",
|
||||||
"\n",
|
"\n",
|
||||||
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
|
"B. The table title is with Ongoing annual fees and costs.",
|
||||||
"B.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
"B.1 Management fees and costs should not include transaction costs and performance fees.",
|
||||||
|
"---Example Start",
|
||||||
|
"Ongoing annual\nfees and costs\nC Class and E Class -P Class - Performance \nStandard Fee Option Fee Option \n36 \n(E Class is closed to \nnew investors) \nPlatinum International Fund 1.56% p.a. 1.46% p.a. \nOngoing annual fees and costs include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees (for P Class – Performance Fee Option \nonly). Please see page 36 for further information.",
|
||||||
|
"---Example End",
|
||||||
|
"The values 1.56 and 1.46 include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees, should ignore them.",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": []}",
|
||||||
|
"B.2 If with pure management fees and costs in table, please extract relevant values",
|
||||||
|
"---Example Start---",
|
||||||
|
"Fees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nManagement fees and costs \nEstimated management fees and costs \nper annum are: \nPlatinum International Fund 1.41% 1.16%\nPlatinum Global Fund (Long Only) 1.35% 1.10%\n",
|
||||||
|
"---Example End---",
|
||||||
|
"a. For this example, there is pure \"Management fees and costs\", please extract relevant values.",
|
||||||
|
"b. This example mentioned share classes, please output according to share class.",
|
||||||
|
"The output should be",
|
||||||
|
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.16, \"management_fee\": 1.16}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.1, \"management_fee\": 1.1}]}",
|
||||||
|
"\n",
|
||||||
|
"C. If there are multiple Management fee and costs sub-columns, here is the rule: ",
|
||||||
|
"C.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
|
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}",
|
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee_costs\": 0.06}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"B.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
|
"C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
|
||||||
"The management_fee is the value of \"Management fee (% pa)\".",
|
"The management_fee is the value of \"Management fee (% pa)\".",
|
||||||
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
|
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
|
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
|
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
"D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
|
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
|
|
@ -173,25 +188,30 @@
|
||||||
"---Example 2 End---",
|
"---Example 2 End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
|
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
|
||||||
|
"---Example 3 Start---",
|
||||||
|
"Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.20–0.50 5 \n\n",
|
||||||
|
"---Example 3 End---",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"D. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
"E. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Underlying FundManagement fee component \nVanguard High Growth Index Fund1.50% p.a. of the NAV of the Underlying Fund\n",
|
"Underlying FundManagement fee component \nVanguard High Growth Index Fund1.50% p.a. of the NAV of the Underlying Fund\n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}",
|
"{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"E. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
"F. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n",
|
"\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"For this example, please ignore the \"Total investment fees and costs\" and \"Transaction costs\" columns, ",
|
"For this example, please ignore the \"Total investment fees and costs\" and \"Transaction costs\" columns, ",
|
||||||
"just output the values from \"Investment fees and costs (excl Performance Fees)\" as management_fee and management_fee_and_costs, ",
|
"just output the values from \"Investment fees and costs (excl Performance Fees)\" as management_fee and management_fee_and_costs, ",
|
||||||
"output the values from \"Performance Fee\" as performance_fee.",
|
"output the values from \"Performance Fee\" as performance_fee_costs.",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}",
|
"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee_costs\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee_costs\": 0.18}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00%, please ignore and output empty.",
|
"G. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
|
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
|
|
@ -202,60 +222,138 @@
|
||||||
"---Example 2 End---",
|
"---Example 2 End---",
|
||||||
"The relevant values: 0.07 and 1.00, are in the range, should ignore, so the output should be:",
|
"The relevant values: 0.07 and 1.00, are in the range, should ignore, so the output should be:",
|
||||||
"{\"data\": []}",
|
"{\"data\": []}",
|
||||||
|
"---Example 3 Start---",
|
||||||
|
"Management fees and costs \n0.67–1.17% p.a. (estimated) \nThe fees and costs for \nmanaging your investment \n",
|
||||||
|
"---Example 3 End---",
|
||||||
|
"The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:",
|
||||||
|
"{\"data\": []}",
|
||||||
"\n",
|
"\n",
|
||||||
"G. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
|
"H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n",
|
"Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"The column: \"Equals investment fees and costs\" is the sum of \"Performance fee\" and \"Plus other investment fees and costs\", we should ignore the \"Performance fee\" value, just output the \"Plus other investment fees and costs\" value.",
|
"The column: \"Equals investment fees and costs\" is the sum of \"Performance fee\" and \"Plus other investment fees and costs\", we should ignore the \"Performance fee\" value, just output the \"Plus other investment fees and costs\" value.",
|
||||||
"The \"Plus other investment fees and costs\" could be the values for both of \"management fee\" and \"management fee and costs\", so the output should be:",
|
"The \"Plus other investment fees and costs\" could be the values for both of \"management fee\" and \"management fee and costs\", so the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
|
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
|
||||||
"---Example 2 Start---",
|
"---Example 2 Start---",
|
||||||
"MANAGEMENT COSTS AND TRANSACTION COSTS \n\nOption name Management costs \nEstimated \nperformance \nfee (pa) 1 \nTotal management\ncosts (including\nestimated performance\nfee) pa\nTransaction costs \nper transaction (%) \nMULTI-MANAGER MULTI-SECTOR (These investment options are located in the ‘Investment Options Menu’ on pages 18 to 19.) \nFirstChoice Wholesale Defensive 0.85% 0.85% 0.15\nFirstChoice Wholesale Conservative 0.90% 0.02%1 0.92% 1 0.15 \n",
|
"MANAGEMENT COSTS AND TRANSACTION COSTS \n\nOption name Management costs \nEstimated \nperformance \nfee (pa) 1 \nTotal management\ncosts (including\nestimated performance\nfee) pa\nTransaction costs \nper transaction (%) \nMULTI-MANAGER MULTI-SECTOR (These investment options are located in the ‘Investment Options Menu’ on pages 18 to 19.) \nFirstChoice Wholesale Defensive 0.85% 0.85% 0.15\nFirstChoice Wholesale Conservative 0.90% 0.02%1 0.92% 1 0.15 \n",
|
||||||
"---Example 2 End---",
|
"---Example 2 End---",
|
||||||
"The column: \"Total management costs (including estimated performance fee) pa\" is the sum of \"Management costs\" and \"Estimated performance fee (pa)\", we should ignore the \"Estimated performance fee (pa)\" value, just output the \"Management costs\" value.",
|
"The column: \"Total management costs (including estimated performance fee) pa\" is the sum of \"Management costs\" and \"Estimated performance fee (pa)\", we should ignore the \"Estimated performance fee (pa)\" value, just output the \"Management costs\" value.",
|
||||||
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:",
|
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee\": 0.02}]}",
|
"{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}",
|
||||||
"---Example 3 Start---",
|
"---Example 3 Start---",
|
||||||
"Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n",
|
"Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n",
|
||||||
"---Example 3 End---",
|
"---Example 3 End---",
|
||||||
"The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.",
|
"The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.",
|
||||||
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\".",
|
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\".",
|
||||||
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee is 0.09.",
|
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.",
|
||||||
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee is 0.11.",
|
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.",
|
||||||
"So the output should be:",
|
"So the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee\": 0.11}]}"
|
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}",
|
||||||
|
"---Example 4 Start---",
|
||||||
|
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
|
||||||
|
"---Example 4 End---",
|
||||||
|
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
|
||||||
|
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
|
||||||
|
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
|
||||||
|
"So the output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}",
|
||||||
|
"\n",
|
||||||
|
"I. Some table is very complex, with many data points columns, please extract the relevant values.",
|
||||||
|
"---Example 1 Start---",
|
||||||
|
"Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) \nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n",
|
||||||
|
"---Example 1 End---",
|
||||||
|
"For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ",
|
||||||
|
"\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ",
|
||||||
|
"\"Performance fee (p.a.)\" as performance_fee_costs, ",
|
||||||
|
"\"Buy/sell spread (%)\" as buy_spread and sell_spread.",
|
||||||
|
"If one row has 5 decimal numbers, ",
|
||||||
|
"the 2nd decimal number is the administration_fees, ",
|
||||||
|
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
|
||||||
|
"the 4th decimal number is the performance_fee_costs, ",
|
||||||
|
"the 5th decimal number is the buy_spread and sell_spread.",
|
||||||
|
"If one row has 4 decimal numbers, ",
|
||||||
|
"the 2nd decimal number is the administration_fees, ",
|
||||||
|
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
|
||||||
|
"the 4th decimal number is the buy_spread and sell_spread.",
|
||||||
|
"Please always ignore the 1st decimal number, we need not the total sum values.",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}",
|
||||||
|
"J. If exist **\"Maximum management fee\"** in context, please ignore relevant values.",
|
||||||
|
"---Example Start---",
|
||||||
|
"Fund name \nMaximum \nmanagement \nfee (p.a.) \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%",
|
||||||
|
"---Example End---",
|
||||||
|
"The values in example is **Maximum management fee**, should ignore all of them.",
|
||||||
|
"The Output should be:",
|
||||||
|
"{\"data\": []}"
|
||||||
],
|
],
|
||||||
"administration_fees":[
|
"administration_fees":[
|
||||||
"Administration fees and costs is share class level data.",
|
"Administration fees and costs and total annual dollar-based charges are share class level data.",
|
||||||
"Simple case:",
|
"Simple case:",
|
||||||
"----Example 1 Start----",
|
"----Example 1 Start----",
|
||||||
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
|
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
|
||||||
"----Example 1 End----",
|
"----Example 1 End----",
|
||||||
|
"According to example, the administration fee is $1.30 per week plus 0.29% pa, so administration_fees is 0.29, ",
|
||||||
|
"total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}",
|
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29, \"total_annual_dollar_based_charges\": 67.6}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"----Example 2 Start----",
|
"----Example 2 Start----",
|
||||||
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)",
|
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)",
|
||||||
"----Example 2 End----",
|
"----Example 2 End----",
|
||||||
"The administration fee is $1.00 per week plus 0.17% pa, so the output should be:",
|
"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
|
||||||
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}",
|
"total_annual_dollar_based_charges is 1 * 52 = 52",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}",
|
||||||
|
"---Example 3 Start---",
|
||||||
|
"\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. \n",
|
||||||
|
"---Example 3 End---",
|
||||||
|
"According to example, the administration fee is $1.30 per week plus 0.50% p.a., so administration_fees is 0.5, ",
|
||||||
|
"total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.5, \"total_annual_dollar_based_charges\": 67.6}]}",
|
||||||
|
"---Example 4 Start---",
|
||||||
|
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nTTR income stream Retirement income stream Reference \nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply) \nRefer to the ‘Fees and \nother costs’ section on \npages 40-46 for details \n",
|
||||||
|
"---Example 4 End---",
|
||||||
|
"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
|
||||||
|
"total_annual_dollar_based_charges is 1 * 52 = 52",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"Complex cases:",
|
"Complex cases:",
|
||||||
"A. Need to add multiple numbers together.",
|
"A. Need to add multiple numbers together.",
|
||||||
"----Example 1 Start----",
|
"---Example 1 Start---",
|
||||||
"MLC MasterKey Super & Pension Fundamentals \n\nType of fee or cost \nOngoing annual fees and costs 1 \n\nAdministration fees and \ncosts \n\nAccount balance \n\nFirst $150,000 \n\nRemaining balance \nover $150,000 \n\nThe percentage Administration fee \ncharged to each account you have \n(excluding the fixed fee and Trustee \nLevy) is capped at $2,500 pa. \n\nPlus \n\nTrustee Levy of 0.02% pa of your \naccount balance. \n\nPlus \n\nAmount \n\nHow and when paid \n\nPercentage fee \n(% pa) \n\n0.30 \n\n0.10 \n\nAdministration fee \n\nThe Administration fee is deducted monthly from your account and will \nbe rounded off to 2 decimal points. As a result of the rounding, the total \nannual amount may slightly differ. \n\nThe percentage fee for each month is calculated using your average Super \nand Pension account balance for the previous month. \n\nThe Trustee Levy will be deducted monthly from your account balance. \n\nThe levy amount for each month is calculated using your account balance \nat the date it's deducted. \n\nYou won't see these costs as direct charges to your account. They reduce \nthe balance held in reserves used to cover certain costs related to the \nrunning of the MLC Super Fund. \n\n4 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement",
|
"MLC MasterKey Super & Pension Fundamentals \n\nType of fee or cost \nOngoing annual fees and costs 1 \n\nAdministration fees and \ncosts \n\nAccount balance \n\nFirst $150,000 \n\nRemaining balance \nover $150,000 \n\nThe percentage Administration fee \ncharged to each account you have \n(excluding the fixed fee and Trustee \nLevy) is capped at $2,500 pa. \n\nPlus \n\nTrustee Levy of 0.02% pa of your \naccount balance. \n\nPlus \n\nAmount \n\nHow and when paid \n\nPercentage fee \n(% pa) \n\n0.30 \n\n0.10 \n\nAdministration fee \n\nThe Administration fee is deducted monthly from your account and will \nbe rounded off to 2 decimal points. As a result of the rounding, the total \nannual amount may slightly differ. \n\nThe percentage fee for each month is calculated using your average Super \nand Pension account balance for the previous month. \n\nThe Trustee Levy will be deducted monthly from your account balance. \n\nThe levy amount for each month is calculated using your account balance \nat the date it's deducted. \n\nYou won't see these costs as direct charges to your account. They reduce \nthe balance held in reserves used to cover certain costs related to the \nrunning of the MLC Super Fund. \n\n4 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement",
|
||||||
"----Example 1 End----",
|
"---Example 1 End---",
|
||||||
"For this case, the relevant values: first: 0.30%, remaining balance over: 0.10%, Plus Trustee Levy: 0.02%.",
|
"For this case, the relevant values: first: 0.30%, remaining balance over: 0.10%, Plus Trustee Levy: 0.02%.",
|
||||||
"Please ignore the remaining balance over 0.10%, add first: 0.30% and Plus Trustee Levy: 0.02% = 0.32%",
|
"Please ignore the remaining balance over 0.10%, add first: 0.30% and Plus Trustee Levy: 0.02% = 0.32%",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}"
|
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}",
|
||||||
|
"---Example 2 Start---",
|
||||||
|
"Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n",
|
||||||
|
"---Example 2 End---",
|
||||||
|
"According to example, the total annual dollar-based charges is $78.00 p.a. ($1.50 per week), so total_annual_dollar_based_charges is 78.",
|
||||||
|
"Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore this part.",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"Hostplus Superannuation and Personal Super Plan\", \"share name\": \"Hostplus Superannuation and Personal Super Plan\", \"total_annual_dollar_based_charges\": 78}]}"
|
||||||
|
],
|
||||||
|
"total_annual_dollar_based_charges": [
|
||||||
|
"Total annual dollar-based charges are share class level data.",
|
||||||
|
"Its value corresponds to the administration fees and costs that are charged on a weekly basis.",
|
||||||
|
"----Example 1 Start----",
|
||||||
|
"MLC MasterKey Super & Pension Fundamentals\nType of fee or cost \nOngoing annual fees and costs 1 \nAmount \nHow and when paid \nOther administration costs paid from \nreserves of 0.00% pa of your account \nbalance. \nPlus \nA fixed fee of $1.50 per week \nThis fee is deducted monthly if your account balance is below $50,000 \nwhen the percentage administration fee is deducted. \nInvestment fees and \ncosts 2 \nInvestment fees and estimated costs \nfor MLC Horizon 4 Balanced Portfolio, \n1.20% pa. \nYou won ’ t see these fees and costs as direct charges to your account. \nThey're reflected in the daily unit price of each investment option and will \nreduce the net return on your investment \nInvestment fees and estimated costs \nfor other investment options, ranges \nfrom 0.00% pa to 2.84% pa \n(estimated). \nTransaction costs \nMLC Horizon 4 Balanced Portfolio, \n0.06% pa (estimated). \nOther investment options, ranges \nfrom 0.00% pa to 0.24% pa \n(estimated). \nYou won ’ t see these costs as direct charges to your account. They're \nreflected in the daily unit price of each investment option and will reduce \nthe net return on your investment. \nMember activity related fees and costs \nBuy-sell spread \nYou won ’ t see this fee as a direct charge to your account. It ’ s reflected in \nthe buy and sell unit price of each investment option when there ’ s a \ntransaction on your account. \nMLC Horizon 4 Balanced Portfolio, \n0.10%/0.10% \nOther investment options, ranges \nfrom 0.00%/0.00% to 0.30%/0.30% \nThe current buy-sell spreads of an investment option are available at \nmlc.com.au/buysellspreads \n",
|
||||||
|
"----Example 1 End----",
|
||||||
|
"According to example, the fixed fee is $1.50 per week, so total_annual_dollar_based_charges is 1.50 * 52 = 78",
|
||||||
|
"In the context, also with management fees and costs, management fee, buy_spread and sell_spread for specific fund: MLC Horizon 4 Balanced Portfolio.",
|
||||||
|
"Please output the relevant values based on specific fund name.",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||||
],
|
],
|
||||||
"buy_spread": [
|
"buy_spread": [
|
||||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||||
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
|
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
|
||||||
],
|
],
|
||||||
"performance_fee": [
|
"performance_fee_costs": [
|
||||||
"Performance fees is share class level data.",
|
"Performance fees is share class level data.",
|
||||||
"If the performance fees is with the range, please ignore and output empty.",
|
"If the performance fees is with the range, please ignore and output empty.",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
|
|
@ -329,89 +427,48 @@
|
||||||
{
|
{
|
||||||
"management_fee_and_costs": [
|
"management_fee_and_costs": [
|
||||||
{
|
{
|
||||||
"keywords": ["Estimated investment \ncosts \nAdministration \nfees"],
|
"keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"],
|
||||||
"prompts": ["Complex management fee and costs rule:",
|
"prompts": ["Complex management fee and costs rule:",
|
||||||
"If the table with columns:",
|
"If the table with columns:",
|
||||||
"\"Administration fees (% pa)\", \"Investment fees (% pa)\" and \"Estimated other investment costs (% pa)\"",
|
"\"Administration fees\", \"Investment fees\" ,\"Estimated other investment costs\" and \"Estimated performance fees\"",
|
||||||
"The administration_fees is \"Administration fees (% pa)\"",
|
"The administration_fees is \"Administration fees\"",
|
||||||
"The management_fee is \"Investment fees (% pa)\".",
|
"The management_fee is \"Investment fees\".",
|
||||||
"The management_fee_and_costs is \"Investment fees (% pa)\" + \"Estimated other investment costs (% pa)\".",
|
"The management_fee_and_costs is \"Investment fees\" + \"Estimated other investment costs\".",
|
||||||
|
"The performance_fee_costs is \"Estimated performance fees\"",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Investment \noption \nAdministration fees and \nestimated administration costs \nInvestment fees and estimated \ninvestment costs \nEstimated investment \ncosts \nAdministration \nfees \n(% pa) \nInvestment \nfees \n(% pa) \n2 \nEstimated \ntotal \nongoing \nEstimated \nadministration \ncosts \n(% pa) \n1 \nEstimated \nperformance \nfees \n(% pa) \n3 \nEstimated \ntransaction \ncosts \n(% pa) \n5 \nEstimated \nother \ninvestment \ncosts \n(% pa) \n4 \nannual \nfees and \ncosts \n(% pa) \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nProperty and infrastructure \nLazard Global \nListed \nInfrastructure \n0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n",
|
"\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nInternational shares \nPerpetual Global \nInnovation Share \n0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"For this case, although the table header is with disorder issue during PDF contents extraction issue.",
|
"The data points numbers order in data row (for example: 0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n) is correct as initial table structure.",
|
||||||
"But the data points numbers order in data row (for example: 0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n) is correct as initial table structure.",
|
|
||||||
"Please pay attention below information",
|
"Please pay attention below information",
|
||||||
"Assume the column sequence number is from 1.",
|
"Assume the column sequence number is from 1.",
|
||||||
"\"Administration fees (% pa)\" values are as the column 1 numbers, \"Investment fees (% pa)\" values are as the column 3 numbers, \"Estimated other investment costs (% pa)\" values are as the column 5 numbers.",
|
"\"Administration fees\" values are as the column 1 numbers, \"Investment fees\" values are as the column 3 numbers, \"Estimated other investment costs\" values are as the column 5 numbers, \"Estimated performance fees\" values are as the column 4 numbers.",
|
||||||
"For fund: Lazard Global Listed Infrastructure, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.8, the management_fee_and_costs should be 0.88 = 0.8(the column 3 number) + 0.08 (the column 5 number)",
|
"For fund: Perpetual Global Innovation Share, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.99, the management_fee_and_costs should be 1 = 0.99(the column 3 number) + 0.01 (the column 5 number), the performance_fee_costs should be 2.3 (the column 4 number)",
|
||||||
"Therefore, the output should be:",
|
"Therefore, the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0, \"administration_fees\": 0.25}]}, {\"fund name\": \"Lazard Global Listed Infrastructure\", \"share name\": \"Lazard Global Listed Infrastructure\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.08, \"administration_fees\": 0.25}"
|
"{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"administration_fees\": 0.25}, {\"fund name\": \"Perpetual Global Innovation Share\", \"share name\": \"Perpetual Global Innovation Share\", \"management_fee_and_costs\": 1, \"management_fee\": 0.99, \"administration_fees\": 0.25, \"performance_fee_costs\": 2.3}]}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"keywords": ["Entry Fee \nNil Entry"],
|
"keywords": ["Entry Fee option \nNil Entry option"],
|
||||||
"prompts": ["Complex management fee and costs rule:",
|
"prompts": ["Complex management fee and costs rule:",
|
||||||
"If the table with columns:",
|
"If the table with columns:",
|
||||||
"\"Entry Fee option\", \"Nil Entry Free option\", \"Estimated other investment costs\", \"Estimated Performance fees (B)\"",
|
"\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"",
|
||||||
"The performance_fee is \"Estimated Performance fees (B)\"",
|
"The performance_fee_costs is \"Estimated Performance fees\"",
|
||||||
"The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. if fund name is \"MultiSeries 30\", the Entry Fee fund name is \"MultiSeries 30 Entry Fee\"",
|
"The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. if fund name is \"MultiSeries 30\", the Entry Fee fund name is \"MultiSeries 30 Entry Fee\"",
|
||||||
"The fund name's tail is \"Nil Entry\" for \"Nil Entry Free option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".",
|
"The fund name's tail is \"Nil Entry\" for \"Nil Entry option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".",
|
||||||
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
||||||
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry Free option\" + \"Estimated other investment costs\".",
|
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
|
"Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"For this case, although the table header is with disorder issue during PDF contents extraction issue.",
|
"The data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
|
||||||
"But the data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
|
|
||||||
"Please pay attention below information",
|
"Please pay attention below information",
|
||||||
"Assume the column sequence number is from 1.",
|
"Assume the column sequence number is from 1.",
|
||||||
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry Free option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees (B)\" values are as the column 4 numbers.",
|
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.",
|
||||||
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
|
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
|
||||||
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
||||||
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
||||||
"Therefore, the output should be:",
|
"Therefore, the output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee\": 0}"
|
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}]}"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"keywords": ["Indirect costs \ni \nEstimated performance fees"],
|
|
||||||
"prompts": ["Complex management fee and costs rule:",
|
|
||||||
"If the table with columns:",
|
|
||||||
"\"Management fee (% pa)\", \"Indirect costs\", \"Estimated performance fees\", \"Buy/sell spreads\"",
|
|
||||||
"The management_fee is \"Management fee (% pa)\".",
|
|
||||||
"The management_fee_costs is \"Management fee (% pa)\" + \"Indirect costs\".",
|
|
||||||
"The performance_fee is \"Estimated performance fees\"",
|
|
||||||
"The buy_spread and sell_spread are \"Buy/sell spreads\".",
|
|
||||||
"---Example 1 Start---",
|
|
||||||
"Indirect costs \ni\nEstimated performance fees \nii\nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nRecoverable \nexpenses \niii \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe Investment \nOption by \nunderlying \nmanagers \nPerformance \nfees charged by \ninterposed \nvehicles \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n",
|
|
||||||
"---Example 1 End---",
|
|
||||||
"For this case: ",
|
|
||||||
"a. The table header is with disorder issue during PDF contents extraction issue.",
|
|
||||||
"b. The fund name is after the data row, e.g. MyNorth Australian Fixed Interest Index",
|
|
||||||
"c. The data points numbers order in data row, for example: \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 is correct as initial table structure.",
|
|
||||||
"The 1st number: 0.20 is the management_fee, the 2nd number and the 3th number: 0.01 0.00 are the indirect costs, ",
|
|
||||||
"the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ",
|
|
||||||
"the 6th number: 0.00 is the transaction costs, ",
|
|
||||||
"the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.",
|
|
||||||
"The management_fee_and_costs is management_fee + indirect costs = 0.20 + 0.01 + 0.00= 0.21",
|
|
||||||
"The output should be: ",
|
|
||||||
"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
|
|
||||||
"\n",
|
|
||||||
"---Example 2 Start---",
|
|
||||||
"Indirect costs \ni \nEstimated performance fees \nii \nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nMyNorth Index \nModerately \nDefensive \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 \nMyNorth Index \nBalanced \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.09/0.09 \n",
|
|
||||||
"---Example 2 End---",
|
|
||||||
"For this case: ",
|
|
||||||
"a. The table header is with disorder issue during PDF contents extraction issue.",
|
|
||||||
"b. The fund name is before the data row, e.g. MyNorth Index Moderately \nDefensive",
|
|
||||||
"c. The data points numbers order in data row, for example: \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 is correct as initial table structure.",
|
|
||||||
"The 1st number: 0.55 is the management_fee, the 2nd number and the 3th number: 0.00 0.00 are the indirect costs, ",
|
|
||||||
"the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ",
|
|
||||||
"the 6th number: 0.01 is the transaction costs, ",
|
|
||||||
"the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.",
|
|
||||||
"The management_fee_and_costs is management_fee + indirect costs = 0.55 + 0.00 + 0.00= 0.55",
|
|
||||||
"The output should be: ",
|
|
||||||
"{\"data\": [{\"fund name\": \"MyNorth Index Moderately Defensive\", \"share name\": \"MyNorth Index Moderately Defensive\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth Index Balanced\", \"share name\": \"MyNorth Index Balanced\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.09, \"sell_spread\": 0.09}]}"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -433,6 +490,48 @@
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
|
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"keywords": ["Recoverable expenses \nEstimated other indirect costs"],
|
||||||
|
"prompts": ["Complex management fee and costs rule:",
|
||||||
|
"If the table with columns:",
|
||||||
|
"\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Performance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"",
|
||||||
|
"The management_fee is \"Management fee (% pa)\".",
|
||||||
|
"The management_fee_and_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".",
|
||||||
|
"The recoverable_expenses is \"Recoverable expenses\"",
|
||||||
|
"The indirect_costs is \"Estimated other indirect costs\"",
|
||||||
|
"The performance_fee_costs is \"Performance fees charged to the Investment Option by underlying managers\".",
|
||||||
|
"The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"",
|
||||||
|
"The buy_spread and sell_spread are \"Buy/sell spreads\".",
|
||||||
|
"---Example 1 Start---",
|
||||||
|
"Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n",
|
||||||
|
"---Example 1 End---",
|
||||||
|
"For this case: ",
|
||||||
|
"a. The fund name is before the data row, e.g. North Active Defensive",
|
||||||
|
"c. The data points numbers in data row. ",
|
||||||
|
"For example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is with correct order as initial table structure.",
|
||||||
|
"The 1st number: 0.62 is the management_fee,",
|
||||||
|
"the 2nd number: 0.18 is the recoverable_expenses,",
|
||||||
|
"the 3rd number: 0.05 is the indirect_costs",
|
||||||
|
"the 4th number: 0.00 is the performance_fee_costs,",
|
||||||
|
"the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ",
|
||||||
|
"the 6th number: 0.14 is the Transaction costs (% pa).",
|
||||||
|
"the 7th number: 0.08 is the buy_spread, ",
|
||||||
|
"the 8th number: 0.08 is the sell_spread.",
|
||||||
|
"The management_fee_and_costs is Management fee (i) + Recoverable expenses + Estimated other indirect costs = 0.62 + 0.18 + 0.05= 0.85",
|
||||||
|
"**Attention: Ignore Transaction costs (% pa), the 6th number, DO NOT APPLY ITS VALUE TO CALCULATE management_fee_and_costs!!!**",
|
||||||
|
"The output should be: ",
|
||||||
|
"{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"North Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}",
|
||||||
|
"---Example 2 Start---",
|
||||||
|
"Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n",
|
||||||
|
"---Example 2 End---",
|
||||||
|
"For this case: ",
|
||||||
|
"a. This table header is same as Example 1.",
|
||||||
|
"b. The algorithm to calculate management_fee_and_costs is same as Example 1.",
|
||||||
|
"c. The difference is **the fund name is after the data row, e.g. the fund name of the first data row is: MyNorth Australian Fixed Interest Index**",
|
||||||
|
"The output should be: ",
|
||||||
|
"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
@ -525,11 +624,10 @@
|
||||||
"share 2",
|
"share 2",
|
||||||
"share 3"
|
"share 3"
|
||||||
],
|
],
|
||||||
"total_annual_dollar_based_charges_value": [125.00, 95.00, 26.00],
|
"total_annual_dollar_based_charges_value": [65, 57, 67.6],
|
||||||
"management_fee_and_costs_value": [2.63, 1.58, 2.55],
|
"management_fee_and_costs_value": [2.63, 1.58, 2.55],
|
||||||
"management_fee_value": [0.85, 1.10, 0.23],
|
"management_fee_value": [0.85, 1.10, 0.23],
|
||||||
"performance_fee_value": [0.03, 0.21, 0.08],
|
"performance_fee_costs_value": [0.03, 0.21, 0.08],
|
||||||
"performance_fee_costs_value": [0.05, 0.25, 0.09],
|
|
||||||
"buy_spread_value": [0.10, 0.15, 0.12],
|
"buy_spread_value": [0.10, 0.15, 0.12],
|
||||||
"sell_spread_value": [0.10, 0.10, 0.15],
|
"sell_spread_value": [0.10, 0.10, 0.15],
|
||||||
"establishment_fee_value": [0.75, 1.20, 0.25],
|
"establishment_fee_value": [0.75, 1.20, 0.25],
|
||||||
|
|
@ -548,13 +646,13 @@
|
||||||
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
||||||
"indirect_costs_value": [0.12, 0.16, 0.02],
|
"indirect_costs_value": [0.12, 0.16, 0.02],
|
||||||
"recoverable_expenses_value": [0.01, 0.05, 0.06],
|
"recoverable_expenses_value": [0.01, 0.05, 0.06],
|
||||||
"change_recoverable_expanses_value": [0.01, 0.02, 0.03]
|
"change_recoverable_expenses_value": [0.01, 0.02, 0.03]
|
||||||
},
|
},
|
||||||
"dp_reported_name" : {
|
"dp_reported_name" : {
|
||||||
"total_annual_dollar_based_charges": "Total annual dollar based charges",
|
"total_annual_dollar_based_charges": "Total annual dollar based charges",
|
||||||
"management_fee_and_costs": "Management fee and costs",
|
"management_fee_and_costs": "Management fee and costs",
|
||||||
"management_fee": "Management fee",
|
"management_fee": "Management fee",
|
||||||
"performance_fee": "Performance fee",
|
"performance_fee_costs": "Performance fee",
|
||||||
"buy_spread": "Buy spread",
|
"buy_spread": "Buy spread",
|
||||||
"sell_spread": "Sell spread",
|
"sell_spread": "Sell spread",
|
||||||
"administration_fees": "Administration fee",
|
"administration_fees": "Administration fee",
|
||||||
|
|
@ -563,7 +661,7 @@
|
||||||
"minimum_initial_investment": "Minimum initial investment",
|
"minimum_initial_investment": "Minimum initial investment",
|
||||||
"indirect_costs": "Indirect cost",
|
"indirect_costs": "Indirect cost",
|
||||||
"recoverable_expenses": "Recoverable expenses",
|
"recoverable_expenses": "Recoverable expenses",
|
||||||
"change_recoverable_expanses": "Change recoverable expanses",
|
"change_recoverable_expenses": "Change recoverable expenses",
|
||||||
"establishment_fee": "Establishment fee",
|
"establishment_fee": "Establishment fee",
|
||||||
"contribution_fee": "Contribution fee",
|
"contribution_fee": "Contribution fee",
|
||||||
"withdrawal_fee": "Withdrawal fee",
|
"withdrawal_fee": "Withdrawal fee",
|
||||||
|
|
|
||||||
97
main.py
97
main.py
|
|
@ -279,7 +279,39 @@ class EMEA_AR_Parsing:
|
||||||
)
|
)
|
||||||
with open(json_file, "r", encoding="utf-8") as f:
|
with open(json_file, "r", encoding="utf-8") as f:
|
||||||
doc_mapping_data = json.load(f)
|
doc_mapping_data = json.load(f)
|
||||||
return doc_mapping_data
|
if self.doc_source == "aus_prospectus":
|
||||||
|
output_data_folder_splits = output_data_json_folder.split("output")
|
||||||
|
if len(output_data_folder_splits) == 2:
|
||||||
|
merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/'
|
||||||
|
os.makedirs(merged_data_folder, exist_ok=True)
|
||||||
|
|
||||||
|
merged_data_json_folder = os.path.join(merged_data_folder, "json/")
|
||||||
|
os.makedirs(merged_data_json_folder, exist_ok=True)
|
||||||
|
|
||||||
|
merged_data_excel_folder = os.path.join(merged_data_folder, "excel/")
|
||||||
|
os.makedirs(merged_data_excel_folder, exist_ok=True)
|
||||||
|
|
||||||
|
merged_data_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json")
|
||||||
|
if os.path.exists(merged_data_file):
|
||||||
|
with open(merged_data_file, "r", encoding="utf-8") as f:
|
||||||
|
merged_data_list = json.load(f)
|
||||||
|
return merged_data_list
|
||||||
|
else:
|
||||||
|
data_mapping = DataMapping(
|
||||||
|
self.doc_id,
|
||||||
|
self.datapoints,
|
||||||
|
data_from_gpt,
|
||||||
|
self.document_mapping_info_df,
|
||||||
|
self.output_mapping_data_folder,
|
||||||
|
self.doc_source,
|
||||||
|
compare_with_provider=self.compare_with_provider
|
||||||
|
)
|
||||||
|
merged_data_list = data_mapping.merge_output_data_aus_prospectus(doc_mapping_data,
|
||||||
|
merged_data_json_folder,
|
||||||
|
merged_data_excel_folder)
|
||||||
|
return merged_data_list
|
||||||
|
else:
|
||||||
|
return doc_mapping_data
|
||||||
"""
|
"""
|
||||||
doc_id,
|
doc_id,
|
||||||
datapoints: list,
|
datapoints: list,
|
||||||
|
|
@ -499,7 +531,17 @@ def batch_start_job(
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"Saving mapping data to {output_mapping_total_folder}")
|
logger.info(f"Saving mapping data to {output_mapping_total_folder}")
|
||||||
unique_doc_ids = result_mappingdata_df["doc_id"].unique().tolist()
|
result_mappingdata_df_columns = list(result_mappingdata_df.columns)
|
||||||
|
doc_id_column = ""
|
||||||
|
if "doc_id" in result_mappingdata_df_columns:
|
||||||
|
doc_id_column = "doc_id"
|
||||||
|
if "DocumentId" in result_mappingdata_df_columns:
|
||||||
|
doc_id_column = "DocumentId"
|
||||||
|
|
||||||
|
if doc_id_column == "":
|
||||||
|
logger.error(f"Cannot find doc_id column in mapping data")
|
||||||
|
return
|
||||||
|
unique_doc_ids = result_mappingdata_df[doc_id_column].unique().tolist()
|
||||||
os.makedirs(output_mapping_total_folder, exist_ok=True)
|
os.makedirs(output_mapping_total_folder, exist_ok=True)
|
||||||
time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
|
time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
|
||||||
file_name = f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx"
|
file_name = f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx"
|
||||||
|
|
@ -507,11 +549,11 @@ def batch_start_job(
|
||||||
file_name = f"{total_data_prefix}_{file_name}"
|
file_name = f"{total_data_prefix}_{file_name}"
|
||||||
output_file = os.path.join(output_mapping_total_folder, file_name)
|
output_file = os.path.join(output_mapping_total_folder, file_name)
|
||||||
|
|
||||||
doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df)
|
# doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df)
|
||||||
with pd.ExcelWriter(output_file) as writer:
|
with pd.ExcelWriter(output_file) as writer:
|
||||||
doc_mapping_data_in_db.to_excel(
|
# doc_mapping_data_in_db.to_excel(
|
||||||
writer, index=False, sheet_name="data_in_doc_mapping"
|
# writer, index=False, sheet_name="data_in_doc_mapping"
|
||||||
)
|
# )
|
||||||
result_mappingdata_df.to_excel(
|
result_mappingdata_df.to_excel(
|
||||||
writer, index=False, sheet_name="total_mapping_data"
|
writer, index=False, sheet_name="total_mapping_data"
|
||||||
)
|
)
|
||||||
|
|
@ -519,27 +561,6 @@ def batch_start_job(
|
||||||
writer, index=False, sheet_name="extract_data"
|
writer, index=False, sheet_name="extract_data"
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
|
||||||
doc_source == "aus_prospectus"
|
|
||||||
and document_mapping_file is not None
|
|
||||||
and len(document_mapping_file) > 0
|
|
||||||
and os.path.exists(document_mapping_file)
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
merged_total_data_folder = os.path.join(
|
|
||||||
output_mapping_total_folder, "merged/"
|
|
||||||
)
|
|
||||||
os.makedirs(merged_total_data_folder, exist_ok=True)
|
|
||||||
data_file_base_name = os.path.basename(output_file)
|
|
||||||
output_merged_data_file_path = os.path.join(
|
|
||||||
merged_total_data_folder, "merged_" + data_file_base_name
|
|
||||||
)
|
|
||||||
merge_output_data_aus_prospectus(
|
|
||||||
output_file, document_mapping_file, output_merged_data_file_path
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error: {e}")
|
|
||||||
|
|
||||||
if calculate_metrics:
|
if calculate_metrics:
|
||||||
prediction_sheet_name = "data_in_doc_mapping"
|
prediction_sheet_name = "data_in_doc_mapping"
|
||||||
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
||||||
|
|
@ -1431,7 +1452,7 @@ def get_aus_prospectus_document_category():
|
||||||
|
|
||||||
|
|
||||||
def test_post_adjust_extract_data():
|
def test_post_adjust_extract_data():
|
||||||
doc_id = "454036250"
|
doc_id = "539266814"
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
@ -1470,7 +1491,8 @@ def test_post_adjust_extract_data():
|
||||||
with open(data_file_path, "r", encoding="utf-8") as f:
|
with open(data_file_path, "r", encoding="utf-8") as f:
|
||||||
data_list = json.load(f)
|
data_list = json.load(f)
|
||||||
# data_list = data_extraction.remove_duplicate_data(data_list)
|
# data_list = data_extraction.remove_duplicate_data(data_list)
|
||||||
data_list = data_extraction.post_adjust_for_value_with_production_name(data_list)
|
# data_list = data_extraction.post_adjust_for_value_with_production_name(data_list)
|
||||||
|
data_list = data_extraction.post_supplement_data(data_list)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
@ -1525,8 +1547,21 @@ if __name__ == "__main__":
|
||||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||||
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
|
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
|
||||||
# special_doc_id_list: list = ["384508026"]
|
# special_doc_id_list: list = ["410899007", "539266880", "539266817",
|
||||||
# special_doc_id_list: list = ["401212184"]
|
# "539261734", "539266893"]
|
||||||
|
# special_doc_id_list: list = ["530101994",
|
||||||
|
# "539241700",
|
||||||
|
# "539261734",
|
||||||
|
# "539266814",
|
||||||
|
# "539266817",
|
||||||
|
# "539266874",
|
||||||
|
# "539266880",
|
||||||
|
# "539266893",
|
||||||
|
# "544886057",
|
||||||
|
# "550769189",
|
||||||
|
# "553449663"]
|
||||||
|
special_doc_id_list = ["420339794", "441280757", "454036250", "471206458", "412778803"]
|
||||||
|
# special_doc_id_list = ["441280757"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
292
prepare_data.py
292
prepare_data.py
|
|
@ -8,10 +8,12 @@ import re
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import json_repair
|
import json_repair
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
from utils.logger import logger
|
from utils.logger import logger
|
||||||
from utils.pdf_download import download_pdf_from_documents_warehouse
|
from utils.pdf_download import download_pdf_from_documents_warehouse
|
||||||
from utils.pdf_util import PDFUtil
|
from utils.pdf_util import PDFUtil
|
||||||
|
from core.auz_nz.hybrid_solution_script import final_function_to_match
|
||||||
|
|
||||||
|
|
||||||
def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
|
def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
|
||||||
|
|
@ -1465,16 +1467,294 @@ def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_pr
|
||||||
f.write(f"{doc_id}\n")
|
f.write(f"{doc_id}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def set_mapping_to_ravi_data():
    """Add provider / fund / share-class mapping to the "ravi 100 documents" fee sheet.

    Thin wrapper that calls ``set_mapping_to_raw_name_data`` with fixed paths.

    The arguments are passed by keyword on purpose: the callee's parameter
    order is (data_file_path, data_sheet, raw_name_column, mapping_file_path,
    mapping_sheet, raw_name_mapping_column, output_file_path), so passing the
    five values positionally would put the mapping file path into
    ``raw_name_column`` and shift the remaining values into the wrong slots.
    """
    data_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx"
    data_sheet = "Sheet1"
    mapping_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
    mapping_sheet = "document_mapping"
    output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
    set_mapping_to_raw_name_data(
        data_file_path=data_file_path,
        data_sheet=data_sheet,
        mapping_file_path=mapping_file_path,
        mapping_sheet=mapping_sheet,
        output_file_path=output_file_path,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def set_mapping_to_data_side_documents_data():
    """Add mapping columns to the 46-document ground-truth workbook.

    Delegates to ``set_mapping_to_raw_name_data`` with a fixed configuration;
    the result is written next to the ground-truth file with a
    ``_with_mapping`` suffix.
    """
    # Previous configuration (17 documents), kept for reference:
    # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx"
    # data_sheet = "all"
    # mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
    # mapping_sheet = "document_mapping"
    # output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx"

    job_settings = {
        "data_file_path": r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx",
        "data_sheet": "ground_truth",
        "raw_name_column": "raw_share_name",
        "mapping_file_path": r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx",
        "mapping_sheet": "document_mapping",
        # None selects the name-matching path instead of the exact
        # FundLegalName comparison in set_mapping_to_raw_name_data.
        "raw_name_mapping_column": None,
        "output_file_path": r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx",
    }
    set_mapping_to_raw_name_data(**job_settings)
|
||||||
|
|
||||||
|
|
||||||
|
def _assign_mapping_values(data_df, row_mask, values: dict):
    """Write each ``column -> value`` pair into the rows selected by ``row_mask``."""
    for column, value in values.items():
        data_df.loc[row_mask, column] = value


def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx",
                                 data_sheet: str = "Sheet1",
                                 raw_name_column: str = "raw_share_name",
                                 mapping_file_path: str = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx",
                                 mapping_sheet: str = "document_mapping",
                                 raw_name_mapping_column: str = None,
                                 output_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"):
    """Fill provider, fund and share-class columns of an extracted-data sheet.

    Reads ``data_sheet`` from ``data_file_path`` and ``mapping_sheet`` from
    ``mapping_file_path``, then for every ``doc_id`` in the data:

    * copies ``CompanyId`` / ``CompanyName`` of the first mapping row of that
      document into ``provider_id`` / ``provider_name``;
    * resolves each distinct value of ``raw_name_column`` to a mapping row and
      copies ``FundClassId``, the share-class name, ``FundId`` and
      ``FundLegalName`` into ``sec_id``, ``sec_name``, ``fund_id`` and
      ``fund_name``.

    Documents without any mapping row are left with empty mapping columns.

    Args:
        data_file_path: Excel workbook with the extracted data; must contain a
            ``doc_id`` column and ``raw_name_column``.
        data_sheet: Sheet of ``data_file_path`` to read.
        raw_name_column: Data column holding the raw name to resolve.
        mapping_file_path: Excel workbook with the document mapping
            (``DocumentId``, ``CompanyId``, ``CompanyName``, ``FundId``,
            ``FundLegalName``, ``FundClassId``, ``FundClassLegalName``).
        mapping_sheet: Sheet of ``mapping_file_path`` to read.
        raw_name_mapping_column: When equal to ``"FundLegalName"``, raw names
            are compared for exact equality with that mapping column and only
            unambiguous (single-row) hits are applied. Any other value,
            including ``None``, resolves names through
            ``get_raw_name_db_match_result`` against ``FundClassLegalName``.
        output_file_path: Destination Excel file.
    """
    data_df = pd.read_excel(data_file_path, sheet_name=data_sheet)
    # Create the mapping columns up front so rows without a match stay blank.
    for mapping_column in ("provider_id", "provider_name",
                           "fund_id", "fund_name",
                           "sec_id", "sec_name"):
        data_df[mapping_column] = ""

    mapping_data = pd.read_excel(mapping_file_path, sheet_name=mapping_sheet)

    for doc_id in data_df["doc_id"].unique().tolist():
        doc_mask = data_df["doc_id"] == doc_id
        raw_name_list = data_df.loc[doc_mask, raw_name_column].unique().tolist()

        doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id]
        if len(doc_mapping_data) == 0:
            continue
        provider_id = doc_mapping_data["CompanyId"].values[0]
        provider_name = doc_mapping_data["CompanyName"].values[0]
        _assign_mapping_values(data_df, doc_mask, {
            "provider_id": provider_id,
            "provider_name": provider_name,
        })

        if raw_name_mapping_column == "FundLegalName":
            for raw_name in raw_name_list:
                find_df = doc_mapping_data[doc_mapping_data[raw_name_mapping_column] == raw_name]
                # Only an unambiguous (exactly one row) hit is applied.
                if len(find_df) != 1:
                    continue
                name_mask = doc_mask & (data_df[raw_name_column] == raw_name)
                _assign_mapping_values(data_df, name_mask, {
                    "sec_id": find_df["FundClassId"].values[0],
                    "sec_name": find_df["FundClassLegalName"].values[0],
                    "fund_id": find_df["FundId"].values[0],
                    "fund_name": find_df["FundLegalName"].values[0],
                })
        else:
            doc_db_name_list = doc_mapping_data["FundClassLegalName"].unique().tolist()
            all_match_result = get_raw_name_db_match_result(doc_id,
                                                            provider_name,
                                                            raw_name_list,
                                                            doc_db_name_list,
                                                            iter_count=60)
            for raw_share_name in raw_name_list:
                matched_db_share_name = all_match_result.get(raw_share_name)
                # Missing key, None and "" all mean "no match for this name".
                if matched_db_share_name is None or len(matched_db_share_name) == 0:
                    continue
                find_share_df = doc_mapping_data[doc_mapping_data["FundClassLegalName"] == matched_db_share_name]
                if len(find_share_df) == 0:
                    continue
                name_mask = doc_mask & (data_df[raw_name_column] == raw_share_name)
                _assign_mapping_values(data_df, name_mask, {
                    "sec_id": find_share_df["FundClassId"].values[0],
                    "sec_name": matched_db_share_name,
                    "fund_id": find_share_df["FundId"].values[0],
                    "fund_name": find_share_df["FundLegalName"].values[0],
                })

    # NOTE: the output layout always refers to "raw_share_name", independent
    # of raw_name_column.
    output_columns = ["doc_id",
                      "provider_id",
                      "provider_name",
                      "raw_fund_name",
                      "fund_id",
                      "fund_name",
                      "raw_share_name",
                      "sec_id",
                      "sec_name",
                      "management_fee_and_costs",
                      "management_fee",
                      "administration_fees",
                      "minimum_initial_investment",
                      "benchmark_name",
                      "performance_fee",
                      "performance_fee_charged",
                      "buy_spread",
                      "sell_spread",
                      "total_annual_dollar_based_charges",
                      "interposed_vehicle_performance_fee_cost",
                      "establishment_fee",
                      "contribution_fee",
                      "withdrawal_fee",
                      "exit_fee",
                      "switching_fee",
                      "activity_fee",
                      "hurdle_rate",
                      "analyst_name"
                      ]
    try:
        data_df = data_df[output_columns]
    except KeyError as e:
        # Best effort: if the sheet lacks some expected columns, keep its
        # original layout and still write the file.
        logger.warning(f"Cannot reorder output columns, keep original layout: {e}")

    with open(output_file_path, "wb") as file:
        data_df.to_excel(file, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def get_raw_name_db_match_result(
    doc_id: str, provider_name: str, raw_name_list: list, doc_share_name_list: list, iter_count: int = 30
):
    """Match raw names against database names in batches of ``iter_count``.

    The raw names are processed in consecutive slices of at most
    ``iter_count`` items to avoid token-limit problems when the matching
    step calls ChatGPT. Each batch is handed to
    ``get_final_function_to_match`` together with the database names that
    are still unmatched; the per-batch results are merged into one dict.

    The caller's ``doc_share_name_list`` is deep-copied first, so it is not
    modified here.

    Returns:
        dict combining the match results of all batches.
    """
    remaining_db_names = deepcopy(doc_share_name_list)
    all_match_result = {}
    for batch_start in range(0, len(raw_name_list), iter_count):
        name_batch = raw_name_list[batch_start : batch_start + iter_count]
        batch_result, remaining_db_names = get_final_function_to_match(
            doc_id, provider_name, name_batch, remaining_db_names
        )
        all_match_result.update(batch_result)
    return all_match_result
|
||||||
|
|
||||||
|
def get_final_function_to_match(doc_id, provider_name, raw_name_list, db_name_list):
    """Match one batch of raw names against the remaining database names.

    When ``db_name_list`` is empty every raw name is mapped to ``""``.
    Otherwise ``final_function_to_match`` performs the matching and the
    matched names are removed from ``db_name_list`` via
    ``remove_matched_names``.

    Returns:
        tuple ``(match_result, db_name_list)``: the raw-name -> matched-name
        dict and the database names left after this batch.
    """
    if len(db_name_list) == 0:
        # Nothing to match against: every raw name stays unmatched.
        return dict.fromkeys(raw_name_list, ""), db_name_list

    match_result = final_function_to_match(
        doc_id=doc_id,
        pred_list=raw_name_list,
        db_list=db_name_list,
        provider_name=provider_name,
        doc_source="aus_prospectus"
    )
    already_matched = list(match_result.values())
    db_name_list = remove_matched_names(db_name_list, already_matched)
    return match_result, db_name_list
|
||||||
|
|
||||||
|
def remove_matched_names(target_name_list: list, matched_name_list: list):
    """Drop already-matched names from ``target_name_list`` in place.

    ``None`` and empty entries of ``matched_name_list`` are ignored and
    duplicates are considered once. For every remaining name the first
    occurrence in ``target_name_list`` (if any) is removed.

    Returns:
        The same ``target_name_list`` object, after removal.
    """
    if len(matched_name_list) == 0:
        return target_name_list

    distinct_names = set(matched_name_list)
    for candidate in distinct_names:
        if candidate is None or len(candidate) == 0:
            continue
        if candidate in target_name_list:
            target_name_list.remove(candidate)
    return target_name_list
|
||||||
|
|
||||||
|
|
||||||
|
def adjust_data_file(source_file: str,
                     targe_file: str):
    """Merge the rows of ``source_file`` into ``targe_file``.

    Both workbooks are read from sheet "Sheet1". Rows of the target whose
    ``DocumentId`` also occurs in the source are discarded, the source rows
    are placed in front of the remaining target rows, and the result
    overwrites ``targe_file``.
    """
    incoming_rows = pd.read_excel(source_file, sheet_name="Sheet1")
    incoming_doc_ids = incoming_rows["DocumentId"].unique().tolist()

    existing_rows = pd.read_excel(targe_file, sheet_name="Sheet1")
    # Rows for documents present in the source are superseded by it.
    superseded = existing_rows["DocumentId"].isin(incoming_doc_ids)
    kept_rows = existing_rows[~superseded]

    combined_rows = pd.concat([incoming_rows, kept_rows], ignore_index=True)
    with open(targe_file, "wb") as output_stream:
        combined_rows.to_excel(output_stream, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def set_provider_to_ground_truth(groud_truth_file: str,
                                 ground_truth_sheet: str,
                                 document_mapping_file: str,
                                 document_mapping_sheet: str):
    """Add ``provider_id`` / ``provider_name`` to a ground-truth workbook in place.

    For each ``DocumentId`` in the ground truth, the ``CompanyId`` and
    ``CompanyName`` of the first mapping row with the same ``DocumentId`` are
    written to every row of that document. Documents without a mapping row
    keep empty provider columns. The workbook at ``groud_truth_file`` is
    overwritten with the result.

    Args:
        groud_truth_file: Excel workbook that is read and then overwritten.
        ground_truth_sheet: Sheet of ``groud_truth_file`` to read.
        document_mapping_file: Excel workbook with the document mapping.
        document_mapping_sheet: Sheet of ``document_mapping_file`` to read.
    """
    ground_truth_df = pd.read_excel(groud_truth_file, sheet_name=ground_truth_sheet)
    ground_truth_df["provider_id"] = ""
    ground_truth_df["provider_name"] = ""

    mapping_data = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)

    for doc_id in ground_truth_df["DocumentId"].unique().tolist():
        doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id]
        if len(doc_mapping_data) == 0:
            continue
        doc_mask = ground_truth_df["DocumentId"] == doc_id
        ground_truth_df.loc[doc_mask, "provider_id"] = doc_mapping_data["CompanyId"].values[0]
        ground_truth_df.loc[doc_mask, "provider_name"] = doc_mapping_data["CompanyName"].values[0]

    output_columns = ["DocumentId",
                      "provider_id",
                      "provider_name",
                      "raw_fund_name",
                      "FundId",
                      "FundLegalName",
                      "raw_share_name",
                      "FundClassId",
                      "FundClassLegalName",
                      "management_fee_and_costs",
                      "management_fee",
                      "administration_fees",
                      "minimum_initial_investment",
                      "benchmark_name",
                      "performance_fee",
                      "performance_fee_charged",
                      "buy_spread",
                      "sell_spread",
                      "total_annual_dollar_based_charges",
                      "interposed_vehicle_performance_fee_cost",
                      "establishment_fee",
                      "contribution_fee",
                      "withdrawal_fee",
                      "exit_fee",
                      "switching_fee",
                      "activity_fee",
                      "hurdle_rate",
                      "analyst_name"
                      ]
    try:
        ground_truth_df = ground_truth_df[output_columns]
    except KeyError as e:
        # Best effort: if some expected columns are absent, keep the
        # sheet's original layout and still write the file.
        logger.warning(f"Cannot reorder ground truth columns, keep original layout: {e}")

    with open(groud_truth_file, "wb") as file:
        ground_truth_df.to_excel(file, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def update_data_by_latest_ground_truth():
    """Placeholder: refresh the current ground-truth data from the latest version.

    Not implemented yet; the function only names the latest ground-truth
    workbook and performs no I/O.
    """
    # TODO: update current ground truth data by the latest version
    latest_ground_truth_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
update_data_by_latest_ground_truth()
|
||||||
|
# set_provider_to_ground_truth(
|
||||||
|
# groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx",
|
||||||
|
# ground_truth_sheet="Sheet1",
|
||||||
|
# document_mapping_file=r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx",
|
||||||
|
# document_mapping_sheet="document_mapping"
|
||||||
|
# )
|
||||||
|
|
||||||
|
# set_mapping_to_data_side_documents_data()
|
||||||
|
|
||||||
|
# source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx"
|
||||||
|
# target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
|
||||||
|
# adjust_data_file(source_file=source_file, targe_file=target_file)
|
||||||
|
|
||||||
# pdf_exist()
|
# pdf_exist()
|
||||||
# prepare_multi_fund_aus_prospectus_document()
|
# prepare_multi_fund_aus_prospectus_document()
|
||||||
merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/17_documents/",
|
# merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/17_documents/",
|
||||||
aus_document_mapping_file="aus_prospectus_17_documents_mapping.xlsx",
|
# aus_document_mapping_file="aus_prospectus_17_documents_mapping.xlsx",
|
||||||
aus_prospectus_data_file="aus_prospectus_data_17_documents_secid.xlsx",
|
# aus_prospectus_data_file="aus_prospectus_data_17_documents_secid.xlsx",
|
||||||
document_mapping_sheet="document_mapping",
|
# document_mapping_sheet="document_mapping",
|
||||||
output_file="aus_prospectus_17_documents_data.xlsx",
|
# output_file="aus_prospectus_17_documents_data.xlsx",
|
||||||
output_sheet="aus_document_prospectus")
|
# output_sheet="aus_document_prospectus")
|
||||||
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
||||||
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
||||||
# get_document_with_all_4_data_points(folder, file_name, None)
|
# get_document_with_all_4_data_points(folder, file_name, None)
|
||||||
|
|
|
||||||
|
|
@ -1034,3 +1034,69 @@ def remove_abundant_data_detail(data_detail_list: list,
|
||||||
if remove_data in data_detail_list:
|
if remove_data in data_detail_list:
|
||||||
data_detail_list.remove(remove_data)
|
data_detail_list.remove(remove_data)
|
||||||
return data_detail_list
|
return data_detail_list
|
||||||
|
|
||||||
|
|
||||||
|
def replace_special_table_header(replace_table_header_config: list, page_text: str):
    r"""Rewrite a special two-layer table header into the standard header.

    Some fee tables have a main header row plus a sub-header row, e.g.

        Investment Option / Management fee (i) (% pa) / Indirect costs (i) (% pa) /
        Estimated performance fees (ii) (% pa) / Transaction costs (% pa) /
        Buy/sell spreads (%)
      followed by the sub headers
        Recoverable expenses (iii) / Estimated other indirect costs /
        Performance fees charged to the Investment Option by underlying managers /
        Performance fees charged by interposed vehicles

    (a second variant uses "Fund", footnote digits instead of "(i)" and
    "... charged to the Fund by underlying managers").

    The intent is to merge the sub headers into the main header and drop the
    second layer, for example:
        "Indirect costs (i) (% pa)" ->
            "Recoverable expenses" + "Estimated other indirect costs"
        "Estimated performance fees2 (% pa)" ->
            "Performance fees charged to the Fund by underlying managers" +
            "Performance fees charged by interposed vehicles"

    Each config entry is a dict with ``regex_all_list`` (patterns to look
    for) and ``replace_text`` (the replacement). Entries and their patterns
    are tried in order; for the first pattern found in ``page_text`` all of
    its occurrences are substituted and no further pattern is tried. When an
    entry has no ``replace_text`` the matched text itself is used as the
    replacement.

    Returns:
        The rewritten page text, or ``page_text`` unchanged when the config
        is empty/None or nothing matches.
    """
    if replace_table_header_config is None or len(replace_table_header_config) == 0:
        return page_text

    for header_rule in replace_table_header_config:
        for header_pattern in header_rule.get("regex_all_list", []):
            header_match = re.search(header_pattern, page_text)
            if header_match is None:
                continue
            # Only the first matching pattern is applied.
            replacement = header_rule.get("replace_text", header_match.group())
            return re.sub(header_pattern, replacement, page_text)
    return page_text
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
Example to extract data from Australia Prospectus PDF Document.
|
||||||
|
Sample:
|
||||||
|
{
|
||||||
|
"doc_id": "412778803"
|
||||||
|
}
|
||||||
|
Author: Blade He
|
||||||
|
---
|
||||||
|
parameters:
|
||||||
|
- name: Australia Prospectus Document Id
|
||||||
|
in: body
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
description: Example to extract data from Australia Prospectus PDF Document.
|
||||||
|
default: {"doc_id": "412778803"}
|
||||||
|
schema:
|
||||||
|
required:
|
||||||
|
- Document Id
|
||||||
|
properties:
|
||||||
|
doc_id:
|
||||||
|
description: Australia Prospectus Document Id
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
responses:
|
||||||
|
200:
|
||||||
|
description: successfully.
|
||||||
|
400:
|
||||||
|
description: failed.
|
||||||
Loading…
Reference in New Issue