1. optimize for administration fees.

2. optimize for management fees
This commit is contained in:
Blade He 2025-02-27 17:36:41 -06:00
parent 543cab74e1
commit d0128d6279
4 changed files with 99 additions and 32 deletions

View File

@ -65,7 +65,8 @@ class DataExtraction:
self.provider_fund_name_list = ( self.provider_fund_name_list = (
self.provider_mapping_df["FundName"].unique().tolist() self.provider_mapping_df["FundName"].unique().tolist()
) )
self.datapoint_page_info = datapoint_page_info self.document_category, self.document_production = self.get_document_category_production()
self.datapoint_page_info = self.get_datapoint_page_info(datapoint_page_info)
self.page_nums_with_datapoints = self.get_page_nums_from_datapoint_page_info() self.page_nums_with_datapoints = self.get_page_nums_from_datapoint_page_info()
self.datapoints = datapoints self.datapoints = datapoints
self.instructions_config = self.get_instructions_config() self.instructions_config = self.get_instructions_config()
@ -74,7 +75,6 @@ class DataExtraction:
self.datapoint_name_config = self.get_datapoint_name() self.datapoint_name_config = self.get_datapoint_name()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \ self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name() self.get_datapoint_reported_name()
self.document_category, self.document_production = self.get_document_category_production()
self.extract_way = extract_way self.extract_way = extract_way
self.output_image_folder = output_image_folder self.output_image_folder = output_image_folder
@ -107,6 +107,16 @@ class DataExtraction:
return document_category, document_production return document_category, document_production
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
    """
    Return the datapoint -> page numbers mapping to use for this document.

    If document source is aus_prospectus and document category is MIS,
    "administration_fees" is left out of the returned mapping.
    In every other case the mapping is returned unchanged.

    The caller's dict is never modified: when the key has to be dropped,
    a new dict without it is returned instead of popping in place.

    :param datapoint_page_info: mapping of datapoint name to its page numbers
    :return: the same mapping, or a copy without "administration_fees"
    """
    # document_category may not be resolved (None); treat that as "not MIS".
    category = str(self.document_category or "").upper()
    if self.doc_source != "aus_prospectus" or category != "MIS":
        return datapoint_page_info
    if not datapoint_page_info or "administration_fees" not in datapoint_page_info:
        return datapoint_page_info
    return {
        datapoint: pages
        for datapoint, pages in datapoint_page_info.items()
        if datapoint != "administration_fees"
    }
def get_investment_objective_pages(self): def get_investment_objective_pages(self):
investment_objective_pages = [] investment_objective_pages = []
if self.document_type == 1: if self.document_type == 1:
@ -239,12 +249,38 @@ class DataExtraction:
data_dict["completion_token"] = result.get("completion_token", 0) data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0) data_dict["total_token"] = result.get("total_token", 0)
""" """
data_list = self.supplement_minimum_initial_investment(data_list)
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list) data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
data_list = self.remove_duplicate_data(data_list) data_list = self.remove_duplicate_data(data_list)
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name: if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
data_list = self.post_adjust_management_fee_costs(data_list) data_list = self.post_adjust_management_fee_costs(data_list)
if "minimum_initial_investment" not in datapoint_list_with_production_name:
data_list = self.supplement_minimum_initial_investment(data_list) data_list = self.check_administration_fees(data_list)
return data_list
def check_administration_fees(self, data_list: list):
    """
    If document source is aus_prospectus and document category is MIS,
    remove the administration fees from data_list.

    For each page-level dict in data_list, every item under
    extract_data["data"] loses its "administration_fees" key. Items that
    are left with nothing but "fund_name" / "share_name" carry no datapoint
    value any more and are dropped from the list.

    The structures are updated in place and the same data_list object is
    returned.

    :param data_list: list of dicts, each optionally holding
        {"extract_data": {"data": [item, ...]}}
    :return: data_list
    """
    # document_category may not be resolved (None); treat that as "not MIS".
    category = str(self.document_category or "").upper()
    if self.doc_source != "aus_prospectus" or category != "MIS":
        return data_list

    identity_keys = {"fund_name", "share_name"}
    for data_dict in data_list:
        extract_data = data_dict.get("extract_data", {})
        data = extract_data.get("data", [])
        kept_items = []
        for data_item in data:
            data_item.pop("administration_fees", None)
            # Keep the item only if some datapoint besides the names remains.
            if any(key not in identity_keys for key in data_item):
                kept_items.append(data_item)
        if len(kept_items) != len(data):
            # Slice-assign so extract_data["data"] stays the same list object.
            data[:] = kept_items
    return data_list
def post_adjust_for_value_with_production_name(self, data_list: list): def post_adjust_for_value_with_production_name(self, data_list: list):
@ -258,13 +294,12 @@ class DataExtraction:
if raw_name.lower() in self.document_production.lower(): if raw_name.lower() in self.document_production.lower():
raw_name_as_production_name = raw_name raw_name_as_production_name = raw_name
break break
datapoint_list_with_production_name = []
if raw_name_as_production_name is None: if raw_name_as_production_name is None:
return data_list return data_list, datapoint_list_with_production_name
raw_name_dict.pop(raw_name_as_production_name) raw_name_dict.pop(raw_name_as_production_name)
datapoint_list_with_production_name = []
for data_dict in data_list: for data_dict in data_list:
extract_data = data_dict.get("extract_data", {}) extract_data = data_dict.get("extract_data", {})
data = extract_data.get("data", []) data = extract_data.get("data", [])
@ -299,7 +334,7 @@ class DataExtraction:
extract_data["data"].extend(new_dp_item_list) extract_data["data"].extend(new_dp_item_list)
if len(datapoint_list_with_production_name) == 0: if len(datapoint_list_with_production_name) == 0:
return data_list return data_list, datapoint_list_with_production_name
for data_dict in data_list: for data_dict in data_list:
extract_data = data_dict.get("extract_data", {}) extract_data = data_dict.get("extract_data", {})
data = extract_data.get("data", []) data = extract_data.get("data", [])
@ -541,7 +576,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num != 344: # if page_num < 73:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue
@ -1245,8 +1280,9 @@ class DataExtraction:
def get_datapoints_by_page_num(self, page_num: int) -> list:
    """
    Return the datapoints whose page list contains page_num.

    Datapoints are taken from self.datapoints in order. A datapoint with
    no entry in self.datapoint_page_info (or a None entry) is treated as
    having no pages, so it is skipped instead of raising KeyError.

    :param page_num: page number to look up
    :return: list of datapoint names found on that page
    """
    page_info = self.datapoint_page_info
    return [
        datapoint
        for datapoint in self.datapoints
        if page_num in (page_info.get(datapoint) or ())
    ]

View File

@ -0,0 +1,9 @@
{
"prompts": [
"Assume there is a data table in current page contents, is there the table with same table structure in the next page contents?",
"The meaning of \"same\" is: with totally same table columns for the table in both of current page and next page.",
"Please output JSON format, the format example is:",
"{\"answer\": \"Yes\"} or {\"answer\": \"No\"}",
"Answer:\n"
]
}

View File

@ -123,7 +123,6 @@
"Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.", "Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.",
"If there are multiple Management fee and costs reported names, here is the priority rule:", "If there are multiple Management fee and costs reported names, here is the priority rule:",
"A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", "A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
"---Example 1 Start---", "---Example 1 Start---",
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
"---Example 1 End---", "---Example 1 End---",
@ -175,20 +174,54 @@
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n", "Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n",
"---Example 2 End---", "---Example 2 End---",
"For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:", "For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:",
"{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]}" "{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]}",
"\n",
"E. If the management fee/ management fee and costs is with the range, please ignore and output empty.",
"---Example 1 Start---",
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to Administration fee \nrebate section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown in the Management \nfees and costs section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
"---Example 1 End---",
"The relevant values: 0.26 and 1.82, are in the range, so the output should be:",
"{\"data\": []}",
"\n",
"F. If the management fee and costs including the performance fee, please ignore the performance fee value, just output the management fee and costs value.",
"---Example 1 Start---",
"Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n",
"---Example 1 End---",
"The column: \"Equals investment fees and costs\" is the sum of \"Performance fee\" and \"Plus other investment fees and costs\", we should ignore the \"Performance fee\" value, just output the \"Plus other investment fees and costs\" value.",
"The \"Plus other investment fees and costs\" could be the values for both of \"management fee\" and \"management fee and costs\", so the output should be:",
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
], ],
"administration_fees":[ "administration_fees":[
"Administration fees and costs is share class level data.", "Administration fees and costs is share class level data.",
"Simple case:",
"----Example 1 Start----", "----Example 1 Start----",
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n", "Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
"----Example 1 End----", "----Example 1 End----",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}" "{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}",
"Complex cases:",
"A. Need to add multiple numbers together.",
"----Example 1 Start----",
"MLC MasterKey Super & Pension Fundamentals \n\nType of fee or cost \nOngoing annual fees and costs 1 \n\nAdministration fees and \ncosts \n\nAccount balance \n\nFirst $150,000 \n\nRemaining balance \nover $150,000 \n\nThe percentage Administration fee \ncharged to each account you have \n(excluding the fixed fee and Trustee \nLevy) is capped at $2,500 pa. \n\nPlus \n\nTrustee Levy of 0.02% pa of your \naccount balance. \n\nPlus \n\nAmount \n\nHow and when paid \n\nPercentage fee \n(% pa) \n\n0.30 \n\n0.10 \n\nAdministration fee \n\nThe Administration fee is deducted monthly from your account and will \nbe rounded off to 2 decimal points. As a result of the rounding, the total \nannual amount may slightly differ. \n\nThe percentage fee for each month is calculated using your average Super \nand Pension account balance for the previous month. \n\nThe Trustee Levy will be deducted monthly from your account balance. \n\nThe levy amount for each month is calculated using your account balance \nat the date it's deducted. \n\nYou won't see these costs as direct charges to your account. They reduce \nthe balance held in reserves used to cover certain costs related to the \nrunning of the MLC Super Fund. \n\n4 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement",
"----Example 1 End----",
"For this case, the relevant values: first: 0.30%, remaining balance over: 0.10%, Plus Trustee Levy: 0.02%.",
"Please ignore the remaining balance over 0.10%, add first: 0.30% and Plus Trustee Levy: 0.02% = 0.32%",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}"
], ],
"buy_spread": [ "buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
], ],
"performance_fee": [
"Performance fees is share class level data.",
"If the performance fees is with the range, please ignore and output empty.",
"---Example 1 Start---",
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
"---Example 1 End---",
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
"{\"data\": []}"
],
"minimum_initial_investment": [ "minimum_initial_investment": [
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", "Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
"---Example 1 Start---", "---Example 1 Start---",
@ -211,10 +244,11 @@
"{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 500000}]}", "{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 500000}]}",
"\n", "\n",
"---Example 4 Start---", "---Example 4 Start---",
"Contributions and access \nto your investment \n• \n• \nWe provide choice and flexibility for your investment with access to your money at anytime. \nStart your investment with as little as $1,000. \n• \nEstablish a regular savings plan. \n28 \n• \nYou can switch between the investment options and also rebalance within your selected \noptions at any time. \n• \nMinimum withdrawal $500. \n", "Lifeplan Investment Bond Product Disclosure Statement \n\nThe Lifeplan Bond at a glance \n\nAt a glance Description Refer to page(s) \nContributions and access \nto your investment \n• \n• \nWe provide choice and flexibility for your investment with access to your money at anytime. \nStart your investment with as little as $1,000. \n• \nEstablish a regular savings plan. \n28 \n• \nYou can switch between the investment options and also rebalance within your selected \noptions at any time. \n• \nMinimum withdrawal $500. \n",
"---Example 4 End---", "---Example 4 End---",
"If you can't find the specific fund name, please apply the production name, e.g. Lifeplan Investment Bond",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 1000}]}" "{\"data\": [{\"fund name\": \"Lifeplan Investment Bond\", \"minimum_initial_investment\": 1000}]}"
], ],
"benchmark_name": [ "benchmark_name": [
"Benchmark is fund level data, usually as index fund name, e.g. S&P/ASX 300 A-REIT Total Return Index ",

20
main.py
View File

@ -151,6 +151,7 @@ class EMEA_AR_Parsing:
data_from_gpt = data_extraction.extract_data() data_from_gpt = data_extraction.extract_data()
except Exception as e: except Exception as e:
logger.error(f"Error: {e}") logger.error(f"Error: {e}")
print_exc()
data_from_gpt = {"data": []} data_from_gpt = {"data": []}
# Drilldown data to relevant PDF document # Drilldown data to relevant PDF document
@ -1042,8 +1043,8 @@ def batch_run_documents(
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
) )
re_run_extract_data = False re_run_extract_data = True
re_run_mapping_data = False re_run_mapping_data = True
force_save_total_data = True force_save_total_data = True
calculate_metrics = False calculate_metrics = False
@ -1520,20 +1521,7 @@ if __name__ == "__main__":
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
# special_doc_id_list: list = [ # special_doc_id_list: list = ["420339794"]
# "539790009",
# "542300403",
# "542301117",
# "542306317",
# "547567013",
# "552505237",
# "552505278",
# "554431052",
# "554851189",
# "555377021",
# "555654388",
# ]
# special_doc_id_list: list = ["454036250"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (