Optimize performance_fee_costs extraction for document 391080133

This commit is contained in:
Blade He 2025-03-12 14:45:48 -05:00
parent c7c36dbdd2
commit 765772e5a8
4 changed files with 23 additions and 7 deletions

View File

@ -30,6 +30,12 @@
["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"],
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n",
"comments": ["item 0: document 411062815, page 17"]
},
{
"regex_all_list":
["\\nFund\\s*name\\s*Management\\s*fee\\s*Indirect\\s*costs\\s*Recoverable\\s*expenses[\\s\\S]*?performance.*\\s*fee\\s*Estimated\\s*other\\s*indirect\\s*costs\\s*\\n"],
"replace_text": "\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \n",
"comments": ["item 0: document 391080133, page 21"]
}
]
}

View File

@ -614,7 +614,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num not in [4, 5]:
# if page_num not in [20]:
# continue
if page_num in handled_page_num_list:
continue

View File

@ -171,11 +171,21 @@
"C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
"The management_fee is the value of \"Management fee (% pa)\".",
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
"---Example Start---",
"---Example 1 Start---",
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
"---Example End---",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
"---Example 2 Start---",
"\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \nPathways 30 \n1.16% pa \n0.02% pa \n0.05% pa \n0.05% pa \nPathways 70 \n1.30% pa \n0.01% pa \n0.06% pa \n0.04% pa \n",
"---Example 2 End---",
"The management_fee_and_costs is the value of \"Management fee\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".",
"The management_fee is the value of \"Management fee\".",
"The performance_fee_costs is the value of \"Estimated performance-related fee\".",
"The indirect_costs is the value of \"Estimated other indirect costs\".",
"The recoverable_expenses is the value of \"Recoverable expenses\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"Pathways 30\", \"share name\": \"Pathways 30\", \"management_fee_and_costs\": 1.23, \"management_fee\": 1.16, \"recoverable_expenses\": 0.02, \"performance_fee_costs\": 0.05, \"indirect_costs\": 0.05}, {\"fund name\": \"Pathways 70\", \"share name\": \"Pathways 70\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.3, \"recoverable_expenses\": 0.01, \"performance_fee_costs\": 0.06, \"indirect_costs\": 0.04}]}",
"\n",
"D. If only \"Management fees and costs\" is found, please output the same value for both data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---",

View File

@ -1526,8 +1526,8 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"]
re_run_extract_data = False
re_run_mapping_data = False
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = True
doc_source = "aus_prospectus"
# doc_source = "emea_ar"
@ -1560,8 +1560,8 @@ if __name__ == "__main__":
# "544886057",
# "550769189",
# "553449663"]
special_doc_id_list = ["420339794", "441280757", "454036250", "471206458", "412778803"]
# special_doc_id_list = ["441280757"]
special_doc_id_list = ["391080133"]
# special_doc_id_list = ["391080133", ""]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (