From 4edc4b476810385decd0d043c545a20fe16abf2a Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 24 Mar 2025 17:10:16 -0500 Subject: [PATCH 1/9] clean code --- core/data_extraction.py | 45 ++++++-- .../data_extraction_prompts_config.json | 91 ++++++++++++--- main.py | 4 +- performance.ipynb | 107 +++++++----------- prepare_data.py | 25 ++-- ...s_prospectus_verify_6_documents_sample.txt | 6 + 6 files changed, 180 insertions(+), 98 deletions(-) create mode 100644 sample_documents/aus_prospectus_verify_6_documents_sample.txt diff --git a/core/data_extraction.py b/core/data_extraction.py index 6796661..277c1f3 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -394,11 +394,22 @@ class DataExtraction: fund_name = data_item.get("fund_name", "") if len(fund_name) == 0: continue + share_name = data_item.get("share_name", "") + + updated_fund_name = self.update_pension_ttr_fund_name(fund_name) + if updated_fund_name != fund_name: + fund_name = updated_fund_name + data_item["fund_name"] = fund_name + updated_share_name = self.update_pension_ttr_fund_name(share_name) + if updated_share_name != share_name: + share_name = updated_share_name + data_item["share_name"] = share_name + fund_name_splits = fund_name.split() - if fund_name_splits[-1] == "TTR": + if fund_name_splits[-1] == "TTR" and fund_name not in ttr_fund_name_list: ttr_fund_name_list.append(fund_name) exist_ttr = True - if fund_name_splits[-1] == "Pension": + if fund_name_splits[-1] == "Pension" and fund_name not in pension_fund_name_list: pension_fund_name_list.append(fund_name) exist_pension = True if exist_ttr and exist_pension: @@ -449,6 +460,22 @@ class DataExtraction: data.extend(new_item_list) return data_list + def update_pension_ttr_fund_name(self, investment_name: str): + pension_prefix_list = ["retirement account", "account-based pension", "account based pension"] + ttr_prefix_list = ["transition to retirement account", "pre-retirement pension", "pre retirement pension"] + 
investment_name_lower = investment_name.lower() + for pension_prefix in pension_prefix_list: + if investment_name_lower.startswith(pension_prefix) and investment_name_lower != pension_prefix: + pension_prefix_split = pension_prefix.split() + investment_name = " ".join(investment_name.split()[len(pension_prefix_split):]) + " Pension" + break + for ttr_prefix in ttr_prefix_list: + if investment_name_lower.startswith(ttr_prefix) and investment_name_lower != ttr_prefix: + ttr_prefix_split = ttr_prefix.split() + investment_name = " ".join(investment_name.split()[len(ttr_prefix_split):]) + " TTR" + break + return investment_name + def check_administration_fees(self, data_list: list): """ If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list @@ -668,9 +695,10 @@ class DataExtraction: for mf in management_fee_list: mf_fund_name = mf.get("fund_name", "") mf_share_name = mf.get("share_name", "") - if (mf_fund_name == fund_name and mf_share_name == share_name) or \ - (len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and - (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))): + # if (mf_fund_name == fund_name and mf_share_name == share_name) or \ + # (len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and + # (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))): + if (mf_fund_name == fund_name and mf_share_name == share_name): if exist_complex_rule_keywords and \ ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys): mf["management_fee"] = management_fee @@ -693,9 +721,10 @@ class DataExtraction: for mfc in management_fee_costs_list: mfc_fund_name = mfc.get("fund_name", "") mfc_share_name = mfc.get("share_name", "") - if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \ - (len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == 
mfc_share_name and - (mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))): + # if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \ + # (len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and + # (mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))): + if (mfc_fund_name == fund_name and mfc_share_name == share_name): if exist_complex_rule_keywords and \ ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys): mfc["management_fee_and_costs"] = management_fee_costs diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index f266a8f..22f8105 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -48,6 +48,24 @@ "---Example End---", "Correct fund name: MLC Horizon 2 Income Portfolio", "Correct share name: MLC Horizon 2 Income Portfolio", + "f. In table header, \"Retirement account\" or \"Account-based pension\" means \"Pension\"; ", + "\"Transition to Retirement account\" or \"Pre-retirement pension\" means \"TTR\". 
", + "Please append them to the fund name and share name.", + "f.1 Example 1", + "---Example 1 Start---", + "Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n", + "---Example 1 End---", + "The prefix is \"Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Retirement account Cash\".", + "f.2 Example 2", + "---Example 2 Start---", + "Transition to Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n", + "---Example 2 End---", + "The prefix is \"Transition to Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Transition to Retirement account Cash\".", + "f.3 Example 3", + "---Example 3 Start---", + "Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Fund’s reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6", + "---Example 3 End---", + "Although exist \"Retirement account\" and \"Transition to Retirement account\", but the investment option is not exist, so fund name and share name should be: \"Rest Pension\".", "\n", "- 3. 
Only extract the latest data from context:", "If with multiple data values in same row, please extract the latest.", @@ -62,8 +80,13 @@ "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}", - "- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:", - "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"." + "- 5. Please ignore these words as fund names, it means never extract these words as fund names. They are:", + "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\".", + "- 6. Identify the value of data point and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0", + "---Example Start---", + "Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n", + "---Example End---", + "For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values." 
], "investment_level": { "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.", @@ -251,17 +274,24 @@ "Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:", "{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}", "---Example 2 Start---", + "Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCapital Stable 0.46% 0.04% 0.08% 0.54%\nBalanced 0.52% 0.06% 0.10%0.62% \n", + "---Example 2 End", + "The column: \"(A) Investment fees and costs (including (B) performance fees) (pa)*\" includes \"(B) performance fees) (pa)*\", we should subtract the \"(B) performance fees) (pa)*\" value, just output the pure management fee and costs value.", + "Besides, the \"Retirement account\" is the pre-fix fund name, should output it with fund/ share name together, e.g. \"Retirement account Capital Stable\"", + "The output should be:", + "{\"data\": [{\"fund name\": \"Retirement account Capital Stable\", \"share name\": \"Retirement account Capital Stable\", \"management_fee_and_costs\": 0.42, \"management_fee\": 0.42, \"performance_fee_costs\": 0.04}, {\"fund name\": \"Retirement account Balanced\", \"share name\": \"Retirement account Balanced\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.46, \"performance_fee_costs\": 0.06}]}", + "---Example 3 Start---", "Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) 
\nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n", - "---Example 2 End---", + "---Example 3 End---", "The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.", "Both of management_fee and management_fee_and_costs are the values for \"Management costs\".", "So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.", "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.", "So the output should be:", "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}", - "---Example 3 Start---", + "---Example 4 Start---", "Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) 
\nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n", - "---Example 3 End---", + "---Example 4 End---", "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", @@ -296,36 +326,56 @@ "---Example End---", "The values in example is **Maximum management fee**, should ignore all of them.", "The Output should be:", - "{\"data\": []}" + "{\"data\": []}", + "K. The management fee and costs in paragraph with speficic fund/ share prefix name: \"Account-based pension\" or \"Pre-retirement pension\"", + "---Example 1 Start---", + "Account-based pension \nInvestment fees \nand costs 2 \nHigh Growth 0.45%, Growth 0.49%", + "---Example 1 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"Account-based pension High Growth\", \"share name\": \"Account-based pension High Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45}, {\"fund name\": \"Account-based pension Growth\", \"share name\": \"Account-based pension Growth\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]}", + "---Example 2 Start---", + "Pre-retirement pension \nWe generally calculate \nand deduct this fee daily when unit \nprices are determined. 
\nHigh Growth 0.48%, Growth 0.50%", + "---Example 2 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}" ], "administration_fees":[ "Administration fees and costs and total annual dollar-based charges are share class level data.", "Simple case:", "----Example 1 Start----", - "Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n", + "Fees and costs summary \n\nVision income streams \n\nType of fee Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n2 \n0.25% pa of your account balance (made up of \n0.25% of your account balance which is capped \nat $1,050 pa plus a reserving margin of 0.00% \npa of each investment option’s assets).", "----Example 1 End----", + "According to example, the administration fee is 0.25% pa, so administration_fees is 0.25, ", + "The output should be:", + "{\"data\": [{\"fund name\": \"Vision income streams\", \"share name\": \"Vision income streams\", \"administration_fees\": 0.25}]}", + "\n", + "----Example 2 Start----", + "Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n", + "----Example 2 End----", "According to example, the administration fee is $1.30 per week plus 0.29% pa, so administration_fees is 0.29, ", "total_annual_dollar_based_charges is 1.30 * 52 = 67.6", "The output should be:", 
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29, \"total_annual_dollar_based_charges\": 67.6}]}", "\n", - "----Example 2 Start----", + "----Example 3 Start----", "At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)", - "----Example 2 End----", + "----Example 3 End----", "According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ", "total_annual_dollar_based_charges is 1 * 52 = 52", "The output should be:", "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}", - "---Example 3 Start---", + "\n", + "---Example 4 Start---", "\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. 
\n", - "---Example 3 End---", + "---Example 4 End---", "According to example, the administration fee is $1.30 per week plus 0.50% p.a., so administration_fees is 0.5, ", "total_annual_dollar_based_charges is 1.30 * 52 = 67.6", "The output should be:", "{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.5, \"total_annual_dollar_based_charges\": 67.6}]}", - "---Example 4 Start---", + "\n", + "---Example 5 Start---", "At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nTTR income stream Retirement income stream Reference \nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply) \nRefer to the ‘Fees and \nother costs’ section on \npages 40-46 for details \n", - "---Example 4 End---", + "---Example 5 End---", "According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ", "total_annual_dollar_based_charges is 1 * 52 = 52", "The output should be:", @@ -341,8 +391,21 @@ "The output should be:", "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}", "---Example 2 Start---", - "Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. 
\n", + "Mine Super\nType of fee or cost Amount (% pa) How and when paid \nOngoing annual fees and costs \n1 \nWe generally calculate and \ndeduct this fee daily when unit \nprices are determined. \nAdministration fees \nand costs \n0.16% pa \nPlus \n0.031% pa. \n", "---Example 2 End---", + "According to example, the relevant values: 0.16% and 0.031%, so administration_fees is 0.16 + 0.031 = 0.191", + "The output should be:", + "{\"data\": [{\"fund name\": \"Mine Super\", \"share name\": \"Mine Super\", \"administration_fees\": 0.191}]}", + "---Example 3 Start---", + "Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Fund’s reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6", + "---Example 3 End---", + "According to the example, the administration fee is $1.50 per week plus 0.10% pa, Administration costs is 0.09% pa so administration_fees is 0.1 + 0.09 = 0.19, ", + "total_annual_dollar_based_charges is 1.50 * 52 = 78", + "The output should be:", + "{\"data\": [{\"fund name\": \"Rest Pension\", \"share name\": \"Rest Pension\", \"administration_fees\": 0.19, \"total_annual_dollar_based_charges\": 78}]}", + "---Example 4 Start---", + "Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. 
\nof your account \nbalance. \n", + "---Example 4 End---", "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees." ], "total_annual_dollar_based_charges": [ diff --git a/main.py b/main.py index bc2d7eb..187ae91 100644 --- a/main.py +++ b/main.py @@ -1533,11 +1533,11 @@ if __name__ == "__main__": # doc_source = "emea_ar" if doc_source == "aus_prospectus": document_sample_file = ( - r"./sample_documents/aus_prospectus_46_documents_sample.txt" + r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] - document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" + document_mapping_file = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx" # special_doc_id_list = ["441280757"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" diff --git a/performance.ipynb b/performance.ipynb index b1b3d9a..73d2ca1 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -15,35 +15,49 @@ "from utils.similarity import Similarity\n", "\n", "\n", - "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", - " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", - " \"Minimum Initial Investment\", \"Benchmark\"]\n", + "# imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle 
Performance fee and Costs\",\n", + "# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", + "# \"Minimum Initial Investment\", \"Benchmark\"]\n", + "\n", + "\n", + "# imp_datapoints_mapping = {\n", + "# \"Management Fee and Costs\": \"management_fee_and_costs\",\n", + "# \"Management Fee\": \"management_fee\",\n", + "# \"Performance fee and cost\": \"performance_fee_costs\",\n", + "# \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", + "# \"Administration Fee and costs\": \"administration_fees\",\n", + "# \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", + "# \"Buy Spread\": \"buy_spread\",\n", + "# \"Sell Spread\": \"sell_spread\",\n", + "# \"Performance Fee\": \"PerformanceFeeCharged\",\n", + "# \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", + "# \"Benchmark\": \"benchmark_name\"\n", + "# }\n", + "\n", + "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\",\n", + " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n", "\n", "\n", "imp_datapoints_mapping = {\n", " \"Management Fee and Costs\": \"management_fee_and_costs\",\n", " \"Management Fee\": \"management_fee\",\n", " \"Performance fee and cost\": \"performance_fee_costs\",\n", - " \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", " \"Administration Fee and costs\": \"administration_fees\",\n", " \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", " \"Buy Spread\": \"buy_spread\",\n", - " \"Sell Spread\": \"sell_spread\",\n", - " \"Performance Fee\": \"PerformanceFeeCharged\",\n", - " \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", - " \"Benchmark\": \"benchmark_name\"\n", + " \"Sell Spread\": \"sell_spread\"\n", "}\n", "\n", - 
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", + "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250319000625.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250324170432.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -316,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -330,53 +344,17 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9123 \t0.8465 \t0.9891 \t0.8387 \t433 \t364 \t0 \t66 \t4 \n", - "management_fee \t0.9284 \t0.8744 \t0.9895 \t0.8664 \t433 \t376 \t0 \t54 \t4 \n", - "performance_fee_costs \t0.9217 \t0.8691 \t0.9811 \t0.8986 \t291 \t259 \t131 \t39 \t5 \n", - "interposed_vehicle_performance_fee_cost \t0.9536 \t0.9114 \t1.0000 \t0.9839 \t73 \t72 \t355 \t7 \t0 \n", - "administration_fees \t0.9857 \t0.9857 \t0.9857 \t0.9954 \t70 \t69 \t363 \t1 \t1 \n", - "total_annual_dollar_based_charges \t0.9920 \t0.9841 \t1.0000 \t0.9977 \t62 \t62 \t371 \t1 \t0 \n", - "buy_spread \t0.9483 \t0.9187 \t0.9798 \t0.9147 \t370 \t339 \t58 \t30 \t7 \n", - "sell_spread \t0.9526 \t0.9268 \t0.9799 \t0.9217 \t370 \t342 \t58 \t27 \t7 \n", - "minimum_initial_investment 
\t0.9593 \t0.9641 \t0.9547 \t0.9424 \t309 \t295 \t114 \t11 \t14 \n", - "benchmark_name \t0.8738 \t0.8084 \t0.9507 \t0.9101 \t157 \t135 \t260 \t32 \t7 \n", - "TOTAL \t0.9428 \t0.9089 \t0.9810 \t0.9270 \t2568 \t2313 \t1710 \t268 \t49 \n", - "Total Funds Matched - 434\n", - "Total Funds Not Matched - 131\n", - "Percentage of Funds Matched - 76.8141592920354\n", - "All Providers Results: \n", - "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", - "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9462 \t0.9027 \t0.9940 \t0.8978 \t185 \t167 \t0 \t18 \t1 \n", - "management_fee \t0.9724 \t0.9514 \t0.9944 \t0.9462 \t185 \t176 \t0 \t9 \t1 \n", - "performance_fee_costs \t0.9239 \t0.8750 \t0.9785 \t0.9194 \t99 \t91 \t80 \t13 \t2 \n", - "interposed_vehicle_performance_fee_cost \t0.9369 \t0.8814 \t1.0000 \t0.9624 \t53 \t52 \t127 \t7 \t0 \n", - "administration_fees \t0.9412 \t1.0000 \t0.8889 \t0.9946 \t9 \t8 \t177 \t0 \t1 \n", - "buy_spread \t0.9779 \t0.9672 \t0.9888 \t0.9570 \t183 \t177 \t1 \t6 \t2 \n", - "sell_spread \t0.9835 \t0.9781 \t0.9890 \t0.9677 \t183 \t179 \t1 \t4 \t2 \n", - "minimum_initial_investment \t0.9306 \t0.9571 \t0.9054 \t0.8925 \t148 \t134 \t32 \t6 \t14 \n", - "benchmark_name \t0.9206 \t0.8878 \t0.9560 \t0.9194 \t99 \t87 \t84 \t11 \t4 \n", - "TOTAL \t0.9481 \t0.9334 \t0.9661 \t0.9397 \t1144 \t1071 \t502 \t74 \t76 \n", - "Total Funds Matched - 186\n", - "Total Funds Not Matched - 10\n", - "Percentage of Funds Matched - 94.89795918367348\n", - "All Providers Results: \n", - "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", - "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.8854 \t0.8041 \t0.9850 \t0.7944 \t248 \t197 \t0 \t48 \t3 \n", - "management_fee \t0.8929 \t0.8163 \t0.9852 \t0.8065 \t248 \t200 \t0 \t45 \t3 \n", - "performance_fee_costs \t0.9205 \t0.8660 
\t0.9825 \t0.8831 \t192 \t168 \t51 \t26 \t3 \n", - "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t228 \t0 \t0 \n", - "administration_fees \t0.9919 \t0.9839 \t1.0000 \t0.9960 \t61 \t61 \t186 \t1 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t62 \t62 \t186 \t0 \t0 \n", - "buy_spread \t0.9178 \t0.8710 \t0.9701 \t0.8831 \t187 \t162 \t57 \t24 \t5 \n", - "sell_spread \t0.9209 \t0.8763 \t0.9702 \t0.8871 \t187 \t163 \t57 \t23 \t5 \n", - "minimum_initial_investment \t0.9847 \t0.9699 \t1.0000 \t0.9798 \t161 \t161 \t82 \t5 \t0 \n", - "benchmark_name \t0.8000 \t0.6957 \t0.9412 \t0.9032 \t58 \t48 \t176 \t21 \t3 \n", - "TOTAL \t0.9314 \t0.8883 \t0.9834 \t0.9133 \t1424 \t1242 \t1023 \t193 \t98 \n", - "Total Funds Matched - 248\n", - "Total Funds Not Matched - 121\n", - "Percentage of Funds Matched - 67.20867208672087\n" + "management_fee_and_costs \t0.7470 \t0.6739 \t0.8378 \t0.5962 \t52 \t31 \t0 \t15 \t6 \n", + "management_fee \t0.8046 \t0.7609 \t0.8537 \t0.6731 \t52 \t35 \t0 \t11 \t6 \n", + "performance_fee_costs \t0.7805 \t0.9697 \t0.6531 \t0.6538 \t50 \t32 \t2 \t1 \t17 \n", + "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t52 \t52 \t0 \t0 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t16 \t16 \t36 \t0 \t0 \n", + "buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", + "sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", + "TOTAL \t0.9046 \t0.9149 \t0.9064 \t0.8462 \t258 \t202 \t106 \t27 \t29 \n", + "Total Funds Matched - 52\n", + "Total Funds Not Matched - 28\n", + "Percentage of Funds Matched - 65.0\n" ] } ], @@ -434,9 +412,10 @@ "\n", "print(\"\\n\")\n", "print(\"\\n\")\n", - "document_list_file_list = [None, \n", - " \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", - " \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n", + "# document_list_file_list = [None, \n", + "# 
\"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", + "# \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n", + "document_list_file_list = [None]\n", "for document_list_file in document_list_file_list:\n", " document_list = None\n", " if document_list_file is not None:\n", @@ -637,7 +616,7 @@ ], "metadata": { "kernelspec": { - "display_name": "blade", + "display_name": "emea_ar_test", "language": "python", "name": "python3" }, diff --git a/prepare_data.py b/prepare_data.py index 3b773aa..fcbef95 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -1483,13 +1483,21 @@ def set_mapping_to_data_side_documents_data(): # mapping_sheet = "document_mapping" # output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx" - data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx" + # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx" + # data_sheet = "ground_truth" + # raw_name_column = "raw_share_name" + # mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" + # mapping_sheet = "document_mapping" + # raw_name_mapping_column = None + # output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" + + data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth.xlsx" data_sheet = "ground_truth" raw_name_column = "raw_share_name" - mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" + mapping_file_path = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx" mapping_sheet = "document_mapping" raw_name_mapping_column = None - output_file_path = 
r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" + output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx" set_mapping_to_raw_name_data(data_file_path=data_file_path, data_sheet=data_sheet, raw_name_column=raw_name_column, @@ -1582,8 +1590,7 @@ def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/ou "administration_fees", "minimum_initial_investment", "benchmark_name", - "performance_fee", - "performance_fee_charged", + "performance_fee_costs", "buy_spread", "sell_spread", "total_annual_dollar_based_charges", @@ -1593,9 +1600,7 @@ def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/ou "withdrawal_fee", "exit_fee", "switching_fee", - "activity_fee", - "hurdle_rate", - "analyst_name" + "activity_fee" ]] except Exception as e: print(e) @@ -1733,7 +1738,7 @@ def update_data_by_latest_ground_truth(): if __name__ == "__main__": - update_data_by_latest_ground_truth() + # update_data_by_latest_ground_truth() # set_provider_to_ground_truth( # groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx", # ground_truth_sheet="Sheet1", @@ -1741,7 +1746,7 @@ if __name__ == "__main__": # document_mapping_sheet="document_mapping" # ) - # set_mapping_to_data_side_documents_data() + set_mapping_to_data_side_documents_data() # source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx" # target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" diff --git a/sample_documents/aus_prospectus_verify_6_documents_sample.txt b/sample_documents/aus_prospectus_verify_6_documents_sample.txt new file mode 100644 index 0000000..ee8d7fd --- /dev/null +++ b/sample_documents/aus_prospectus_verify_6_documents_sample.txt @@ -0,0 +1,6 @@ 
+553449169 +539791362 +573372424 +448906722 +462780211 +563608192 \ No newline at end of file From dd1f8f76ae0203131e93429272e8e4aee9359579 Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 24 Mar 2025 17:12:13 -0500 Subject: [PATCH 2/9] update for metrics --- performance.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/performance.ipynb b/performance.ipynb index 73d2ca1..5818380 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -330,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -344,14 +344,14 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.7470 \t0.6739 \t0.8378 \t0.5962 \t52 \t31 \t0 \t15 \t6 \n", - "management_fee \t0.8046 \t0.7609 \t0.8537 \t0.6731 \t52 \t35 \t0 \t11 \t6 \n", - "performance_fee_costs \t0.7805 \t0.9697 \t0.6531 \t0.6538 \t50 \t32 \t2 \t1 \t17 \n", + "management_fee_and_costs \t0.9495 \t0.9038 \t1.0000 \t0.9038 \t52 \t47 \t0 \t5 \t0 \n", + "management_fee \t0.9495 \t0.9038 \t1.0000 \t0.9038 \t52 \t47 \t0 \t5 \t0 \n", + "performance_fee_costs \t0.9899 \t0.9800 \t1.0000 \t0.9808 \t50 \t49 \t2 \t1 \t0 \n", "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t52 \t52 \t0 \t0 \t0 \n", "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t16 \t16 \t36 \t0 \t0 \n", "buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", "sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", - "TOTAL \t0.9046 \t0.9149 \t0.9064 \t0.8462 \t258 \t202 \t106 \t27 \t29 \n", + "TOTAL 
\t0.9841 \t0.9697 \t1.0000 \t0.9698 \t258 \t247 \t106 \t11 \t0 \n", "Total Funds Matched - 52\n", "Total Funds Not Matched - 28\n", "Percentage of Funds Matched - 65.0\n" From 8ad472fb39656d09b55751515e3622c45e2081ac Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 24 Mar 2025 18:00:53 -0500 Subject: [PATCH 3/9] UPDATE metrics code file --- performance.ipynb | 117 +++------------------------------------------- 1 file changed, 7 insertions(+), 110 deletions(-) diff --git a/performance.ipynb b/performance.ipynb index 5818380..9451e53 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 35, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -330,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -457,104 +457,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Inv-Greencape Broadcap NEF', 'truth': '0.33', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP- Pendal Monthly Income Plus-NEF', 'truth': '0', 'generated': '0.02', 
'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Alternatives Growth Fund-NEF', 'truth': '0.41', 'generated': '0.13', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Perpetual Balanced Growth Trust-NEF', 'truth': '0', 'generated': '0.15', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'truth': '0', 'generated': '0.03', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -NE', 'truth': '0', 'generated': '0.15', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-NEF', 'truth': '0', 'generated': '0.03', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null 
and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': 
'0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Fairview Eq Ptnr Emg Comp', 'truth': '0.56', 'generated': '0.54', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'truth': '0', 'generated': '0.56', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 506913190, 'sec_name': 'FC W Pen-CFS TTR Defensive', 'truth': '', 'generated': '0.15', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 
523516443, 'sec_name': 'CFS MIF-Strategic Cash', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active High Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Moderately Defensive', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Defensive Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Higher Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 
'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 506913190, 'sec_name': 'FC W Pen-CFS TTR Defensive', 'truth': '', 'generated': '0.15', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 523516443, 'sec_name': 'CFS MIF-Strategic Cash', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active High Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Moderately Defensive', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 
'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Defensive Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Higher Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Inv-Greencape Broadcap NEF', 'truth': '0.33', 'generated': 
'0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP- Pendal Monthly Income Plus-NEF', 'truth': '0', 'generated': '0.02', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Alternatives Growth Fund-NEF', 'truth': '0.41', 'generated': '0.13', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Perpetual Balanced Growth Trust-NEF', 'truth': '0', 'generated': '0.15', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'truth': '0', 'generated': '0.03', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -NE', 'truth': '0', 'generated': '0.15', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-NEF', 'truth': '0', 'generated': '0.03', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth 
is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 
'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Fairview Eq Ptnr Emg Comp', 'truth': '0.56', 'generated': '0.54', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'truth': '0', 'generated': '0.56', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n" - ] - } - ], + "outputs": [], "source": [ "for message_list_element in message_list:\n", " if message_list_element[\"data_point\"] == \"performance_fee_costs\":\n", @@ -563,17 +468,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Excel file '/data/aus_prospectus/output/error_analysis/anomalies_found.xlsx' 
has been created successfully.\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "\n", From ff2325c72d8d2c0352456e7a0cbab50bf6af4902 Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 26 Mar 2025 18:58:45 -0500 Subject: [PATCH 4/9] 1. fix issue for assign values based on production name 2. optimize instructions for extract non-necessary data by Cost of Product message --- core/auz_nz/hybrid_solution_script.py | 44 ++++----- core/data_extraction.py | 17 +++- .../data_extraction_prompts_config.json | 35 ++++--- main.py | 4 +- performance.ipynb | 91 ++++++++++++------- ...s_prospectus_verify_3_documents_sample.txt | 3 + 6 files changed, 122 insertions(+), 72 deletions(-) create mode 100644 sample_documents/aus_prospectus_verify_3_documents_sample.txt diff --git a/core/auz_nz/hybrid_solution_script.py b/core/auz_nz/hybrid_solution_script.py index 4b0ec25..3ff701a 100644 --- a/core/auz_nz/hybrid_solution_script.py +++ b/core/auz_nz/hybrid_solution_script.py @@ -628,16 +628,16 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list) cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list) prompt_context = f""" - {prompt_instruction} + {prompt_instruction} - provider_name: {provider_name} + provider_name: {provider_name} - prediction_fund: - {cleaned_unmatched_pred_list} - - true_fund: - {cleaned_unmatched_db_list} - """ + prediction_fund: + {cleaned_unmatched_pred_list} + + true_fund: + {cleaned_unmatched_db_list} + """ # print(f"\ncleaned_unmatched_pred_list: ",cleaned_unmatched_pred_list) # print(f"cleaned_unmatched_db_list: ",cleaned_unmatched_db_list) # llm_response = get_llm_response(prompt_context) @@ -660,35 +660,35 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc # cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '') # llm_result = 
json.loads(cleaned_response) # logger.info(f"\n\n llm_result: {llm_result}") - for k,v in llm_result.items(): + for pred_name,db_name in llm_result.items(): # print("k: ",k) # print("v: ",v) og_db_index=-1 # og_pred_index = -1 og_pred_index_list = [] - if k in cleaned_unmatched_pred_list: + if pred_name in cleaned_unmatched_pred_list: for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): - if c_item==k: + if c_item==pred_name: og_pred_index_list.append(c_idx) # og_pred_index = cleaned_unmatched_pred_list.index(k) if len(og_pred_index_list) == 0: # sometimes, the raw name and db name reversed from the LLM response - if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list: + if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list: for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): - if c_item==v: + if c_item==db_name: og_pred_index_list.append(c_idx) # og_pred_index = cleaned_unmatched_pred_list.index(v) - og_db_index = cleaned_unmatched_db_list.index(k) + og_db_index = cleaned_unmatched_db_list.index(pred_name) # v and k are swapped - temp = v - v = k - k = temp + temp = db_name + db_name = pred_name + pred_name = temp if len(og_pred_index_list)==0: continue # og_db_index = cleaned_unmatched_db_list.index(v) - if og_db_index == -1 and v in cleaned_unmatched_db_list: - og_db_index = cleaned_unmatched_db_list.index(v) + if og_db_index == -1 and db_name in cleaned_unmatched_db_list: + og_db_index = cleaned_unmatched_db_list.index(db_name) # print("og_db_index: ",og_db_index, cleaned_unmatched_db_list) # print("unmatched_db_list: ",unmatched_db_list) @@ -697,7 +697,7 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc if i['pred_fund']==unmatched_pred_list[og_pred_index]: if og_db_index!=-1: i['db_fund']=unmatched_db_list[og_db_index] - i['cleaned_db_fund_name'] = v + i['cleaned_db_fund_name'] = db_name 
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]}) else: i['db_fund'] = '' @@ -705,8 +705,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc final_result.update({unmatched_pred_list[og_pred_index]:""}) i['llm_clean_pred_list'] = cleaned_unmatched_pred_list i['llm_clean_db_list'] = cleaned_unmatched_db_list, - i['llm_pred_fund'] = k - i['llm_matched_db_name'] = v + i['llm_pred_fund'] = pred_name + i['llm_matched_db_name'] = db_name i['llm_result'] = llm_result break diff --git a/core/data_extraction.py b/core/data_extraction.py index 277c1f3..4445ecd 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -11,7 +11,7 @@ from utils.sql_query_util import query_document_fund_mapping, query_investment_b from utils.logger import logger from utils.biz_utils import add_slash_to_text_as_regex, clean_text, \ get_most_similar_name, remove_abundant_data, replace_special_table_header - +from utils.similarity import Similarity class DataExtraction: def __init__( @@ -511,7 +511,7 @@ class DataExtraction: raw_name_list = list(raw_name_dict.keys()) raw_name_as_production_name = None for raw_name in raw_name_list: - if raw_name.lower() in self.document_production.lower(): + if self.is_production_name(raw_name): raw_name_as_production_name = raw_name break datapoint_list_with_production_name = [] @@ -532,7 +532,7 @@ class DataExtraction: fund_name = data_item.get("fund_name", "") share_name = data_item.get("share_name", "") raw_name = self.get_raw_name(fund_name, share_name) - if raw_name.lower() in self.document_production.lower(): + if self.is_production_name(raw_name): dp_keys = [key for key in keys if key not in ["fund_name", "share_name", "management_fee_and_costs", @@ -584,6 +584,15 @@ class DataExtraction: extract_data["data"].remove(remove_item) return data_list, datapoint_list_with_production_name + def is_production_name(self, text: str): + if text.lower() in 
self.document_production.lower(): + return True + simlarity_util = Similarity() + similarity = simlarity_util.edit_distance_similarity(text, self.document_production) + if similarity > 0.93: + return True + return False + def remove_duplicate_data(self, data_list: list): """ The purpose is to remove duplicate data in the different pages. @@ -821,7 +830,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [4, 5]: + # if page_num not in [14, 15]: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 22f8105..e9ab8c9 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -234,7 +234,8 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}", "\n", - "F. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", + "F. If columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\" appear, please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\" for EACH SPECIFIC investment option. 
", + "DO NOT assume these values apply to other investment options mentioned elsewhere in the context or from provided examples.", "---Example 1 Start---", "\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n", "---Example 1 End---", @@ -337,7 +338,16 @@ "Pre-retirement pension \nWe generally calculate \nand deduct this fee daily when unit \nprices are determined. \nHigh Growth 0.48%, Growth 0.50%", "---Example 2 End---", "The output should be:", - "{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}" + "{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}", + "L. DO NOT extract management fees from \"Cost of product\" summaries. ", + "\"Cost of product\" figures should not be treated as 'Investment fees and costs'.", + "---Example Start---", + "Investment option Cost of product \nCash $141.00", + "---Example End---", + "FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!! JUST RETURN EMPTY RESPONSE!!!", + "The output should be:", + "{\"data\": []}", + "M. Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. 
Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option." ], "administration_fees":[ "Administration fees and costs and total annual dollar-based charges are share class level data.", @@ -406,7 +416,16 @@ "---Example 4 Start---", "Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n", "---Example 4 End---", - "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees." + "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees.", + "B. The administration fee and costs/ total annual dollar-based charges are with production name, other data points/ values are with specific fund/ share name(s).", + "---Example Start---", + "My Super \nType of fee or cost Amount How and when paid \nOngoing annual fees and costs 1 \nAdministration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a \nmaximum of $1,000 p.a.) \n$0.50 per week deducted from your account\nbalance at the end of each month or on exit.\nPercentage fee taken into account in the \ndaily calculation of unit prices. \nInvestment fees and costs \n2 \nOption % of option’s assets* \nFund1 0.12%\n", + "---Example End---", + "According to example, \"My Super\" is with \"Administration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. 
of account balance (subject to a maximum of $1,000 p.a.) \n$0.50 per week deducted from your account balance at the end of each month or on exit.\"", + "so administration_fees is 0.17, total_annual_dollar_based_charges is 0.50 * 52 = 26, with production name: \"My Super\".", + "\"Fund1\" is with specific fund/ share name, so management_fee_and_costs and management_fee are: 0.12", + "The output should be:", + "{\"data\": [{\"fund name\": \"My Super\", \"share name\": \"My Super\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 26}, {\"fund name\": \"Fund1\", \"share name\": \"Fund1\", \"management_fee_and_costs\": 0.12, \"management_fee\": 0.12}]}" ], "total_annual_dollar_based_charges": [ "Total annual dollar-based charges are share class level data.", @@ -845,19 +864,12 @@ }, "output_requirement": { "common": [ - "If possible, please extract fund name, share name, data points values as the output.", "If find fund name, and exist sub fund name, please output fund name + sub fund name, e.g. fund name is \"Black Rock European\", sub fund name is \"Growth\", the output fund name should be: \"Black Rock European Growth\".", "Only output the data point which with relevant value.", "Don't ignore the data point which with negative value, e.g. -0.12, -1.13", "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.", "Please also output the data point reported name in context.", - "Example:", - "---Example Start---", - "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. 
\n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\nAlphinity Sustainable Share Fund\n0.95\n0.60\n0.42\n1.55\n1.37\nAntipodes Global Fund\n1.20\n0.60\n0.42\n1.80\n1.62\n", - "---Example End---", - "Output:", - "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"management_fee_and_costs\": 1.37, \"management_fee\": 0.95, \"administration_fees\": 0.42}, {\"fund name\": \"Antipodes Global Fund\", \"share name\": \"Antipodes Global Fund\", \"management_fee_and_costs\": 1.62, \"management_fee\": 1.20, \"administration_fees\": 0.42}]}", "Fund level data: (\"fund name\" and \"datapoint_name\") and share level data: (\"fund name\", \"share name\", \"datapoint_name\") should be output separately.", "The output should be JSON format, the format is like below example(s):" ], @@ -939,7 +951,8 @@ }, "end": [ "Only output JSON data.", - "Don't output the value which not exist in context.", + "Please re-check before output answer, DO NOT output the data point and value which not exist in context.", + "DO NOT use the example values from a representative fund (such as Balanced Growth) for other funds unless explicitly mentioned", "If can't find fund name or share class name in context, please output empty JSON data: {\"data\": []}" ] } \ No newline at end of file diff --git a/main.py b/main.py index 187ae91..1b5aa81 100644 --- a/main.py +++ b/main.py @@ -1452,7 +1452,7 @@ def get_aus_prospectus_document_category(): def test_post_adjust_extract_data(): - doc_id = "397107472" + doc_id = "462780211" pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1538,7 +1538,7 @@ if __name__ == 
"__main__": with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] document_mapping_file = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx" - # special_doc_id_list = ["441280757"] + special_doc_id_list = ["462780211"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/performance.ipynb b/performance.ipynb index 9451e53..e90267b 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -79,7 +79,7 @@ "\n", " return headers, data\n", "\n", - "def index_data_by_key(data, key_index, secondary_key_index, header):\n", + "def index_data_by_key(data, header):\n", " \"\"\"Index data by primary and secondary keys (doc_id and sec_name).\"\"\"\n", " indexed_data = defaultdict(dict)\n", " \n", @@ -114,10 +114,12 @@ " value2 = convert_if_number(value2)\n", " return value1 == value2\n", "\n", - "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):\n", + "def compare_data(ground_truth, generated_results, headers, intersection_list, document_list):\n", " \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n", " results = {}\n", - " funds_matched, funds_not_matched = 0, 0\n", + " share_name_list = []\n", + " not_matched_share_name_list = []\n", + " share_matched, share_not_matched = 0, 0\n", " # Initialize result dictionaries for each column except 'doc_id'\n", " for keys in headers:\n", " if keys != 
\"doc_id\":\n", @@ -180,13 +182,28 @@ " \"truth\": truth, \"generated\": generated, \"error\": \"Generated is null and truth is not null\"}\n", " message_list.append(message)\n", " results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n", - " funds_matched += 1\n", + " if sec_name not in share_name_list:\n", + " share_name_list.append(sec_name)\n", + " share_matched += 1\n", " else:\n", - " funds_not_matched += 1\n", + " if sec_name not in share_name_list:\n", + " share_name_list.append(sec_name)\n", + " if sec_name not in not_matched_share_name_list:\n", + " # If the share class is not found in the generated results, count it as not matched\n", + " # print(\"Share class not matched - \", sec_name, doc_id)\n", + " message = {\"data_point\": \"Share Class\", \"doc_id\": doc_id, \"sec_name\": sec_name, \n", + " \"truth\": \"\", \"generated\": \"\", \"error\": \"Share class not found in generated results\"}\n", + " message_list.append(message)\n", + " share_not_matched += 1\n", + " not_matched_share_name_list.append(sec_name)\n", " else:\n", " # If the entire document is not found, count all funds as not matched\n", - " funds_not_matched += len(secs)\n", - " return results, message_list, funds_matched, funds_not_matched\n", + " message = {\"data_point\": \"Document\", \"doc_id\": doc_id, \"sec_name\": \"\",\n", + " \"truth\": \"\", \"generated\": \"\", \"error\": \"Document not found in generated results\"}\n", + " message_list.append(message)\n", + " \n", + " # share_not_matched += len(secs)\n", + " return results, message_list, share_matched, share_not_matched, not_matched_share_name_list\n", "\n", "def clean_text(text: str):\n", " if text is None or len(text) == 0:\n", @@ -330,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -352,9 +369,10 @@ "buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", "sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", 
"TOTAL \t0.9841 \t0.9697 \t1.0000 \t0.9698 \t258 \t247 \t106 \t11 \t0 \n", - "Total Funds Matched - 52\n", - "Total Funds Not Matched - 28\n", - "Percentage of Funds Matched - 65.0\n" + "Total Shares Matched - 52\n", + "Total Shares Not Matched - 18\n", + "Percentage of Shares Matched - 74.28571428571429\n", + "Not Matched Shares Name List - ['Vision Balanced Growth Pen', 'CFS FC W Pen-Ausbil Aust Active Equity', 'CFS FC W Pen-AXA IM TTR W Sust Eq', 'CFS FC W PSup-FirstRate Term Dep (10yr)', 'CFS FC W PSup-FirstRate Term Dep (15yr)', 'CFS FC W PSup-FirstRate Term Dep (2yr)', 'CFS FC W PSup-FirstRate Term Dep (3yr)', 'CFS FC W PSup-FirstRate Term Dep (5yr)', 'CFS FC W PSup-FirstRate Term Dep (7yr)', 'AV Australian Shares TTR', 'AV Balanced Growth TTR', 'AV Cash TTR', 'AV Conservative Growth TTR', 'AV Diversified Index TTR', 'AV Growth TTR', 'AV High Growth TTR', 'AV International Shares TTR', 'AV Stable Growth TTR']\n" ] } ], @@ -370,20 +388,17 @@ "6. Set F1-Score to the first column in the metrics table\n", "\"\"\"\n", "\n", - "funds_matched = 0\n", - "funds_not_matched = 0\n", - "\n", "# Load the files\n", "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n", "headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n", "\n", "# Assuming doc_id is the first column and fund_name is the second column\n", - "doc_id_index = 0\n", - "fund_name_index = 1\n", + "# doc_id_index = 0\n", + "# fund_name_index = 1\n", "\n", "# Index the data\n", - "ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n", - "generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n", + "ground_truth_indexed = index_data_by_key(ground_truth_data, headers_gt)\n", + "generated_results_indexed = index_data_by_key(generated_results_data, headers_gen)\n", "\n", "intersection = set(headers_gen).intersection(headers_gt)\n", "\n", @@ -425,20 +440,21 @@ " \n", " 
print(\"All Providers Results: \")\n", " print(\"Document List File - \", document_list_file)\n", - " comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, \n", - " generated_results_indexed, \n", - " headers_gt, doc_id_index, \n", - " fund_name_index, \n", - " intersection_list,\n", - " funds_matched, \n", - " funds_not_matched,\n", - " document_list)\n", + " comparison_results, message_list, share_matched, \\\n", + " share_not_matched, not_matched_share_name_list = compare_data(ground_truth_indexed, \n", + " generated_results_indexed,\n", + " headers_gt,\n", + " intersection_list,\n", + " document_list)\n", " metrics_list = print_metrics_table(comparison_results)\n", - " print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n", - " print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n", + " print(\"Total Shares Matched - \" + str(share_matched) + \"\\nTotal Shares Not Matched - \" + str(share_not_matched))\n", + " print(\"Percentage of Shares Matched - \" + str((share_matched/(share_matched + share_not_matched))*100))\n", + " print(\"Not Matched Shares Name List - \", not_matched_share_name_list)\n", "\n", " metrics_df = pd.DataFrame(metrics_list)\n", " message_df = pd.DataFrame(message_list)\n", + " share_matched_data = {\"share_matched\": share_matched, \"share_not_matched\": share_not_matched, \"not_matched_share_name_list\": not_matched_share_name_list}\n", + " share_matched_df = pd.DataFrame([share_matched_data])\n", "\n", " output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n", " os.makedirs(output_metrics_folder, exist_ok=True)\n", @@ -452,14 +468,23 @@ " metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)\n", " with pd.ExcelWriter(metrics_file_path) as writer:\n", " metrics_df.to_excel(writer, sheet_name=\"metrics_data\", index=False)\n", - " 
message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n" + " message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n", + " share_matched_df.to_excel(writer, sheet_name=\"share_matched_data\", index=False)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'data_point': 'performance_fee_costs', 'doc_id': 539791362, 'sec_name': 'REST High Growth Pension', 'truth': '0.6', 'generated': '0.08', 'error': 'Truth is not equal with generated'}\n" + ] + } + ], "source": [ "for message_list_element in message_list:\n", " if message_list_element[\"data_point\"] == \"performance_fee_costs\":\n", diff --git a/sample_documents/aus_prospectus_verify_3_documents_sample.txt b/sample_documents/aus_prospectus_verify_3_documents_sample.txt new file mode 100644 index 0000000..8e69c2c --- /dev/null +++ b/sample_documents/aus_prospectus_verify_3_documents_sample.txt @@ -0,0 +1,3 @@ +539999907 +455235248 +448576924 \ No newline at end of file From dc560e1e010c3989b1c2fdba8124ac9d64a7711d Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 26 Mar 2025 23:14:28 -0500 Subject: [PATCH 5/9] update metrics --- main.py | 10 ++-- performance.ipynb | 124 ++++++++++++++++++++++++++++++---------------- 2 files changed, 89 insertions(+), 45 deletions(-) diff --git a/main.py b/main.py index 1b5aa81..0880afb 100644 --- a/main.py +++ b/main.py @@ -1532,13 +1532,17 @@ if __name__ == "__main__": doc_source = "aus_prospectus" # doc_source = "emea_ar" if doc_source == "aus_prospectus": + # document_sample_file = ( + # r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" + # ) document_sample_file = ( - r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" + r"./sample_documents/aus_prospectus_46_documents_sample.txt" ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = 
[doc_id.strip() for doc_id in f.readlines()] - document_mapping_file = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx" - special_doc_id_list = ["462780211"] + # document_mapping_file = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx" + document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" + # special_doc_id_list = ["553449169"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/performance.ipynb b/performance.ipynb index e90267b..0edf296 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 31, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -15,49 +15,51 @@ "from utils.similarity import Similarity\n", "\n", "\n", - "# imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", - "# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", - "# \"Minimum Initial Investment\", \"Benchmark\"]\n", - "\n", - "\n", - "# imp_datapoints_mapping = {\n", - "# \"Management Fee and Costs\": \"management_fee_and_costs\",\n", - "# \"Management Fee\": \"management_fee\",\n", - "# \"Performance fee and cost\": \"performance_fee_costs\",\n", - "# \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", - "# \"Administration Fee and costs\": \"administration_fees\",\n", - "# \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", - "# \"Buy Spread\": \"buy_spread\",\n", - "# \"Sell Spread\": \"sell_spread\",\n", - "# \"Performance Fee\": \"PerformanceFeeCharged\",\n", - "# \"Minimum Initial 
Investment\": \"minimum_initial_investment\",\n", - "# \"Benchmark\": \"benchmark_name\"\n", - "# }\n", - "\n", - "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\",\n", - " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n", + "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", + " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", + " \"Minimum Initial Investment\", \"Benchmark\"]\n", "\n", "\n", "imp_datapoints_mapping = {\n", " \"Management Fee and Costs\": \"management_fee_and_costs\",\n", " \"Management Fee\": \"management_fee\",\n", " \"Performance fee and cost\": \"performance_fee_costs\",\n", + " \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", " \"Administration Fee and costs\": \"administration_fees\",\n", " \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", " \"Buy Spread\": \"buy_spread\",\n", - " \"Sell Spread\": \"sell_spread\"\n", + " \"Sell Spread\": \"sell_spread\",\n", + " \"Performance Fee\": \"PerformanceFeeCharged\",\n", + " \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", + " \"Benchmark\": \"benchmark_name\"\n", "}\n", "\n", - "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", + "# imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\",\n", + "# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n", + "\n", + "\n", + "# imp_datapoints_mapping = {\n", + "# \"Management Fee and Costs\": \"management_fee_and_costs\",\n", + "# \"Management Fee\": \"management_fee\",\n", + "# 
\"Performance fee and cost\": \"performance_fee_costs\",\n", + "# \"Administration Fee and costs\": \"administration_fees\",\n", + "# \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", + "# \"Buy Spread\": \"buy_spread\",\n", + "# \"Sell Spread\": \"sell_spread\"\n", + "# }\n", + "\n", + "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", + "# path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250324170432.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250326224343.xlsx\"\n", + "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250326203744.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -347,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -361,18 +363,56 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9495 \t0.9038 \t1.0000 \t0.9038 \t52 \t47 \t0 \t5 \t0 \n", - "management_fee \t0.9495 \t0.9038 \t1.0000 \t0.9038 \t52 \t47 \t0 \t5 \t0 \n", - "performance_fee_costs \t0.9899 \t0.9800 \t1.0000 \t0.9808 \t50 \t49 \t2 \t1 \t0 \n", - "administration_fees \t1.0000 
\t1.0000 \t1.0000 \t1.0000 \t52 \t52 \t0 \t0 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t16 \t16 \t36 \t0 \t0 \n", - "buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", - "sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", - "TOTAL \t0.9841 \t0.9697 \t1.0000 \t0.9698 \t258 \t247 \t106 \t11 \t0 \n", - "Total Shares Matched - 52\n", + "management_fee_and_costs \t0.9375 \t0.8993 \t0.9791 \t0.8829 \t424 \t375 \t2 \t42 \t8 \n", + "management_fee \t0.9571 \t0.9353 \t0.9799 \t0.9180 \t424 \t390 \t2 \t27 \t8 \n", + "performance_fee_costs \t0.8801 \t0.8601 \t0.9011 \t0.8431 \t291 \t246 \t114 \t40 \t27 \n", + "interposed_vehicle_performance_fee_cost \t0.9172 \t0.8471 \t1.0000 \t0.9696 \t73 \t72 \t342 \t13 \t0 \n", + "administration_fees \t0.9081 \t0.8317 \t1.0000 \t0.9602 \t84 \t84 \t326 \t17 \t0 \n", + "total_annual_dollar_based_charges \t0.9930 \t0.9861 \t1.0000 \t0.9977 \t71 \t71 \t355 \t1 \t0 \n", + "buy_spread \t0.9291 \t0.8930 \t0.9681 \t0.8806 \t376 \t334 \t42 \t40 \t11 \n", + "sell_spread \t0.9291 \t0.8930 \t0.9681 \t0.8806 \t376 \t334 \t42 \t40 \t11 \n", + "minimum_initial_investment \t0.9507 \t0.9633 \t0.9383 \t0.9297 \t308 \t289 \t108 \t11 \t19 \n", + "benchmark_name \t0.9139 \t0.8846 \t0.9452 \t0.9391 \t156 \t138 \t263 \t18 \t8 \n", + "TOTAL \t0.9316 \t0.8994 \t0.9680 \t0.9201 \t2583 \t2333 \t1596 \t249 \t92 \n", + "Total Shares Matched - 379\n", + "Total Shares Not Matched - 128\n", + "Percentage of Shares Matched - 74.7534516765286\n", + "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA IP-OnePath Australian Shares', 'ANZ OA IP-OnePath Diversified Fixed Interest', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OneAnswer Investment Portfolio - Schroder 
Strategic Growth -NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond NE', 'OnePath OA Investment Portfolio-BlackRock Tactical Growth EF', 'OnePath OA Inv-Greencape Broadcap EF', 'OnePath OA Inv-Nikko AM Australian Shares EF', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP- Pendal Monthly Income Plus-NEF', 'OnePath OA IP-Alternatives Growth Fund-EF/Sel', 'OnePath OA IP-Alternatives Growth Fund-NEF', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-EF/Sel', 'OnePath OA IP-Bennelong Australian Equities-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-NEF', 'OnePath OA IP-Fidelity Australian Equities-EF/Sel', 'OnePath OA IP-Investors Mutual Australian Share Trust- EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-Merlon Australian Share Income-EF/Sel', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA 
IP-Perpetual Balanced Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Balanced Growth Trust-NEF', 'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Balanced Trust-EF/Sel', 'OnePath OA IP-UBS Balanced Trust-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Defensive Trust-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -NE', 'OnePath OneAnswer Investment Portfolio - First Sentier Imputation -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Australian Shares Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - 
OnePath High Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath High Growth Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Schroder Strategic Growth -EF/Sel', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-NEF', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKSF - PIMCO Div. 
Fixed Interest Wholesale Class', 'MLC MKSF - Platinum Asia Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'HOSTPLUS Fixed Interest Indexed Super', 'Lifeplan Investment Bond Perpetual Balanced Growth', 'Lifeplan Investment Bond Perpetual Conservative Growth', 'Lifeplan Investment Bond Perpetual Industrial Share', 'Lifeplan Investment Bond Vanguard® Australian Shares Index', 'Dimensional Australian Core Equity Trust', 'FC W Pen-CFS TTR Global Infrastructure Securities', 'CFS MIF-High Growth', 'CFS MIF-Property Securities', 'CFS MIF-Geared Share NEF', 'CFS MIF-Australian Share', 'CFS MIF-Geared Global Share', 'CFS MIF-Global Tech & Comm', 'CFS MIF-Stewart Inv Worldwide Leaders Sustainability', 'CFS MIF-Geared Share', 'CFS MIF-Diversified', 'CFS MIF-Janus Henderson Global Natural Resources Fund', 'CFS MIF-Macquarie Australian Emerging Companies', 'CFS MIF-Balanced', 'CFS MIF-Conservative', 'CFS MIF-Imputation', 'CFS MIF-Global Health & Biotech', 'Dimensional Australia Core Equity Trust - Active ETF']\n", + "All Providers Results: \n", + "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", + "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", + "management_fee_and_costs \t0.9621 \t0.9270 \t1.0000 \t0.9270 \t177 \t165 \t0 \t13 \t0 \n", + "management_fee \t0.9886 \t0.9775 \t1.0000 \t0.9775 \t177 \t174 \t0 \t4 \t0 \n", + "performance_fee_costs \t0.8557 \t0.8037 \t0.9149 \t0.8371 \t100 \t86 \t63 \t21 \t8 \n", + "interposed_vehicle_performance_fee_cost \t0.8966 \t0.8125 \t1.0000 \t0.9326 \t53 \t52 \t114 \t12 \t0 \n", + "administration_fees \t0.9655 \t0.9333 \t1.0000 \t0.9944 \t14 \t14 \t163 \t1 \t0 
\n", + "buy_spread \t0.9496 \t0.9091 \t0.9938 \t0.9045 \t175 \t160 \t1 \t16 \t1 \n", + "sell_spread \t0.9464 \t0.9034 \t0.9938 \t0.8989 \t175 \t159 \t1 \t17 \t1 \n", + "minimum_initial_investment \t0.9064 \t0.9528 \t0.8643 \t0.8596 \t140 \t121 \t32 \t6 \t19 \n", + "benchmark_name \t0.9186 \t0.8587 \t0.9875 \t0.9213 \t89 \t79 \t85 \t13 \t1 \n", + "TOTAL \t0.9322 \t0.8976 \t0.9727 \t0.9170 \t1100 \t1010 \t459 \t103 \t122 \n", + "Total Shares Matched - 173\n", "Total Shares Not Matched - 18\n", - "Percentage of Shares Matched - 74.28571428571429\n", - "Not Matched Shares Name List - ['Vision Balanced Growth Pen', 'CFS FC W Pen-Ausbil Aust Active Equity', 'CFS FC W Pen-AXA IM TTR W Sust Eq', 'CFS FC W PSup-FirstRate Term Dep (10yr)', 'CFS FC W PSup-FirstRate Term Dep (15yr)', 'CFS FC W PSup-FirstRate Term Dep (2yr)', 'CFS FC W PSup-FirstRate Term Dep (3yr)', 'CFS FC W PSup-FirstRate Term Dep (5yr)', 'CFS FC W PSup-FirstRate Term Dep (7yr)', 'AV Australian Shares TTR', 'AV Balanced Growth TTR', 'AV Cash TTR', 'AV Conservative Growth TTR', 'AV Diversified Index TTR', 'AV Growth TTR', 'AV High Growth TTR', 'AV International Shares TTR', 'AV Stable Growth TTR']\n" + "Percentage of Shares Matched - 90.57591623036649\n", + "Not Matched Shares Name List - ['Dimensional Australian Core Equity Trust', 'FC W Pen-CFS TTR Global Infrastructure Securities', 'CFS MIF-High Growth', 'CFS MIF-Property Securities', 'CFS MIF-Geared Share NEF', 'CFS MIF-Australian Share', 'CFS MIF-Geared Global Share', 'CFS MIF-Global Tech & Comm', 'CFS MIF-Stewart Inv Worldwide Leaders Sustainability', 'CFS MIF-Geared Share', 'CFS MIF-Diversified', 'CFS MIF-Janus Henderson Global Natural Resources Fund', 'CFS MIF-Macquarie Australian Emerging Companies', 'CFS MIF-Balanced', 'CFS MIF-Conservative', 'CFS MIF-Imputation', 'CFS MIF-Global Health & Biotech', 'Dimensional Australia Core Equity Trust - Active ETF']\n", + "All Providers Results: \n", + "Document List File - 
./sample_documents/aus_prospectus_17_documents_sample.txt\n", + "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", + "management_fee_and_costs \t0.9190 \t0.8787 \t0.9633 \t0.8514 \t247 \t210 \t2 \t29 \t8 \n", + "management_fee \t0.9330 \t0.9038 \t0.9643 \t0.8755 \t247 \t216 \t2 \t23 \t8 \n", + "performance_fee_costs \t0.8939 \t0.8939 \t0.8939 \t0.8474 \t191 \t160 \t51 \t19 \t19 \n", + "interposed_vehicle_performance_fee_cost \t0.9756 \t0.9524 \t1.0000 \t0.9960 \t20 \t20 \t228 \t1 \t0 \n", + "administration_fees \t0.8974 \t0.8140 \t1.0000 \t0.9357 \t70 \t70 \t163 \t16 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t71 \t71 \t178 \t0 \t0 \n", + "buy_spread \t0.9110 \t0.8788 \t0.9457 \t0.8635 \t201 \t174 \t41 \t24 \t10 \n", + "sell_spread \t0.9138 \t0.8838 \t0.9459 \t0.8675 \t201 \t175 \t41 \t23 \t10 \n", + "minimum_initial_investment \t0.9853 \t0.9711 \t1.0000 \t0.9799 \t168 \t168 \t76 \t5 \t0 \n", + "benchmark_name \t0.9077 \t0.9219 \t0.8939 \t0.9518 \t67 \t59 \t178 \t5 \t7 \n", + "TOTAL \t0.9337 \t0.9098 \t0.9607 \t0.9169 \t1483 \t1323 \t960 \t145 \t184 \n", + "Total Shares Matched - 249\n", + "Total Shares Not Matched - 110\n", + "Percentage of Shares Matched - 69.35933147632312\n", + "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA IP-OnePath Australian Shares', 'ANZ OA IP-OnePath Diversified Fixed Interest', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OneAnswer Investment Portfolio - Schroder Strategic Growth -NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. 
Rowe Price Dyna Gl Bond NE', 'OnePath OA Investment Portfolio-BlackRock Tactical Growth EF', 'OnePath OA Inv-Greencape Broadcap EF', 'OnePath OA Inv-Nikko AM Australian Shares EF', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP- Pendal Monthly Income Plus-NEF', 'OnePath OA IP-Alternatives Growth Fund-EF/Sel', 'OnePath OA IP-Alternatives Growth Fund-NEF', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-EF/Sel', 'OnePath OA IP-Bennelong Australian Equities-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-NEF', 'OnePath OA IP-Fidelity Australian Equities-EF/Sel', 'OnePath OA IP-Investors Mutual Australian Share Trust- EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-Merlon Australian Share Income-EF/Sel', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual Balanced Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Balanced Growth Trust-NEF', 
'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Balanced Trust-EF/Sel', 'OnePath OA IP-UBS Balanced Trust-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Defensive Trust-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -NE', 'OnePath OneAnswer Investment Portfolio - First Sentier Imputation -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Australian Shares Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath High Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath High 
Growth Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Schroder Strategic Growth -EF/Sel', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-NEF', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKSF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKSF - Platinum Asia Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'HOSTPLUS Fixed Interest Indexed Super', 'Lifeplan Investment Bond Perpetual Balanced Growth', 'Lifeplan Investment Bond Perpetual Conservative Growth', 'Lifeplan Investment Bond Perpetual Industrial Share', 'Lifeplan Investment Bond Vanguard® Australian Shares Index']\n" ] } ], @@ -427,10 +467,10 @@ "\n", "print(\"\\n\")\n", "print(\"\\n\")\n", - "# document_list_file_list = [None, \n", - "# \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", - "# \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n", - "document_list_file_list = [None]\n", + "document_list_file_list = [None, 
\n", + " \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", + " \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n", + "# document_list_file_list = [None]\n", "for document_list_file in document_list_file_list:\n", " document_list = None\n", " if document_list_file is not None:\n", From d9259923269c8ea351f42440b6a3f62c9054de61 Mon Sep 17 00:00:00 2001 From: Blade He Date: Thu, 27 Mar 2025 16:00:19 -0500 Subject: [PATCH 6/9] 1. Support the keywords of complex special cases to be regex 2. Support set sub-datapoints list to complex special cases node. 3. Simplify the common management fee and costs instructions. 4. Add markdown title characters: ## or ### to instructions. --- core/data_extraction.py | 48 ++++-- .../data_extraction_prompts_config.json | 105 +++++++++---- main.py | 9 +- performance.ipynb | 143 +++++++++++++++++- 4 files changed, 244 insertions(+), 61 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index 4445ecd..0a14620 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -830,7 +830,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [14, 15]: + # if page_num not in [13, 14]: # continue if page_num in handled_page_num_list: continue @@ -1646,7 +1646,7 @@ class DataExtraction: instructions.extend(image_features) instructions.append("\n") - instructions.append("Datapoints Reported name:\n") + instructions.append("## Datapoints Reported name:\n") instructions.append("Please look for relevant reported names and similar variations in the context.\n") reported_name_info_in_instructions = self.instructions_config.get("reported_name", {}) for datapoint in datapoints: @@ -1746,7 +1746,7 @@ class DataExtraction: none_value_example_count += 1 instructions.append("\n") - instructions.append("Data business features:\n") + instructions.append("## Data business features:\n") 
data_business_features = self.instructions_config.get( "data_business_features", {} ) @@ -1754,7 +1754,7 @@ class DataExtraction: instructions.append(common) instructions.append("\n") - instructions.append("Datapoints investment level:\n") + instructions.append("## Datapoints investment level:\n") investment_level_info = data_business_features.get("investment_level", {}) for datapoint in datapoints: investment_level = investment_level_info.get(datapoint, "") @@ -1762,7 +1762,7 @@ class DataExtraction: instructions.append("\n") instructions.append("\n") - instructions.append("Datapoints value range:\n") + instructions.append("## Datapoints value range:\n") data_value_range_info = data_business_features.get("data_value_range", {}) for datapoint in datapoints: data_value_range = data_value_range_info.get(datapoint, "") @@ -1776,7 +1776,12 @@ class DataExtraction: # 2. To load it by keywords, is to avoid for simple case, the prompts are too long. complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") with_special_rule_title = False + found_sub_datapoints = [] for datapoint in datapoints: + # If some complex special rule is found, and with sub datapoints, + # need not to load relevant rule again. 
+ if datapoint in found_sub_datapoints: + continue find_complex_special_rule = False if page_text is not None and len(page_text) > 0: complex_special_rule_list = complex_special_rule.get(datapoint, []) @@ -1784,29 +1789,41 @@ class DataExtraction: complex_keywords = complex_special_rule.get("keywords", []) if len(complex_keywords) == 0: continue + # support keywords to be pure text or regex + keywords_is_regex = complex_special_rule.get("keywords_is_regex", False) exist_keywords = False for special_keywords in complex_keywords: - special_keywrods_regex = add_slash_to_text_as_regex(special_keywords) - if special_keywords in page_text or \ - re.search(special_keywrods_regex, page_text) is not None: - exist_keywords = True - break + if keywords_is_regex: + if re.search(special_keywords, page_text) is not None: + exist_keywords = True + break + else: + special_keywrods_regex = add_slash_to_text_as_regex(special_keywords) + if special_keywords in page_text or \ + re.search(special_keywrods_regex, page_text) is not None: + exist_keywords = True + break if exist_keywords: complex_prompts_list = complex_special_rule.get("prompts", []) if len(complex_prompts_list) > 0: if not with_special_rule_title: - instructions.append("Special rule:\n") + instructions.append("## Special rule:\n") with_special_rule_title = True complex_prompts = "\n".join(complex_prompts_list) instructions.append(complex_prompts) instructions.append("\n\n") find_complex_special_rule = True + # If the complex special rule is found, need to find the sub datapoints + # and add them to the found_sub_datapoints list. 
+ sub_datapoints = complex_special_rule.get("sub_datapoints", []) + if len(sub_datapoints) > 0: + found_sub_datapoints.extend(sub_datapoints) if find_complex_special_rule: continue special_rule_list = special_rule_info.get(datapoint, []) if len(special_rule_list) > 0: if not with_special_rule_title: - instructions.append("Special rule:\n") + instructions.append("## Special rule:\n") with_special_rule_title = True special_rule = "\n".join(special_rule_list) instructions.append(special_rule) @@ -1814,7 +1831,7 @@ class DataExtraction: instructions.append("\n") - instructions.append("Special cases:\n") + instructions.append("## Special cases:\n") special_cases = self.instructions_config.get("special_cases", {}) special_cases_common_list = special_cases.get("common", []) special_cases_number = 1 @@ -1827,7 +1844,7 @@ class DataExtraction: contents_list = special_cases_common.get("contents", []) contents = "\n".join(contents_list) instructions.append(contents) - instructions.append("\n\n") + instructions.append("\n") for datapoint in datapoints: special_case_list = special_cases.get(datapoint, []) @@ -1841,9 +1858,8 @@ class DataExtraction: contents = "\n".join(contents_list) instructions.append(contents) instructions.append("\n") - instructions.append("\n") - instructions.append("Output requirement:\n") + instructions.append("## Output requirement:\n") output_requirement = self.instructions_config.get("output_requirement", {}) output_requirement_common_list = output_requirement.get("common", []) instructions.append("\n".join(output_requirement_common_list)) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index e9ab8c9..cf2b9e3 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -143,6 +143,7 @@ }, "special_rule": { "management_fee_and_costs": [ + "### Management fee and cost", 
"Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.", "If there are multiple Management fee and costs reported names, here is the priority rule:", "A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", @@ -301,34 +302,14 @@ "So the output should be:", "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", "\n", - "I. Some table is very complex, with many data points columns, please extract the relevant values.", - "---Example 1 Start---", - "Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) 
\nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n", - "---Example 1 End---", - "For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ", - "\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ", - "\"Performance fee (p.a.)\" as performance_fee_costs, ", - "\"Buy/sell spread (%)\" as buy_spread and sell_spread.", - "If one row has 5 decimal numbers, ", - "the 2nd decimal number is the administration_fees, ", - "the 3rd decimal number is the management_fee_and_costs and management_fee, ", - "the 4th decimal number is the performance_fee_costs, ", - "the 5th decimal number is the buy_spread and sell_spread.", - "If one row has 4 decimal numbers, ", - "the 2nd decimal number is the administration_fees, ", - "the 3rd decimal number is the management_fee_and_costs and management_fee, ", - "the 4th decimal number is the buy_spread and sell_spread.", - "Please always ignore the 1st decimal number, we need not the total sum values.", - "The output should be:", - "{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", - "J. If exist **\"Maximum management fee\"** in context, please ignore relevant values.", + "I. If exist **\"Maximum management fee\"** in context, please ignore relevant values.", "---Example Start---", "Fund name \nMaximum \nmanagement \nfee (p.a.) 
\nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%", "---Example End---", "The values in example is **Maximum management fee**, should ignore all of them.", "The Output should be:", "{\"data\": []}", - "K. The management fee and costs in paragraph with speficic fund/ share prefix name: \"Account-based pension\" or \"Pre-retirement pension\"", + "J. The management fee and costs in paragraph with specific fund/ share prefix name: \"Account-based pension\" or \"Pre-retirement pension\"", "---Example 1 Start---", "Account-based pension \nInvestment fees \nand costs 2 \nHigh Growth 0.45%, Growth 0.49%", "---Example 1 End---", @@ -339,7 +320,7 @@ "---Example 2 End---", "The output should be:", "{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}", - "L. DO NOT extract management fees from \"Cost of product\" summaries. ", + "K. DO NOT extract management fees from \"Cost of product\" summaries. ", "\"Cost of product\" figures should not be treated as 'Investment fees and costs'.", "---Example Start---", "Investment option Cost of product \nCash $141.00", @@ -347,9 +328,10 @@ "FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!! JUST RETURN EMPTY RESPONSE!!!", "The output should be:", "{\"data\": []}", - "M. Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option." + "L.
Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option." ], "administration_fees":[ + "### Administration fees and costs", "Administration fees and costs and total annual dollar-based charges are share class level data.", "Simple case:", "----Example 1 Start----", @@ -390,6 +372,15 @@ "total_annual_dollar_based_charges is 1 * 52 = 52", "The output should be:", "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}", + "---Example 6 Start---", + "Administration \nfees and costs \n1 \nFirstChoice Lifestage (MySuper product) \nand Select investment options \n(other than FirstRate Saver) \n0.04% p.a. \nThe percentage‑based administration fee is reflected in \nthe daily unit price of your investment option and payable \nmonthly or as incurred by the option. \nFirstRate Saver \nFrom 0.35% to \n0.50% p.a. \nThe dollar‑based administration fee of $5 per month is \npayable at the beginning of each month by deduction of \nunits from one of your options. \nDollar-based fee discounts \nThe current fee for FirstRate Saver is set out at \ncfs.com.au/personal/resources/funds-and-performance/ \nfirstrate‑interest‑rates.html \nYour employer may be able to negotiate a lower dollar‑ \nbased administration fee for employee members. \nplus \nDollar-based administration fee \nRetained benefit and spouse members are not entitled \nto this discount. \n$60 p.a. 
($5 per month) per account \n", + "---Example 6 End---", + "According to example, the administration fee is 0.04, ", + "\"From 0.35% to 0.50% p.a.\", because it is the range value, need ignore and exclude, so administration_fees is 0.04, ", + "the total_annual_dollar_based_charges is 60 (5 per month * 12)", + "About fund name, it should be \"FirstChoice Lifestage\".", + "The output should be:", + "{\"data\": [{\"fund name\": \"FirstChoice Lifestage\", \"share name\": \"FirstChoice Lifestage\", \"administration_fees\": 0.04, \"total_annual_dollar_based_charges\": 60}]}", "\n", "Complex cases:", "A. Need to add multiple numbers together.", @@ -428,6 +419,7 @@ "{\"data\": [{\"fund name\": \"My Super\", \"share name\": \"My Super\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 26}, {\"fund name\": \"Fund1\", \"share name\": \"Fund1\", \"management_fee_and_costs\": 0.12, \"management_fee\": 0.12}]}" ], "total_annual_dollar_based_charges": [ + "### Total annual dollar-based charges", "Total annual dollar-based charges are share class level data.", "A. Its value corresponds to the administration fees and costs that are charged on a weekly basis.", "----Example Start----", @@ -439,18 +431,27 @@ "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", "\n", "B.
Please identify some case which not belong to the total_annual_dollar_based_charges, and output empty.", - "----Example Start----", + "----Example 1 Start----", "Cost of product information \n\nCost of product for 1 year \n\nThe cost of product gives a summary calculation about \nhow ongoing annual fees and costs can affect your \nsuperannuation investment over a 1-year period for all \ninvestment options. It is calculated in the manner \nshown in the 'Example of annual fees and costs'. \n\nThe cost of product information assumes a balance of \n$50,000 at the beginning of the year. (Additional fees \nsuch as a buy/sell spread may apply – refer to the ‘Fees \nand costs summary’ table for the relevant investment \noption.) \n\nYou should use this figure to help compare \nsuperannuation products and investment options. \n\nInvestment option \nCash \nCost of product \nPerpetual Cash \n$60.00 \nFixed income and credit \nBentham Global Income \n$485.00 \n", - "----Example End----", + "----Example 1 End----", "Explanation:", "The values provided in the example are not total annual dollar-based charges; ", "they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ", "This figure includes ongoing annual fees and costs, but it may not encompass all possible charges, such as additional fees like buy/sell spreads. ", "Therefore, it serves as a comparative tool rather than a comprehensive total of all annual charges.", "The output should be empty:", - "{\"data\": []}" + "{\"data\": []}", + "----Example 2 Start----", + "Equals \nCost of product \n1 \nIf your balance was $50,000 at \nthe beginning of the year, then \nfor that year you will be charged \nfees and costs of $395 for the \nsuperannuation product. 
\n\n", + "----Example 2 End----", + "Explanation:", + "The values provided in the example are not total annual dollar-based charges; ", + "they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ", + "FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!!" ], "buy_spread": [ + "### Buy/sell spread", + "Buy/sell spread is share class level data.", "A. Exclude reported name", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ", @@ -498,6 +499,7 @@ "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}" ], "performance_fee_costs": [ + "### Performance fees", "Performance fees is share class level data.", "A. If the performance fees is with the range, please ignore and output empty.", "---Example Start---", @@ -536,6 +538,7 @@ "a. For this example, you have Example keyword in the header so you should not extract any datapoint values Like performance_fee_costs, management fee etc." ], "minimum_initial_investment": [ + "### Minimum initial investment", "Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", "---Example 1 Start---", "The minimum investment per Pension Plan account is \n$20,000. 
The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan", @@ -570,6 +573,7 @@ "{\"data\": [{\"fund name\": \"Lifeplan Investment Bond\", \"minimum_initial_investment\": 1000}]}" ], "benchmark_name": [ + "### Benchmark name", "Benchmark is fund leval data, usually as index fund name, e.g. S&P/ASX 300 A-REIT Total Return Index ", "Sometime, there are multiple benchmark names with weightings in the context, please extract them all including weightings and benchmark names.", "A. Examples for single benchmark name", @@ -687,7 +691,9 @@ "management_fee_and_costs": [ { "keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"], - "prompts": ["Complex management fee and costs rule:", + "keywords_is_regex": false, + "prompts": [ + "### Complex management fee and costs rule", "If the table with columns:", "\"Administration fees\", \"Investment fees\" ,\"Estimated other investment costs\" and \"Estimated performance fees\"", "The administration_fees is \"Administration fees\"", @@ -708,7 +714,9 @@ }, { "keywords": ["Entry Fee option \nNil Entry option"], - "prompts": ["Complex management fee and costs rule:", + "keywords_is_regex": false, + "prompts": [ + "### Complex management fee and costs rule", "If the table with columns:", "\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"", "The performance_fee_costs is \"Estimated Performance fees\"", @@ -733,7 +741,9 @@ }, { "keywords": ["Retirement and TTR income streams"], - "prompts": ["Complex management fee and costs rule:", + "keywords_is_regex": false, + "prompts": [ + "### Complex management fee and costs rule", "For management_fee_and_costs, ", "a. 
If the title is \"Retirement and TTR income streams\"", "it means each investment name is with two fund names, one is for Retirement as pension, another is for TTR.", @@ -754,7 +764,9 @@ }, { "keywords": ["Recoverable expenses \nEstimated other indirect costs"], - "prompts": ["Complex management fee and costs rule:", + "keywords_is_regex": false, + "prompts": [ + "### Complex management fee and costs rule", "If the table with columns:", "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", "The management_fee is \"Management fee (% pa)\".", @@ -796,8 +808,9 @@ }, { "keywords":["Plus other investment fees and costs \nEquals investment fees and costs"], + "keywords_is_regex": false, "prompts": [ - "Complex management fee and costs rule:", + "### Complex management fee and costs rule", "If the table with columns:", "\"Performance fee\", \"Plus other investment fees and costs\", \"Equals investment fees and costs\", \"Transaction costs(net)\", \"Buy-sell spreads\", \"Transaction costs(gross)\".", "Both of the management_fee and management_fee_costs are \"Plus other investment fees and costs\".", @@ -812,6 +825,34 @@ "The output should be:", "{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" ] + }, + { + "keywords":["Total\\s*administration\\s*and 
investment\\s*fees[\\s\\S]*?Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"], + "keywords_is_regex": true, + "sub_datapoints": ["administration_fees", "performance_fee_costs", "buy_spread", "sell_spread"], + "prompts": [ + "### Complex management fee and costs rule", + "---Example Start---", + "Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) \nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n", + "---Example End---", + "For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ", + "\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ", + "\"Performance fee (p.a.)\" as performance_fee_costs, ", + "\"Buy/sell spread (%)\" as buy_spread and sell_spread.", + "If one row has 5 decimal numbers, ", + "the 2nd decimal number is the administration_fees, ", + "the 3rd decimal number is the management_fee_and_costs and management_fee, ", + "the 4th decimal number is the performance_fee_costs, ", + "the 5th decimal number is the buy_spread and sell_spread.", + "If one row has 4 decimal numbers, ", + "the 2nd decimal number is the administration_fees, ", + "the 3rd decimal number is the management_fee_and_costs and management_fee, ", + "the 4th decimal number is the buy_spread and sell_spread.", + "\"Buy/sell spread\" is always as the last decimal value column, for buy_spread and sell_spread, please extract all of them.", + "Please always ignore the 1st decimal number, we need not the total sum values.", + "The output should be:", + "{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS 
Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}" + ] } ] } diff --git a/main.py b/main.py index 0880afb..7534026 100644 --- a/main.py +++ b/main.py @@ -453,7 +453,6 @@ def batch_start_job( pdf_folder: str = "/data/emea_ar/pdf/", output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", doc_data_excel_file: str = None, - document_mapping_file: str = None, output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/", output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/", @@ -1051,7 +1050,6 @@ def batch_run_documents( doc_source: str = "emea_ar", special_doc_id_list: list = None, pdf_folder: str = r"/data/emea_ar/pdf/", - document_mapping_file: str = None, output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/", @@ -1090,7 +1088,6 @@ def batch_run_documents( pdf_folder, output_pdf_text_folder, page_filter_ground_truth_file, - document_mapping_file, output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, @@ -1110,7 +1107,6 @@ def batch_run_documents( pdf_folder, output_pdf_text_folder, page_filter_ground_truth_file, - document_mapping_file, output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, @@ -1540,9 +1536,7 @@ if __name__ == "__main__": ) with open(document_sample_file, "r", encoding="utf-8") as f: 
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] - # document_mapping_file = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx" - document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list = ["553449169"] + # special_doc_id_list = ["448576924"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1563,7 +1557,6 @@ if __name__ == "__main__": doc_source=doc_source, special_doc_id_list=special_doc_id_list, pdf_folder=pdf_folder, - document_mapping_file=document_mapping_file, output_pdf_text_folder=output_pdf_text_folder, output_extract_data_child_folder=output_extract_data_child_folder, output_extract_data_total_folder=output_extract_data_total_folder, diff --git a/performance.ipynb b/performance.ipynb index 0edf296..1fe0334 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 47, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -514,14 +514,147 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'data_point': 'performance_fee_costs', 'doc_id': 539791362, 'sec_name': 'REST High Growth Pension', 'truth': '0.6', 'generated': '0.08', 'error': 'Truth is not equal with generated'}\n" + "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and 
generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA IP-OnePath Australian Shares NE', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Diversified Bond Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 
'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only)', 'truth': '0.24', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund', 'truth': '0.03', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 
'doc_id': 414751292, 'sec_name': 'Platinum International Technology Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Ausbil Aus. Emrging Leaders', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Investors Mutual Aus. 
Shre', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Macquarie Inc Opportunities', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - IncomeBuilder', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - Hedged Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Hedged Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Div. 
Fixed Interest Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PM CAPITAL Global Companies', 'truth': '1.54', 'generated': '1.45', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 
'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Fixed Interest Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Equity Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP Generations - AMP Cash Mgmt', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Property Securities Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock International Equity Index (Unhedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock International Equity Index (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 
'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active High Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Moderately Defensive', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Defensive Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Higher Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is 
null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A (Hedged) Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 
'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Fixed Interest Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Equity Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP Generations - AMP Cash Mgmt', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Property Securities Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock International Equity Index (Unhedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock International Equity Index (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 
'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active High Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Moderately Defensive', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Defensive Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Higher Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 
'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A (Hedged) Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA 
IP-OnePath Australian Shares NE', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Diversified Bond Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal 
with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only)', 'truth': '0.24', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund', 'truth': '0.03', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Technology Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal 
with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Ausbil Aus. Emrging Leaders', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Investors Mutual Aus. Shre', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Macquarie Inc Opportunities', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - IncomeBuilder', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - Hedged Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Hedged Global Share Fund', 'truth': '0', 
'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Div. Fixed Interest Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PM CAPITAL Global Companies', 'truth': '1.54', 'generated': '1.45', 'error': 'Truth is not equal with generated'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", + "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n" ] } ], From 8a5723c150f11fa5d1039ce7cbd247b0af2ff19a Mon Sep 17 00:00:00 2001 From: Blade He Date: Thu, 27 Mar 2025 21:10:33 -0500 Subject: [PATCH 7/9] optimize for Entry Fee/ Nil Entry case --- core/data_extraction.py | 23 ++++-- .../data_extraction_prompts_config.json | 73 +++++++++++------ main.py | 78 ++++++++++--------- 3 files changed, 106 insertions(+), 68 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index 0a14620..0a1b297 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -1635,6 +1635,7 @@ class DataExtraction: if page_text is not None and 
len(page_text) > 0: logger.info(f"Transfer previous page fund name: {page_text} to be the pre-fix of page text") summary += f"\nThe last fund name of previous PDF page: {page_text}\n" + summary += "If could find the fund name for the first data point value, please ignore this fund name.\n" else: summary = self.instructions_config.get("summary", "\n") @@ -1646,7 +1647,7 @@ class DataExtraction: instructions.extend(image_features) instructions.append("\n") - instructions.append("## Datapoints Reported name:\n") + instructions.append("## Datapoints Reported name\n") instructions.append("Please look for relevant reported names and similar variations in the context.\n") reported_name_info_in_instructions = self.instructions_config.get("reported_name", {}) for datapoint in datapoints: @@ -1746,7 +1747,7 @@ class DataExtraction: none_value_example_count += 1 instructions.append("\n") - instructions.append("## Data business features:\n") + instructions.append("## Data business features\n") data_business_features = self.instructions_config.get( "data_business_features", {} ) @@ -1754,7 +1755,7 @@ class DataExtraction: instructions.append(common) instructions.append("\n") - instructions.append("## Datapoints investment level:\n") + instructions.append("## Datapoints investment level\n") investment_level_info = data_business_features.get("investment_level", {}) for datapoint in datapoints: investment_level = investment_level_info.get(datapoint, "") @@ -1762,7 +1763,7 @@ class DataExtraction: instructions.append("\n") instructions.append("\n") - instructions.append("## Datapoints value range:\n") + instructions.append("## Datapoints value range\n") data_value_range_info = data_business_features.get("data_value_range", {}) for datapoint in datapoints: data_value_range = data_value_range_info.get(datapoint, "") @@ -1777,6 +1778,7 @@ class DataExtraction: complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") with_special_rule_title = False 
found_sub_datapoints = [] + datapoint_special_rule = {} for datapoint in datapoints: # If some complex special rule is found, and with sub datapoints, # need not to load relevant rule again. @@ -1807,7 +1809,7 @@ class DataExtraction: complex_prompts_list = complex_special_rule.get("prompts", []) if len(complex_prompts_list) > 0: if not with_special_rule_title: - instructions.append("## Special rule:\n") + instructions.append("## Special rule\n") with_special_rule_title = True complex_prompts = "\n".join(complex_prompts_list) instructions.append(complex_prompts) @@ -1822,8 +1824,13 @@ class DataExtraction: continue special_rule_list = special_rule_info.get(datapoint, []) if len(special_rule_list) > 0: + datapoint_special_rule[datapoint] = special_rule_list + if len(list(datapoint_special_rule.keys())) > 0: + for datapoint, special_rule_list in datapoint_special_rule.items(): + if datapoint in found_sub_datapoints: + continue if not with_special_rule_title: - instructions.append("## Special rule:\n") + instructions.append("## Special rule\n") with_special_rule_title = True special_rule = "\n".join(special_rule_list) instructions.append(special_rule) @@ -1831,7 +1838,7 @@ class DataExtraction: instructions.append("\n") - instructions.append("## Special cases:\n") + instructions.append("## Special cases\n") special_cases = self.instructions_config.get("special_cases", {}) special_cases_common_list = special_cases.get("common", []) special_cases_number = 1 @@ -1859,7 +1866,7 @@ class DataExtraction: instructions.append(contents) instructions.append("\n") - instructions.append("## Output requirement:\n") + instructions.append("## Output requirement\n") output_requirement = self.instructions_config.get("output_requirement", {}) output_requirement_common_list = output_requirement.get("common", []) instructions.append("\n".join(output_requirement_common_list)) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json 
b/instructions/aus_prospectus/data_extraction_prompts_config.json index cf2b9e3..8417eea 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -16,7 +16,7 @@ ], "data_business_features": { "common": [ - "General rules:", + "## General rules", "- 1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.", "- 2. Fund name: ", "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", @@ -86,7 +86,8 @@ "---Example Start---", "Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n", "---Example End---", - "For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values." + "For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.", + "- 7. If for data point value specifically Nil is written in the value then return NULL('') for the same" ], "investment_level": { "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.", @@ -145,7 +146,7 @@ "management_fee_and_costs": [ "### Management fee and cost", "Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.", - "If there are multiple Management fee and costs reported names, here is the priority rule:", + "A. 
If there are multiple Management fee and costs reported names, here are the priority rules:", "A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", "---Example 1 Start---", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", @@ -196,7 +197,7 @@ "The management_fee is the value of \"Management fee (% pa)\".", "The management_fee_and_costs is the value of \"Total management cost (% pa)\".", "---Example 1 Start---", - "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", + "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.00 0.04 0.00 0.01 1.38 0.31\n1.29 0.00 0.00 0.00 0.01 1.30 0.29\n", "---Example 1 End---", "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, 
\"sell_spread\": 0.31}]}", @@ -225,6 +226,7 @@ "---Example 3 Start---", "Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.20–0.50 5 \n\n", "---Example 3 End---", + "For value: 1.04%(g)/2.18%(n), (g) means gross, (n) means net, please extract net value: 2.18", "The output should be:", "{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}", "\n", @@ -291,16 +293,6 @@ "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.", "So the output should be:", "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}", - "---Example 4 Start---", - "Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) 
\nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n", - "---Example 4 End---", - "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", - "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", - "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", - "If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.", - "If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.", - "So the output should be:", - "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", "\n", "I. If exist **\"Maximum management fee\"** in context, please ignore relevant values.", "---Example Start---", @@ -407,7 +399,7 @@ "---Example 4 Start---", "Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. 
\nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n", "---Example 4 End---", - "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees.", + "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees, only output total_annual_dollar_based_charges as 78.", "B. The administration fee and costs/ total annual dollar-based charges are with production name, other data points/ values are with specific fund/ share name(s).", "---Example Start---", "My Super \nType of fee or cost Amount How and when paid \nOngoing annual fees and costs 1 \nAdministration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a \nmaximum of $1,000 p.a.) \n$0.50 per week deducted from your account\nbalance at the end of each month or on exit.\nPercentage fee taken into account in the \ndaily calculation of unit prices. \nInvestment fees and costs \n2 \nOption % of option’s assets* \nFund1 0.12%\n", @@ -520,7 +512,7 @@ "a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.", "b. 
This example mentioned share classes, please output according to share class.", "The output should be", - "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", + "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", "D. 
Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0", "---Example Start---", "Fund/Investment Option \nManagement Fees \nand Costs \n(% pa) \n1 \nPerformance Fees 2 \n(% pa) \nTransaction Costs 3 \n(% pa) \nBT American Share Fund 1.08 0.00 0.00\nBT Asian Share Fund 1.10 0.00 0.10", @@ -692,6 +684,7 @@ { "keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"], "keywords_is_regex": false, + "sub_datapoints": ["administration_fees", "performance_fee_costs"], "prompts": [ "### Complex management fee and costs rule", "If the table with columns:", @@ -715,6 +708,7 @@ { "keywords": ["Entry Fee option \nNil Entry option"], "keywords_is_regex": false, + "sub_datapoints": ["performance_fee_costs"], "prompts": [ "### Complex management fee and costs rule", "If the table with columns:", @@ -727,14 +721,27 @@ "---Example 1 Start---", "\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n", "---Example 1 End---", - "The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.", "Please pay attention below information", - "Assume the numeric column sequence number is from 1.", - "\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.", - "For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ", - "the fund: Platinum Asia Entry Fee, 
both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", - "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", + "Assume the numeric column sequence is from 1.", + "\"Entry Fee option\" values are as the 1st column values, \"Nil Entry option\" values are as the 2nd column values, \"Estimated other investment costs\" values are as the 3rd column values, \"Estimated Performance fees\" values are as the 4th column values.", + "Here is the example to get data, step by step.", + "For this fund in Example:", + "Platinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n", + "Step 1 Get new fund name", + "Combine \"Platinum Asia\" with \"Entry Fee\" as \"Platinum Asia Entry Fee\"", + "Combine \"Platinum Asia\" with \"Nil Entry\" as \"Platinum Asia Nil Entry\"", + "Step 2 **EXCLUE the values of the last three columns of data.**", + "ONLY KEEP these 4 values: 2.14 2.99 0.02 0.00 for next steps", + "Step 3 Calculate management_fee and management_fee_and_costs for these 2 new funds:", + "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (Value of 1st column) + 0.02 (Value of 3rd column)", + "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (Value of 2nd column) + 0.02 (Value of 3rd column)", + "**Make sure don't take \"Estimated other investment costs\" value from the wrong column!!!**", + "Step 4 Get performance_fee_costs", + "the fund: Platinum Asia Entry Fee, performance_fee_costs is 0 (Value of 4th column)", + "the fund: Platinum Asia Nil Entry, performance_fee_costs is 0 (Value of 4th column)", "Identify the value of the column \"Estimated Performance fees\" and if it is written 0.00 
then extract the same as 0 do not assume nil for the same and return its values as 0", + "**Make sure don't take \"Estimated Performance fees\" value from the wrong column!!!**", + "Please ignore the last fund name of previous PDF page, and extract data as these 4 steps for all of records in Context.", "Therefore, the output should be:", "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}" ] @@ -765,6 +772,7 @@ { "keywords": ["Recoverable expenses \nEstimated other indirect costs"], "keywords_is_regex": false, + "sub_datapoints": ["performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread"], "prompts": [ "### Complex management fee and costs rule", "If 
the table with columns:", @@ -809,6 +817,7 @@ { "keywords":["Plus other investment fees and costs \nEquals investment fees and costs"], "keywords_is_regex": false, + "sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"], "prompts": [ "### Complex management fee and costs rule", "If the table with columns:", @@ -827,7 +836,7 @@ ] }, { - "keywords":["Total\\s*administration\\s*and investment\\s*fees[\\s\\S]*?Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"], + "keywords":["Total\\s*administration\\s*and (management|investment)\\s*fees[\\s\\S]*?Administration\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"], "keywords_is_regex": true, "sub_datapoints": ["administration_fees", "performance_fee_costs", "buy_spread", "sell_spread"], "prompts": [ @@ -853,6 +862,24 @@ "The output should be:", "{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}" ] + }, + { + "keywords":["Total\\s*of\\s*(management|investment)\\s*fees\\s*and\\s*costs\\s*and\\s*performance\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"], + "keywords_is_regex": true, + "sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"], + "prompts": [ + "### Complex management fee and costs rule", + "---Example Start---", + "Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) 
\n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n", + "---Example End---", + "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", + "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", + "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", + "If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.", + "If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.", + "So the output should be:", + "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}" + ] } ] } diff --git a/main.py b/main.py index 7534026..324c613 100644 --- a/main.py +++ b/main.py @@ -1522,8 +1522,8 @@ if __name__ == "__main__": # get_aus_prospectus_document_category() - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1531,42 +1531,46 @@ if __name__ == "__main__": # document_sample_file = ( # 
r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" # ) - document_sample_file = ( - r"./sample_documents/aus_prospectus_46_documents_sample.txt" - ) - with open(document_sample_file, "r", encoding="utf-8") as f: - special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] - # special_doc_id_list = ["448576924"] - pdf_folder: str = r"/data/aus_prospectus/pdf/" - output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" - output_extract_data_child_folder: str = ( - r"/data/aus_prospectus/output/extract_data/docs/" - ) - output_extract_data_total_folder: str = ( - r"/data/aus_prospectus/output/extract_data/total/" - ) - output_mapping_child_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/docs/" - ) - output_mapping_total_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/total/" - ) - drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + document_sample_file_list = [ + r"./sample_documents/aus_prospectus_46_documents_sample.txt", + r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt", + ] + for document_sample_file in document_sample_file_list: + logger.info(f"Start to run document sample file: {document_sample_file}") + with open(document_sample_file, "r", encoding="utf-8") as f: + special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() + if len(doc_id.strip()) > 0] + # special_doc_id_list = ["401212184"] + pdf_folder: str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" + output_extract_data_child_folder: str = ( + r"/data/aus_prospectus/output/extract_data/docs/" + ) + output_extract_data_total_folder: str = ( + r"/data/aus_prospectus/output/extract_data/total/" + ) + output_mapping_child_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/docs/" + ) + output_mapping_total_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/total/" + ) + drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - 
batch_run_documents( - doc_source=doc_source, - special_doc_id_list=special_doc_id_list, - pdf_folder=pdf_folder, - output_pdf_text_folder=output_pdf_text_folder, - output_extract_data_child_folder=output_extract_data_child_folder, - output_extract_data_total_folder=output_extract_data_total_folder, - output_mapping_child_folder=output_mapping_child_folder, - output_mapping_total_folder=output_mapping_total_folder, - drilldown_folder=drilldown_folder, - re_run_extract_data=re_run_extract_data, - re_run_mapping_data=re_run_mapping_data, - force_save_total_data=force_save_total_data - ) + batch_run_documents( + doc_source=doc_source, + special_doc_id_list=special_doc_id_list, + pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, + output_extract_data_child_folder=output_extract_data_child_folder, + output_extract_data_total_folder=output_extract_data_total_folder, + output_mapping_child_folder=output_mapping_child_folder, + output_mapping_total_folder=output_mapping_total_folder, + drilldown_folder=drilldown_folder, + re_run_extract_data=re_run_extract_data, + re_run_mapping_data=re_run_mapping_data, + force_save_total_data=force_save_total_data + ) elif doc_source == "emea_ar": special_doc_id_list = ["321733631"] batch_run_documents( From 46f86b124b8a30cb4911de0184caaf85c4b18b8e Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 28 Mar 2025 00:51:51 -0500 Subject: [PATCH 8/9] update instructions fund name section structure --- .../data_extraction_prompts_config.json | 19 +- main.py | 80 +++--- performance.ipynb | 270 +++--------------- 3 files changed, 95 insertions(+), 274 deletions(-) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 8417eea..642dbd6 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -17,8 +17,8 @@ "data_business_features": { 
"common": [ "## General rules", - "- 1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.", - "- 2. Fund name: ", + "1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.", + "2. Fund name: ", "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", "b. The sub-fund name may be as the first column or first row values in the table.", "b.1 fund name example:", @@ -67,12 +67,12 @@ "---Example 3 End---", "Although exist \"Retirement account\" and \"Transition to Retirement account\", but the investment option is not exist, so fund name and share name should be: \"Rest Pension\".", "\n", - "- 3. Only extract the latest data from context:", + "3. Only extract the latest data from context:", "If with multiple data values in same row, please extract the latest.", "\n", - "- 4. Reported names:", + "4. Reported names:", "Only output the values which with significant reported names.", - "- Multiple data columns with same reported name but different post-fix:", + "Multiple data columns with same reported name but different post-fix:", "If there are multiple reported names with different post-fix text, here is the priority rule:", "The pos-fix text is in the brackets: (gross), (net), pick up the values from (net).", "---Example Start---", @@ -80,14 +80,14 @@ "---Example End---", "The output should be:", "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}", - "- 5. Please ignore these words as fund names, it means never extract these words as fund names. They are:", + "5. Please ignore these words as fund names, it means never extract these words as fund names. They are:", "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\".", - "- 6. 
Identify the value of data point and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0", + "6. Identify the value of data point and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0", "---Example Start---", "Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n", "---Example End---", "For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.", - "- 7. If for data point value specifically Nil is written in the value then return NULL('') for the same" + "7. If for data point value specifically Nil is written in the value then return NULL('') for the same" ], "investment_level": { "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.", @@ -320,7 +320,8 @@ "FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!! JUST RETURN EMPTY RESPONSE!!!", "The output should be:", "{\"data\": []}", - "L. Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option." + "L. Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option.", + "M. Identify the value of management fee and costs, and if it is written 0% or 0.00% or 0 or 0.00, then extract the same as 0, please don't ignore it." 
], "administration_fees":[ "### Administration fees and costs", diff --git a/main.py b/main.py index 324c613..8d2eeda 100644 --- a/main.py +++ b/main.py @@ -1522,8 +1522,8 @@ if __name__ == "__main__": # get_aus_prospectus_document_category() - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1531,46 +1531,44 @@ if __name__ == "__main__": # document_sample_file = ( # r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" # ) - document_sample_file_list = [ - r"./sample_documents/aus_prospectus_46_documents_sample.txt", - r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt", - ] - for document_sample_file in document_sample_file_list: - logger.info(f"Start to run document sample file: {document_sample_file}") - with open(document_sample_file, "r", encoding="utf-8") as f: - special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() - if len(doc_id.strip()) > 0] - # special_doc_id_list = ["401212184"] - pdf_folder: str = r"/data/aus_prospectus/pdf/" - output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" - output_extract_data_child_folder: str = ( - r"/data/aus_prospectus/output/extract_data/docs/" - ) - output_extract_data_total_folder: str = ( - r"/data/aus_prospectus/output/extract_data/total/" - ) - output_mapping_child_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/docs/" - ) - output_mapping_total_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/total/" - ) - drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + document_sample_file = ( + r"./sample_documents/aus_prospectus_46_documents_sample.txt" + ) + logger.info(f"Start to run document sample file: {document_sample_file}") + with open(document_sample_file, "r", encoding="utf-8") as f: + special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() + if 
len(doc_id.strip()) > 0] + # special_doc_id_list = ["420339794"] + pdf_folder: str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" + output_extract_data_child_folder: str = ( + r"/data/aus_prospectus/output/extract_data/docs/" + ) + output_extract_data_total_folder: str = ( + r"/data/aus_prospectus/output/extract_data/total/" + ) + output_mapping_child_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/docs/" + ) + output_mapping_total_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/total/" + ) + drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - batch_run_documents( - doc_source=doc_source, - special_doc_id_list=special_doc_id_list, - pdf_folder=pdf_folder, - output_pdf_text_folder=output_pdf_text_folder, - output_extract_data_child_folder=output_extract_data_child_folder, - output_extract_data_total_folder=output_extract_data_total_folder, - output_mapping_child_folder=output_mapping_child_folder, - output_mapping_total_folder=output_mapping_total_folder, - drilldown_folder=drilldown_folder, - re_run_extract_data=re_run_extract_data, - re_run_mapping_data=re_run_mapping_data, - force_save_total_data=force_save_total_data - ) + batch_run_documents( + doc_source=doc_source, + special_doc_id_list=special_doc_id_list, + pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, + output_extract_data_child_folder=output_extract_data_child_folder, + output_extract_data_total_folder=output_extract_data_total_folder, + output_mapping_child_folder=output_mapping_child_folder, + output_mapping_total_folder=output_mapping_total_folder, + drilldown_folder=drilldown_folder, + re_run_extract_data=re_run_extract_data, + re_run_mapping_data=re_run_mapping_data, + force_save_total_data=force_save_total_data + ) elif doc_source == "emea_ar": special_doc_id_list = ["321733631"] batch_run_documents( diff --git a/performance.ipynb b/performance.ipynb index 1fe0334..6a2f94c 100644 
--- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -15,51 +15,51 @@ "from utils.similarity import Similarity\n", "\n", "\n", - "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", - " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", - " \"Minimum Initial Investment\", \"Benchmark\"]\n", - "\n", - "\n", - "imp_datapoints_mapping = {\n", - " \"Management Fee and Costs\": \"management_fee_and_costs\",\n", - " \"Management Fee\": \"management_fee\",\n", - " \"Performance fee and cost\": \"performance_fee_costs\",\n", - " \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", - " \"Administration Fee and costs\": \"administration_fees\",\n", - " \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", - " \"Buy Spread\": \"buy_spread\",\n", - " \"Sell Spread\": \"sell_spread\",\n", - " \"Performance Fee\": \"PerformanceFeeCharged\",\n", - " \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", - " \"Benchmark\": \"benchmark_name\"\n", - "}\n", - "\n", - "# imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\",\n", - "# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n", + "# imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", + "# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", + "# \"Minimum Initial Investment\", \"Benchmark\"]\n", "\n", "\n", "# imp_datapoints_mapping = {\n", "# \"Management 
Fee and Costs\": \"management_fee_and_costs\",\n", "# \"Management Fee\": \"management_fee\",\n", "# \"Performance fee and cost\": \"performance_fee_costs\",\n", + "# \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", "# \"Administration Fee and costs\": \"administration_fees\",\n", "# \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", "# \"Buy Spread\": \"buy_spread\",\n", - "# \"Sell Spread\": \"sell_spread\"\n", + "# \"Sell Spread\": \"sell_spread\",\n", + "# \"Performance Fee\": \"PerformanceFeeCharged\",\n", + "# \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", + "# \"Benchmark\": \"benchmark_name\"\n", "# }\n", "\n", - "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", - "# path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", + "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\",\n", + " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n", + "\n", + "\n", + "imp_datapoints_mapping = {\n", + " \"Management Fee and Costs\": \"management_fee_and_costs\",\n", + " \"Management Fee\": \"management_fee\",\n", + " \"Performance fee and cost\": \"performance_fee_costs\",\n", + " \"Administration Fee and costs\": \"administration_fees\",\n", + " \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", + " \"Buy Spread\": \"buy_spread\",\n", + " \"Sell Spread\": \"sell_spread\"\n", + "}\n", + "\n", + "# path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", + "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", "# 
path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250326224343.xlsx\"\n", - "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250326203744.xlsx\"\n", + "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250327230323.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250328004858.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -363,56 +363,18 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9375 \t0.8993 \t0.9791 \t0.8829 \t424 \t375 \t2 \t42 \t8 \n", - "management_fee \t0.9571 \t0.9353 \t0.9799 \t0.9180 \t424 \t390 \t2 \t27 \t8 \n", - "performance_fee_costs \t0.8801 \t0.8601 \t0.9011 \t0.8431 \t291 \t246 \t114 \t40 \t27 \n", - "interposed_vehicle_performance_fee_cost \t0.9172 \t0.8471 \t1.0000 \t0.9696 \t73 \t72 \t342 \t13 \t0 \n", - "administration_fees \t0.9081 \t0.8317 \t1.0000 \t0.9602 \t84 \t84 \t326 \t17 \t0 \n", - "total_annual_dollar_based_charges \t0.9930 \t0.9861 \t1.0000 \t0.9977 \t71 \t71 \t355 \t1 \t0 \n", - "buy_spread \t0.9291 \t0.8930 \t0.9681 \t0.8806 \t376 \t334 \t42 \t40 \t11 \n", - "sell_spread \t0.9291 \t0.8930 \t0.9681 \t0.8806 \t376 \t334 \t42 \t40 \t11 \n", - 
"minimum_initial_investment \t0.9507 \t0.9633 \t0.9383 \t0.9297 \t308 \t289 \t108 \t11 \t19 \n", - "benchmark_name \t0.9139 \t0.8846 \t0.9452 \t0.9391 \t156 \t138 \t263 \t18 \t8 \n", - "TOTAL \t0.9316 \t0.8994 \t0.9680 \t0.9201 \t2583 \t2333 \t1596 \t249 \t92 \n", - "Total Shares Matched - 379\n", - "Total Shares Not Matched - 128\n", - "Percentage of Shares Matched - 74.7534516765286\n", - "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA IP-OnePath Australian Shares', 'ANZ OA IP-OnePath Diversified Fixed Interest', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OneAnswer Investment Portfolio - Schroder Strategic Growth -NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond NE', 'OnePath OA Investment Portfolio-BlackRock Tactical Growth EF', 'OnePath OA Inv-Greencape Broadcap EF', 'OnePath OA Inv-Nikko AM Australian Shares EF', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP- Pendal Monthly Income Plus-NEF', 'OnePath OA IP-Alternatives Growth Fund-EF/Sel', 'OnePath OA IP-Alternatives Growth Fund-NEF', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-EF/Sel', 'OnePath OA IP-Bennelong Australian Equities-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-NEF', 'OnePath OA IP-Fidelity Australian Equities-EF/Sel', 'OnePath OA IP-Investors Mutual Australian Share Trust- EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-Merlon Australian Share Income-EF/Sel', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth 
Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual Balanced Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Balanced Growth Trust-NEF', 'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Balanced Trust-EF/Sel', 'OnePath OA IP-UBS Balanced Trust-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Defensive Trust-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified 
ESG Growth -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -NE', 'OnePath OneAnswer Investment Portfolio - First Sentier Imputation -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Australian Shares Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath High Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath High Growth Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Schroder Strategic Growth -EF/Sel', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-NEF', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKSF - PIMCO Div. 
Fixed Interest Wholesale Class', 'MLC MKSF - Platinum Asia Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'HOSTPLUS Fixed Interest Indexed Super', 'Lifeplan Investment Bond Perpetual Balanced Growth', 'Lifeplan Investment Bond Perpetual Conservative Growth', 'Lifeplan Investment Bond Perpetual Industrial Share', 'Lifeplan Investment Bond Vanguard® Australian Shares Index', 'Dimensional Australian Core Equity Trust', 'FC W Pen-CFS TTR Global Infrastructure Securities', 'CFS MIF-High Growth', 'CFS MIF-Property Securities', 'CFS MIF-Geared Share NEF', 'CFS MIF-Australian Share', 'CFS MIF-Geared Global Share', 'CFS MIF-Global Tech & Comm', 'CFS MIF-Stewart Inv Worldwide Leaders Sustainability', 'CFS MIF-Geared Share', 'CFS MIF-Diversified', 'CFS MIF-Janus Henderson Global Natural Resources Fund', 'CFS MIF-Macquarie Australian Emerging Companies', 'CFS MIF-Balanced', 'CFS MIF-Conservative', 'CFS MIF-Imputation', 'CFS MIF-Global Health & Biotech', 'Dimensional Australia Core Equity Trust - Active ETF']\n", - "All Providers Results: \n", - "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", - "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9621 \t0.9270 \t1.0000 \t0.9270 \t177 \t165 \t0 \t13 \t0 \n", - "management_fee \t0.9886 \t0.9775 \t1.0000 \t0.9775 \t177 \t174 \t0 \t4 \t0 \n", - "performance_fee_costs \t0.8557 \t0.8037 \t0.9149 \t0.8371 \t100 \t86 \t63 \t21 \t8 \n", - "interposed_vehicle_performance_fee_cost \t0.8966 \t0.8125 \t1.0000 \t0.9326 \t53 \t52 \t114 \t12 \t0 \n", - "administration_fees \t0.9655 \t0.9333 \t1.0000 \t0.9944 \t14 \t14 \t163 \t1 \t0 
\n", - "buy_spread \t0.9496 \t0.9091 \t0.9938 \t0.9045 \t175 \t160 \t1 \t16 \t1 \n", - "sell_spread \t0.9464 \t0.9034 \t0.9938 \t0.8989 \t175 \t159 \t1 \t17 \t1 \n", - "minimum_initial_investment \t0.9064 \t0.9528 \t0.8643 \t0.8596 \t140 \t121 \t32 \t6 \t19 \n", - "benchmark_name \t0.9186 \t0.8587 \t0.9875 \t0.9213 \t89 \t79 \t85 \t13 \t1 \n", - "TOTAL \t0.9322 \t0.8976 \t0.9727 \t0.9170 \t1100 \t1010 \t459 \t103 \t122 \n", - "Total Shares Matched - 173\n", - "Total Shares Not Matched - 18\n", - "Percentage of Shares Matched - 90.57591623036649\n", - "Not Matched Shares Name List - ['Dimensional Australian Core Equity Trust', 'FC W Pen-CFS TTR Global Infrastructure Securities', 'CFS MIF-High Growth', 'CFS MIF-Property Securities', 'CFS MIF-Geared Share NEF', 'CFS MIF-Australian Share', 'CFS MIF-Geared Global Share', 'CFS MIF-Global Tech & Comm', 'CFS MIF-Stewart Inv Worldwide Leaders Sustainability', 'CFS MIF-Geared Share', 'CFS MIF-Diversified', 'CFS MIF-Janus Henderson Global Natural Resources Fund', 'CFS MIF-Macquarie Australian Emerging Companies', 'CFS MIF-Balanced', 'CFS MIF-Conservative', 'CFS MIF-Imputation', 'CFS MIF-Global Health & Biotech', 'Dimensional Australia Core Equity Trust - Active ETF']\n", - "All Providers Results: \n", - "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", - "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9190 \t0.8787 \t0.9633 \t0.8514 \t247 \t210 \t2 \t29 \t8 \n", - "management_fee \t0.9330 \t0.9038 \t0.9643 \t0.8755 \t247 \t216 \t2 \t23 \t8 \n", - "performance_fee_costs \t0.8939 \t0.8939 \t0.8939 \t0.8474 \t191 \t160 \t51 \t19 \t19 \n", - "interposed_vehicle_performance_fee_cost \t0.9756 \t0.9524 \t1.0000 \t0.9960 \t20 \t20 \t228 \t1 \t0 \n", - "administration_fees \t0.8974 \t0.8140 \t1.0000 \t0.9357 \t70 \t70 \t163 \t16 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t71 \t71 \t178 \t0 \t0 
\n", - "buy_spread \t0.9110 \t0.8788 \t0.9457 \t0.8635 \t201 \t174 \t41 \t24 \t10 \n", - "sell_spread \t0.9138 \t0.8838 \t0.9459 \t0.8675 \t201 \t175 \t41 \t23 \t10 \n", - "minimum_initial_investment \t0.9853 \t0.9711 \t1.0000 \t0.9799 \t168 \t168 \t76 \t5 \t0 \n", - "benchmark_name \t0.9077 \t0.9219 \t0.8939 \t0.9518 \t67 \t59 \t178 \t5 \t7 \n", - "TOTAL \t0.9337 \t0.9098 \t0.9607 \t0.9169 \t1483 \t1323 \t960 \t145 \t184 \n", - "Total Shares Matched - 249\n", - "Total Shares Not Matched - 110\n", - "Percentage of Shares Matched - 69.35933147632312\n", - "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA IP-OnePath Australian Shares', 'ANZ OA IP-OnePath Diversified Fixed Interest', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OneAnswer Investment Portfolio - Schroder Strategic Growth -NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. 
Rowe Price Dyna Gl Bond NE', 'OnePath OA Investment Portfolio-BlackRock Tactical Growth EF', 'OnePath OA Inv-Greencape Broadcap EF', 'OnePath OA Inv-Nikko AM Australian Shares EF', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP- Pendal Monthly Income Plus-NEF', 'OnePath OA IP-Alternatives Growth Fund-EF/Sel', 'OnePath OA IP-Alternatives Growth Fund-NEF', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-EF/Sel', 'OnePath OA IP-Bennelong Australian Equities-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-NEF', 'OnePath OA IP-Fidelity Australian Equities-EF/Sel', 'OnePath OA IP-Investors Mutual Australian Share Trust- EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-Merlon Australian Share Income-EF/Sel', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual Balanced Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Balanced Growth Trust-NEF', 
'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Balanced Trust-EF/Sel', 'OnePath OA IP-UBS Balanced Trust-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Defensive Trust-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Advantage Australian Equity -NE', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -EF/Sel', 'OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth -NE', 'OnePath OneAnswer Investment Portfolio - First Sentier Imputation -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Australian Shares Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath High Growth Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath High 
Growth Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Schroder Strategic Growth -EF/Sel', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'OnePath OA IP-Ausbil Australian Emerging Leaders Trust-NEF', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKSF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKSF - Platinum Asia Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'HOSTPLUS Fixed Interest Indexed Super', 'Lifeplan Investment Bond Perpetual Balanced Growth', 'Lifeplan Investment Bond Perpetual Conservative Growth', 'Lifeplan Investment Bond Perpetual Industrial Share', 'Lifeplan Investment Bond Vanguard® Australian Shares Index']\n" + "management_fee_and_costs \t0.9515 \t0.9074 \t1.0000 \t0.9074 \t54 \t49 \t0 \t5 \t0 \n", + "management_fee \t0.9515 \t0.9074 \t1.0000 \t0.9074 \t54 \t49 \t0 \t5 \t0 \n", + "performance_fee_costs \t0.9796 \t0.9796 \t0.9796 \t0.9630 \t50 \t48 \t4 \t1 \t1 \n", + "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t54 \t54 \t0 \t0 
\t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t16 \t16 \t38 \t0 \t0 \n", + "buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t34 \t0 \t0 \n", + "sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t34 \t0 \t0 \n", + "TOTAL \t0.9832 \t0.9706 \t0.9971 \t0.9683 \t268 \t256 \t110 \t11 \t1 \n", + "Total Shares Matched - 54\n", + "Total Shares Not Matched - 16\n", + "Percentage of Shares Matched - 77.14285714285715\n", + "Not Matched Shares Name List - ['Vision Balanced Growth Pen', 'CFS FC W PSup-FirstRate Term Dep (10yr)', 'CFS FC W PSup-FirstRate Term Dep (15yr)', 'CFS FC W PSup-FirstRate Term Dep (2yr)', 'CFS FC W PSup-FirstRate Term Dep (3yr)', 'CFS FC W PSup-FirstRate Term Dep (5yr)', 'CFS FC W PSup-FirstRate Term Dep (7yr)', 'AV Australian Shares TTR', 'AV Balanced Growth TTR', 'AV Cash TTR', 'AV Conservative Growth TTR', 'AV Diversified Index TTR', 'AV Growth TTR', 'AV High Growth TTR', 'AV International Shares TTR', 'AV Stable Growth TTR']\n" ] } ], @@ -468,8 +430,9 @@ "print(\"\\n\")\n", "print(\"\\n\")\n", "document_list_file_list = [None, \n", - " \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", - " \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n", + " # \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", + " # \"./sample_documents/aus_prospectus_17_documents_sample.txt\"\n", + " ]\n", "# document_list_file_list = [None]\n", "for document_list_file in document_list_file_list:\n", " document_list = None\n", @@ -514,150 +477,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 
401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA IP-OnePath Australian Shares NE', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Diversified Bond Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 
'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only)', 'truth': '0.24', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund', 'truth': '0.03', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Technology Fund', 'truth': '0', 
'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Ausbil Aus. Emrging Leaders', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Investors Mutual Aus. Shre', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Macquarie Inc Opportunities', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - IncomeBuilder', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - Hedged 
Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Hedged Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Div. Fixed Interest Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PM CAPITAL Global Companies', 'truth': '1.54', 'generated': '1.45', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated 
is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Fixed Interest Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Equity Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP Generations - AMP Cash Mgmt', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Property Securities Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock International Equity Index (Unhedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock International Equity Index (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", 
- "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active High Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Moderately Defensive', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Defensive Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Higher Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and 
generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A (Hedged) Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 
'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Value Trust -Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 530101994, 'sec_name': 'Dimensional Global Core Equity Tr AUDHdg', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Fixed Interest Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Australian Equity Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP Generations - AMP Cash Mgmt', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock Property Securities Index', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP - Generations - BlackRock International Equity Index (Unhedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266893, 'sec_name': 'AMP 
- Generations - BlackRock International Equity Index (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539241700, 'sec_name': 'North Professional Balanced', 'truth': '0', 'generated': '0.05', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539261734, 'sec_name': 'ipac life choices Income Generator', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active High Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Moderately Defensive', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Growth Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Balanced', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 539266874, 'sec_name': 'SUMMIT Select - Active Defensive Units', 'truth': '0', 'generated': '0.06', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 527969661, 'sec_name': 'JPMorgan Global Equity Premium Income (Hedged) Complex ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 
'performance_fee_costs', 'doc_id': 557526129, 'sec_name': 'Fortlake Real-Higher Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Australian Value Trust - Active ETF', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 531373053, 'sec_name': 'Dimensional Global Small Company Trust', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 541356150, 'sec_name': 'JPMorgan Global Research Enhanced Index Equity Trust - Class I (Hedged)', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557362553, 'sec_name': 'JPMorgan Global Select Equity Active ETF', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 550522985, 'sec_name': 'RQI Global Value – Class A', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A (Hedged) Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 557362556, 'sec_name': 'JPMorgan Global Select Equity Fund - Class A Units', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 377377369, 'sec_name': 'SPDR® S&P 
Emerging Markets Carbon Control Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'truth': '0', 'generated': '0.11', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'ANZ OA IP-OnePath Australian Shares NE', 'truth': '0', 'generated': '0.07', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OA Investment Portfolio-BlackRock Tactical Growth NE', 'truth': '0', 'generated': '0.33', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 401212184, 'sec_name': 'OnePath OneAnswer Investment Portfolio - OnePath Growth Index -NE', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Diversified Bond Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard High Yield Australian Shares Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 409723592, 'sec_name': 'Vanguard Index Australian Property Securities Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Macquarie Income Opps', 'truth': '0.03', 'generated': '0.12', 'error': 'Truth is 
not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Diversified Inc', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Schroder Fixed Income', 'truth': '0', 'generated': '0.01', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 411062815, 'sec_name': 'Perpetual WFP-Perpetual Share Plus L/S', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only)', 'truth': '0.24', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Global Fund (Long Only) P Class', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Asia Fund', 'truth': '0.27', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund', 'truth': '0.03', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Brands Fund P Class', 'truth': '0.03', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Healthcare Fund', 'truth': '0.86', 'generated': '0', 
'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum International Technology Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum European Fund', 'truth': '0.24', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 414751292, 'sec_name': 'Platinum Japan Fund', 'truth': '0.15', 'generated': '0', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 397107472, 'sec_name': 'AMP Capital Specialist Diversified Fixed Income Fund', 'truth': '', 'generated': '0', 'error': 'Truth is null and generated is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Ausbil Aus. Emrging Leaders', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Investors Mutual Aus. 
Shre', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Macquarie Inc Opportunities', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - IncomeBuilder', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - Hedged Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - Hedged Global Share Fund', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Div. 
Fixed Interest Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPFPR - PIMCO Global Bond Wholesale Class', 'truth': '0', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 420339794, 'sec_name': 'MLC MKPF - PM CAPITAL Global Companies', 'truth': '1.54', 'generated': '1.45', 'error': 'Truth is not equal with generated'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond - Allan Gray Australian Equity Fund Class A', 'truth': '0.28', 'generated': '', 'error': 'Generated is null and truth is not null'}\n", - "{'data_point': 'performance_fee_costs', 'doc_id': 446324179, 'sec_name': 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'truth': '0.05', 'generated': '', 'error': 'Generated is null and truth is not null'}\n" - ] - } - ], + "outputs": [], "source": [ "for message_list_element in message_list:\n", " if message_list_element[\"data_point\"] == \"performance_fee_costs\":\n", From 355b145cf7c920245e54626cc9b66c8064c636f0 Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 28 Mar 2025 01:33:33 -0500 Subject: [PATCH 9/9] If found total_annual_dollar_based_charges and could be divisible by 52 or 12, then set the fund name and share name to be document production name --- core/data_extraction.py | 31 ++++++++++ main.py | 4 +- performance.ipynb | 124 ++++++++++++++++++++++++++-------------- 3 files changed, 114 insertions(+), 45 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index 0a1b297..82a35de 100644 --- a/core/data_extraction.py +++ 
b/core/data_extraction.py @@ -289,6 +289,7 @@ class DataExtraction: data_list = self.supplement_ttr_pension(data_list) data_list = self.align_fund_share_name(data_list) data_list = self.supplement_minimum_initial_investment(data_list) + data_list = self.check_total_annual_dollar_based_charges(data_list) data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list) data_list = self.remove_duplicate_data(data_list) if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name: @@ -503,6 +504,36 @@ class DataExtraction: pass return data_list + def check_total_annual_dollar_based_charges(self, data_list: list): + """ + If found total_annual_dollar_based_charges and could be divisible by 52 or 12, + then set the fund name and share name to be document production name. + """ + for data_dict in data_list: + extract_data = data_dict.get("extract_data", {}) + data = extract_data.get("data", []) + found = False + for data_item in data: + keys = list(data_item.keys()) + fund_name = data_item.get("fund_name", "") + share_name = data_item.get("share_name", "") + if len(fund_name) == 0: + continue + if "total_annual_dollar_based_charges" in keys: + value = data_item.get("total_annual_dollar_based_charges", -1) + if len(str(value)) > 0: + value_divide_52 = value / 52 + value_divide_12 = value / 12 + if (value_divide_52 == round(value_divide_52, 4)) or \ + (value_divide_12 == round(value_divide_12, 4)): + data_item["fund_name"] = self.document_production + data_item["share_name"] = self.document_production + found = True + break + if found: + break + return data_list + def post_adjust_for_value_with_production_name(self, data_list: list): """ If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value. 
diff --git a/main.py b/main.py index 8d2eeda..c2e4fed 100644 --- a/main.py +++ b/main.py @@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category(): def test_post_adjust_extract_data(): - doc_id = "462780211" + doc_id = "448576924" pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1538,7 +1538,7 @@ if __name__ == "__main__": with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() if len(doc_id.strip()) > 0] - # special_doc_id_list = ["420339794"] + # special_doc_id_list = ["448576924"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/performance.ipynb b/performance.ipynb index 6a2f94c..1a2ea6e 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -15,51 +15,51 @@ "from utils.similarity import Similarity\n", "\n", "\n", - "# imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", - "# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", - "# \"Minimum Initial Investment\", \"Benchmark\"]\n", - "\n", - "\n", - "# imp_datapoints_mapping = {\n", - "# \"Management Fee and Costs\": \"management_fee_and_costs\",\n", - "# \"Management Fee\": \"management_fee\",\n", - "# \"Performance fee and cost\": \"performance_fee_costs\",\n", - "# \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", - "# \"Administration Fee and costs\": \"administration_fees\",\n", - "# \"Total Annual Dollar Based Charges\": 
\"total_annual_dollar_based_charges\",\n", - "# \"Buy Spread\": \"buy_spread\",\n", - "# \"Sell Spread\": \"sell_spread\",\n", - "# \"Performance Fee\": \"PerformanceFeeCharged\",\n", - "# \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", - "# \"Benchmark\": \"benchmark_name\"\n", - "# }\n", - "\n", - "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\",\n", - " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n", + "imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n", + " \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n", + " \"Minimum Initial Investment\", \"Benchmark\"]\n", "\n", "\n", "imp_datapoints_mapping = {\n", " \"Management Fee and Costs\": \"management_fee_and_costs\",\n", " \"Management Fee\": \"management_fee\",\n", " \"Performance fee and cost\": \"performance_fee_costs\",\n", + " \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n", " \"Administration Fee and costs\": \"administration_fees\",\n", " \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", " \"Buy Spread\": \"buy_spread\",\n", - " \"Sell Spread\": \"sell_spread\"\n", + " \"Sell Spread\": \"sell_spread\",\n", + " \"Performance Fee\": \"PerformanceFeeCharged\",\n", + " \"Minimum Initial Investment\": \"minimum_initial_investment\",\n", + " \"Benchmark\": \"benchmark_name\"\n", "}\n", "\n", - "# path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", - "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", + "# imp_datapoints = [\"Management Fee and Costs\", 
\"Management Fee\", \"Performance fee and cost\",\n", + "# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n", + "\n", + "\n", + "# imp_datapoints_mapping = {\n", + "# \"Management Fee and Costs\": \"management_fee_and_costs\",\n", + "# \"Management Fee\": \"management_fee\",\n", + "# \"Performance fee and cost\": \"performance_fee_costs\",\n", + "# \"Administration Fee and costs\": \"administration_fees\",\n", + "# \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n", + "# \"Buy Spread\": \"buy_spread\",\n", + "# \"Sell Spread\": \"sell_spread\"\n", + "# }\n", + "\n", + "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", + "# path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n", - "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250327230323.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250328004858.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250328010350.xlsx\"\n", + "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250328004858.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, 
"metadata": {}, "outputs": [ { @@ -363,18 +363,56 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9515 \t0.9074 \t1.0000 \t0.9074 \t54 \t49 \t0 \t5 \t0 \n", - "management_fee \t0.9515 \t0.9074 \t1.0000 \t0.9074 \t54 \t49 \t0 \t5 \t0 \n", - "performance_fee_costs \t0.9796 \t0.9796 \t0.9796 \t0.9630 \t50 \t48 \t4 \t1 \t1 \n", - "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t54 \t54 \t0 \t0 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t16 \t16 \t38 \t0 \t0 \n", - "buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t34 \t0 \t0 \n", - "sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t34 \t0 \t0 \n", - "TOTAL \t0.9832 \t0.9706 \t0.9971 \t0.9683 \t268 \t256 \t110 \t11 \t1 \n", - "Total Shares Matched - 54\n", - "Total Shares Not Matched - 16\n", - "Percentage of Shares Matched - 77.14285714285715\n", - "Not Matched Shares Name List - ['Vision Balanced Growth Pen', 'CFS FC W PSup-FirstRate Term Dep (10yr)', 'CFS FC W PSup-FirstRate Term Dep (15yr)', 'CFS FC W PSup-FirstRate Term Dep (2yr)', 'CFS FC W PSup-FirstRate Term Dep (3yr)', 'CFS FC W PSup-FirstRate Term Dep (5yr)', 'CFS FC W PSup-FirstRate Term Dep (7yr)', 'AV Australian Shares TTR', 'AV Balanced Growth TTR', 'AV Cash TTR', 'AV Conservative Growth TTR', 'AV Diversified Index TTR', 'AV Growth TTR', 'AV High Growth TTR', 'AV International Shares TTR', 'AV Stable Growth TTR']\n" + "management_fee_and_costs \t0.9324 \t0.8811 \t0.9901 \t0.8734 \t458 \t400 \t0 \t54 \t4 \n", + "management_fee \t0.9615 \t0.9339 \t0.9907 \t0.9258 \t458 \t424 \t0 \t30 \t4 \n", + "performance_fee_costs \t0.9165 \t0.9088 \t0.9244 \t0.8930 \t306 \t269 \t140 \t27 \t22 \n", + "interposed_vehicle_performance_fee_cost \t0.9536 \t0.9114 \t1.0000 \t0.9847 \t73 \t72 \t379 \t7 \t0 \n", + "administration_fees \t0.9878 \t0.9759 \t1.0000 \t0.9956 \t81 \t81 
\t375 \t2 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t68 \t68 \t390 \t0 \t0 \n", + "buy_spread \t0.9346 \t0.9081 \t0.9628 \t0.8974 \t377 \t336 \t75 \t34 \t13 \n", + "sell_spread \t0.9331 \t0.9054 \t0.9626 \t0.8952 \t377 \t335 \t75 \t35 \t13 \n", + "minimum_initial_investment \t0.9635 \t0.9814 \t0.9463 \t0.9476 \t335 \t317 \t117 \t6 \t18 \n", + "benchmark_name \t0.9298 \t0.8968 \t0.9653 \t0.9541 \t153 \t139 \t298 \t16 \t5 \n", + "TOTAL \t0.9513 \t0.9303 \t0.9742 \t0.9367 \t2686 \t2441 \t1849 \t211 \t79 \n", + "Total Shares Matched - 411\n", + "Total Shares Not Matched - 106\n", + "Percentage of Shares Matched - 79.49709864603481\n", + "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'BT-BlackRock Scientific Diversified Growth', 'Mercer Multi-manager Balanced Fund – Retail Units', 'Mercer Multi-manager Conservative Fund – Retail Units', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. 
Rowe Price Dyna Gl Bond NE', 'OnePath OA Inv-Nikko AM Australian Shares EF', 'OnePath OA Inv-Nikko AM Australian Shares NEF', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-NEF', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Platinum International Trust-EF/Sel', 'OnePath OA IP-Platinum International Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Defensive Trust-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer 
Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -NE', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPFPR - Altrinsic Global Eq Trust', 'MLC MKPFPR - BlackRock Global Allocation', 'MLC MKPF - Hedged Global Share Fund', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MKPFPR - MLC - Platinum Global Fund', 'MLC MasterKey Pension Fundamentals - Perpetual Australian Share', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKPF - Perpetual WS Ethical SRI Fund', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKPF - PIMCO Div. 
Fixed Interest Wholesale Class', 'MLC MKSF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKPF - PIMCO Global Bond Wholesale Class', 'MLC MKPFPR - Platinum Asia Fund', 'MLC MKSF - Platinum Asia Fund', 'MLC MKPF - Platinum International Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKPF - PM CAPITAL Global Companies', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKPF - Schroder WS Australian Equity', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'HOSTPLUS Fixed Interest Indexed Super', 'Australian Unity Inv Wholesale Deposits Fund', 'Lifeplan Investment Bond Lifeplan Capital Guaranteed', 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open', 'Dimensional Australian Core Equity Trust', 'CFS MIF-Geared Share NEF', 'BT Imputation Shares Retail', 'Dimensional Australia Core Equity Trust - Active ETF']\n", + "All Providers Results: \n", + "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", + "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", + "management_fee_and_costs \t0.9505 \t0.9058 \t1.0000 \t0.9058 \t191 \t173 \t0 \t18 \t0 \n", + "management_fee \t0.9867 \t0.9738 \t1.0000 \t0.9738 \t191 \t186 \t0 \t5 \t0 \n", + "performance_fee_costs \t0.8832 \t0.8700 \t0.8969 \t0.8796 \t99 \t87 \t81 \t13 \t10 \n", + "interposed_vehicle_performance_fee_cost \t0.9369 \t0.8814 \t1.0000 \t0.9634 \t53 \t52 \t132 \t7 \t0 \n", + "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t15 \t15 \t176 \t0 \t0 \n", + "buy_spread \t0.9812 \t0.9683 \t0.9946 \t0.9634 \t189 \t183 \t1 \t6 \t1 \n", + "sell_spread \t0.9757 \t0.9577 \t0.9945 \t0.9529 \t189 \t181 \t1 \t8 \t1 \n", + "minimum_initial_investment \t0.9189 \t0.9577 \t0.8831 \t0.8743 \t154 \t136 \t31 \t6 \t18 \n", + 
"benchmark_name \t0.9271 \t0.8812 \t0.9780 \t0.9267 \t99 \t89 \t88 \t12 \t2 \n", + "TOTAL \t0.9512 \t0.9329 \t0.9719 \t0.9378 \t1180 \t1102 \t510 \t75 \t111 \n", + "Total Shares Matched - 186\n", + "Total Shares Not Matched - 4\n", + "Percentage of Shares Matched - 97.89473684210527\n", + "Not Matched Shares Name List - ['Dimensional Australian Core Equity Trust', 'CFS MIF-Geared Share NEF', 'BT Imputation Shares Retail', 'Dimensional Australia Core Equity Trust - Active ETF']\n", + "All Providers Results: \n", + "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", + "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", + "management_fee_and_costs \t0.9190 \t0.8631 \t0.9827 \t0.8502 \t267 \t227 \t0 \t36 \t4 \n", + "management_fee \t0.9426 \t0.9049 \t0.9835 \t0.8914 \t267 \t238 \t0 \t25 \t4 \n", + "performance_fee_costs \t0.9333 \t0.9286 \t0.9381 \t0.9026 \t207 \t182 \t59 \t14 \t12 \n", + "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t247 \t0 \t0 \n", + "administration_fees \t0.9851 \t0.9706 \t1.0000 \t0.9925 \t66 \t66 \t199 \t2 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t68 \t68 \t199 \t0 \t0 \n", + "buy_spread \t0.8844 \t0.8453 \t0.9273 \t0.8502 \t188 \t153 \t74 \t28 \t12 \n", + "sell_spread \t0.8876 \t0.8508 \t0.9277 \t0.8539 \t188 \t154 \t74 \t27 \t12 \n", + "minimum_initial_investment \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t181 \t181 \t86 \t0 \t0 \n", + "benchmark_name \t0.9346 \t0.9259 \t0.9434 \t0.9738 \t54 \t50 \t210 \t4 \t3 \n", + "TOTAL \t0.9487 \t0.9289 \t0.9703 \t0.9315 \t1506 \t1339 \t1148 \t136 \t158 \n", + "Total Shares Matched - 267\n", + "Total Shares Not Matched - 102\n", + "Percentage of Shares Matched - 72.35772357723577\n", + "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'BT-BlackRock Scientific Diversified Growth', 'Mercer Multi-manager Balanced Fund – Retail 
Units', 'Mercer Multi-manager Conservative Fund – Retail Units', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond NE', 'OnePath OA Inv-Nikko AM Australian Shares EF', 'OnePath OA Inv-Nikko AM Australian Shares NEF', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Bentham Global Income Trust-NEF', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Platinum International Trust-EF/Sel', 'OnePath OA IP-Platinum 
International Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Defensive Trust-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -NE', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPFPR - Altrinsic Global Eq Trust', 'MLC MKPFPR - BlackRock Global Allocation', 'MLC MKPF - Hedged Global Share Fund', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MKPFPR - MLC - Platinum Global Fund', 'MLC MasterKey Pension 
Fundamentals - Perpetual Australian Share', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKPF - Perpetual WS Ethical SRI Fund', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKPF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKSF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKPF - PIMCO Global Bond Wholesale Class', 'MLC MKPFPR - Platinum Asia Fund', 'MLC MKSF - Platinum Asia Fund', 'MLC MKPF - Platinum International Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKPF - PM CAPITAL Global Companies', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKPF - Schroder WS Australian Equity', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'HOSTPLUS Fixed Interest Indexed Super', 'Australian Unity Inv Wholesale Deposits Fund', 'Lifeplan Investment Bond Lifeplan Capital Guaranteed', 'Lifeplan Investment Bond MLC Horizon 2-Capital Stable Open']\n" ] } ], @@ -430,8 +468,8 @@ "print(\"\\n\")\n", "print(\"\\n\")\n", "document_list_file_list = [None, \n", - " # \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", - " # \"./sample_documents/aus_prospectus_17_documents_sample.txt\"\n", + " \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", + " \"./sample_documents/aus_prospectus_17_documents_sample.txt\"\n", " ]\n", "# document_list_file_list = [None]\n", "for document_list_file in document_list_file_list:\n",