clean code

This commit is contained in:
Blade He 2025-03-24 17:10:16 -05:00
parent 9be6d1296d
commit 4edc4b4768
6 changed files with 180 additions and 98 deletions

View File

@ -394,11 +394,22 @@ class DataExtraction:
fund_name = data_item.get("fund_name", "")
if len(fund_name) == 0:
continue
share_name = data_item.get("share_name", "")
updated_fund_name = self.update_pension_ttr_fund_name(fund_name)
if updated_fund_name != fund_name:
fund_name = updated_fund_name
data_item["fund_name"] = fund_name
updated_share_name = self.update_pension_ttr_fund_name(share_name)
if updated_share_name != share_name:
share_name = updated_share_name
data_item["share_name"] = share_name
fund_name_splits = fund_name.split()
if fund_name_splits[-1] == "TTR":
if fund_name_splits[-1] == "TTR" and fund_name not in ttr_fund_name_list:
ttr_fund_name_list.append(fund_name)
exist_ttr = True
if fund_name_splits[-1] == "Pension":
if fund_name_splits[-1] == "Pension" and fund_name not in pension_fund_name_list:
pension_fund_name_list.append(fund_name)
exist_pension = True
if exist_ttr and exist_pension:
@ -449,6 +460,22 @@ class DataExtraction:
data.extend(new_item_list)
return data_list
def update_pension_ttr_fund_name(self, investment_name: str):
    """
    Normalise a fund/share name that carries a pension or TTR table-header prefix.

    A name starting with one of the known pension prefixes
    ("Retirement account", "Account-based pension", "Account based pension")
    has the prefix words removed and " Pension" appended; a name starting with
    one of the TTR prefixes ("Transition to Retirement account",
    "Pre-retirement pension", "Pre retirement pension") has the prefix words
    removed and " TTR" appended. Matching is case-insensitive.

    e.g. "Retirement account Cash" -> "Cash Pension"
         "Transition to Retirement account Cash" -> "Cash TTR"

    :param investment_name: raw fund name or share name
    :return: the rewritten name, or investment_name unchanged when no prefix
             matches or when nothing but the prefix is present
    """
    # Empty / missing names have nothing to rewrite (also avoids calling
    # str methods on None).
    if not investment_name:
        return investment_name

    # Insertion order keeps the original precedence: pension prefixes are
    # checked before TTR prefixes.
    suffix_by_prefix = {
        "retirement account": "Pension",
        "account-based pension": "Pension",
        "account based pension": "Pension",
        "transition to retirement account": "TTR",
        "pre-retirement pension": "TTR",
        "pre retirement pension": "TTR",
    }

    name_words = investment_name.split()
    # Compare on a whitespace-normalised copy so that doubled spaces or line
    # breaks inside the prefix do not prevent a match.
    normalized_lower = " ".join(name_words).lower()

    for prefix, suffix in suffix_by_prefix.items():
        if not normalized_lower.startswith(prefix):
            continue
        remaining_words = name_words[len(prefix.split()):]
        # Only rewrite when an actual investment option follows the prefix;
        # otherwise the result would be a bare " Pension" / " TTR".
        if remaining_words:
            return " ".join(remaining_words + [suffix])
    return investment_name
def check_administration_fees(self, data_list: list):
"""
If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list
@ -668,9 +695,10 @@ class DataExtraction:
for mf in management_fee_list:
mf_fund_name = mf.get("fund_name", "")
mf_share_name = mf.get("share_name", "")
if (mf_fund_name == fund_name and mf_share_name == share_name) or \
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
# if (mf_fund_name == fund_name and mf_share_name == share_name) or \
# (len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
# (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
if (mf_fund_name == fund_name and mf_share_name == share_name):
if exist_complex_rule_keywords and \
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
mf["management_fee"] = management_fee
@ -693,9 +721,10 @@ class DataExtraction:
for mfc in management_fee_costs_list:
mfc_fund_name = mfc.get("fund_name", "")
mfc_share_name = mfc.get("share_name", "")
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
# if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
# (len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
# (mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
if (mfc_fund_name == fund_name and mfc_share_name == share_name):
if exist_complex_rule_keywords and \
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
mfc["management_fee_and_costs"] = management_fee_costs

View File

@ -48,6 +48,24 @@
"---Example End---",
"Correct fund name: MLC Horizon 2 Income Portfolio",
"Correct share name: MLC Horizon 2 Income Portfolio",
"f. In table header, \"Retirement account\" or \"Account-based pension\" means \"Pension\"; ",
"\"Transition to Retirement account\" or \"Pre-retirement pension\" means \"TTR\". ",
"Please append them to the fund name and share name.",
"f.1 Example 1",
"---Example 1 Start---",
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n",
"---Example 1 End---",
"The prefix is \"Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Retirement account Cash\".",
"f.2 Example 2",
"---Example 2 Start---",
"Transition to Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n",
"---Example 2 End---",
"The prefix is \"Transition to Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Transition to Retirement account Cash\".",
"f.3 Example 3",
"---Example 3 Start---",
"Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Funds reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6",
"---Example 3 End---",
"Although \"Retirement account\" and \"Transition to Retirement account\" both appear, there is no investment option, so fund name and share name should be: \"Rest Pension\".",
"\n",
"- 3. Only extract the latest data from context:",
"If with multiple data values in same row, please extract the latest.",
@ -62,8 +80,13 @@
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
"- 5. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\".",
"- 6. Identify the value of data point and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0",
"---Example Start---",
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced Indexed 0.00% 0.00% 0.00% 0.00%\n",
"---Example End---",
"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values."
],
"investment_level": {
"total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
@ -251,17 +274,24 @@
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:",
"{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}",
"---Example 2 Start---",
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCapital Stable 0.46% 0.04% 0.08% 0.54%\nBalanced 0.52% 0.06% 0.10%0.62% \n",
"---Example 2 End---",
"The column: \"(A) Investment fees and costs (including (B) performance fees) (pa)*\" includes \"(B) performance fees) (pa)*\", we should subtract the \"(B) performance fees) (pa)*\" value, just output the pure management fee and costs value.",
"Besides, the \"Retirement account\" is the pre-fix fund name, should output it with fund/ share name together, e.g. \"Retirement account Capital Stable\"",
"The output should be:",
"{\"data\": [{\"fund name\": \"Retirement account Capital Stable\", \"share name\": \"Retirement account Capital Stable\", \"management_fee_and_costs\": 0.42, \"management_fee\": 0.42, \"performance_fee_costs\": 0.04}, {\"fund name\": \"Retirement account Balanced\", \"share name\": \"Retirement account Balanced\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.46, \"performance_fee_costs\": 0.06}]}",
"---Example 3 Start---",
"Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n",
"---Example 2 End---",
"---Example 3 End---",
"The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.",
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\".",
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.",
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.",
"So the output should be:",
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}",
"---Example 3 Start---",
"---Example 4 Start---",
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
"---Example 3 End---",
"---Example 4 End---",
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
@ -296,36 +326,56 @@
"---Example End---",
"The values in example is **Maximum management fee**, should ignore all of them.",
"The Output should be:",
"{\"data\": []}"
"{\"data\": []}",
"K. The management fee and costs in paragraph with specific fund/ share prefix name: \"Account-based pension\" or \"Pre-retirement pension\"",
"---Example 1 Start---",
"Account-based pension \nInvestment fees \nand costs 2 \nHigh Growth 0.45%, Growth 0.49%",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Account-based pension High Growth\", \"share name\": \"Account-based pension High Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45}, {\"fund name\": \"Account-based pension Growth\", \"share name\": \"Account-based pension Growth\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]}",
"---Example 2 Start---",
"Pre-retirement pension \nWe generally calculate \nand deduct this fee daily when unit \nprices are determined. \nHigh Growth 0.48%, Growth 0.50%",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}"
],
"administration_fees":[
"Administration fees and costs and total annual dollar-based charges are share class level data.",
"Simple case:",
"----Example 1 Start----",
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
"Fees and costs summary \n\nVision income streams \n\nType of fee Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n2 \n0.25% pa of your account balance (made up of \n0.25% of your account balance which is capped \nat $1,050 pa plus a reserving margin of 0.00% \npa of each investment options assets).",
"----Example 1 End----",
"According to example, the administration fee is 0.25% pa, so administration_fees is 0.25, ",
"The output should be:",
"{\"data\": [{\"fund name\": \"Vision income streams\", \"share name\": \"Vision income streams\", \"administration_fees\": 0.25}]}",
"\n",
"----Example 2 Start----",
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
"----Example 2 End----",
"According to example, the administration fee is $1.30 per week plus 0.29% pa, so administration_fees is 0.29, ",
"total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
"The output should be:",
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29, \"total_annual_dollar_based_charges\": 67.6}]}",
"\n",
"----Example 2 Start----",
"----Example 3 Start----",
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouses \ncombined account balances exceed $969,410 (conditions apply)",
"----Example 2 End----",
"----Example 3 End----",
"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
"total_annual_dollar_based_charges is 1 * 52 = 52",
"The output should be:",
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}",
"---Example 3 Start---",
"\n",
"---Example 4 Start---",
"\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. \n",
"---Example 3 End---",
"---Example 4 End---",
"According to example, the administration fee is $1.30 per week plus 0.50% p.a., so administration_fees is 0.5, ",
"total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
"The output should be:",
"{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.5, \"total_annual_dollar_based_charges\": 67.6}]}",
"---Example 4 Start---",
"\n",
"---Example 5 Start---",
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nTTR income stream Retirement income stream Reference \nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouses \ncombined account balances exceed $969,410 (conditions apply) \nRefer to the Fees and \nother costs section on \npages 40-46 for details \n",
"---Example 4 End---",
"---Example 5 End---",
"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
"total_annual_dollar_based_charges is 1 * 52 = 52",
"The output should be:",
@ -341,8 +391,21 @@
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}",
"---Example 2 Start---",
"Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Funds \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n",
"Mine Super\nType of fee or cost Amount (% pa) How and when paid \nOngoing annual fees and costs \n1 \nWe generally calculate and \ndeduct this fee daily when unit \nprices are determined. \nAdministration fees \nand costs \n0.16% pa \nPlus \n0.031% pa. \n",
"---Example 2 End---",
"According to example, the relevant values: 0.16% and 0.031%, so administration_fees is 0.16 + 0.031 = 0.191",
"The output should be:",
"{\"data\": [{\"fund name\": \"Mine Super\", \"share name\": \"Mine Super\", \"administration_fees\": 0.191}]}",
"---Example 3 Start---",
"Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Funds reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6",
"---Example 3 End---",
"According to the example, the administration fee is $1.50 per week plus 0.10% pa, Administration costs is 0.09% pa so administration_fees is 0.1 + 0.09 = 0.19, ",
"total_annual_dollar_based_charges is 1.50 * 52 = 78",
"The output should be:",
"{\"data\": [{\"fund name\": \"Rest Pension\", \"share name\": \"Rest Pension\", \"administration_fees\": 0.19, \"total_annual_dollar_based_charges\": 78}]}",
"---Example 4 Start---",
"Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Funds \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n",
"---Example 4 End---",
"Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees."
],
"total_annual_dollar_based_charges": [

View File

@ -1533,11 +1533,11 @@ if __name__ == "__main__":
# doc_source = "emea_ar"
if doc_source == "aus_prospectus":
document_sample_file = (
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
)
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx"
# special_doc_id_list = ["441280757"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -15,35 +15,49 @@
"from utils.similarity import Similarity\n",
"\n",
"\n",
"imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n",
" \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n",
" \"Minimum Initial Investment\", \"Benchmark\"]\n",
"# imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\", \"Interposed vehicle Performance fee and Costs\",\n",
"# \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\", \"Performance Fee\",\n",
"# \"Minimum Initial Investment\", \"Benchmark\"]\n",
"\n",
"\n",
"# imp_datapoints_mapping = {\n",
"# \"Management Fee and Costs\": \"management_fee_and_costs\",\n",
"# \"Management Fee\": \"management_fee\",\n",
"# \"Performance fee and cost\": \"performance_fee_costs\",\n",
"# \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n",
"# \"Administration Fee and costs\": \"administration_fees\",\n",
"# \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n",
"# \"Buy Spread\": \"buy_spread\",\n",
"# \"Sell Spread\": \"sell_spread\",\n",
"# \"Performance Fee\": \"PerformanceFeeCharged\",\n",
"# \"Minimum Initial Investment\": \"minimum_initial_investment\",\n",
"# \"Benchmark\": \"benchmark_name\"\n",
"# }\n",
"\n",
"imp_datapoints = [\"Management Fee and Costs\", \"Management Fee\", \"Performance fee and cost\",\n",
" \"Administration Fee and costs\", \"Total Annual Dollar Based Charges\", \"Buy Spread\", \"Sell Spread\"]\n",
"\n",
"\n",
"imp_datapoints_mapping = {\n",
" \"Management Fee and Costs\": \"management_fee_and_costs\",\n",
" \"Management Fee\": \"management_fee\",\n",
" \"Performance fee and cost\": \"performance_fee_costs\",\n",
" \"Interposed vehicle Performance fee and Costs\": \"interposed_vehicle_performance_fee_cost\",\n",
" \"Administration Fee and costs\": \"administration_fees\",\n",
" \"Total Annual Dollar Based Charges\": \"total_annual_dollar_based_charges\",\n",
" \"Buy Spread\": \"buy_spread\",\n",
" \"Sell Spread\": \"sell_spread\",\n",
" \"Performance Fee\": \"PerformanceFeeCharged\",\n",
" \"Minimum Initial Investment\": \"minimum_initial_investment\",\n",
" \"Benchmark\": \"benchmark_name\"\n",
" \"Sell Spread\": \"sell_spread\"\n",
"}\n",
"\n",
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n",
"# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317.xlsx\"\n",
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250319000625.xlsx\"\n",
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250324170432.xlsx\"\n",
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@ -316,7 +330,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 34,
"metadata": {},
"outputs": [
{
@ -330,53 +344,17 @@
"All Providers Results: \n",
"Document List File - None\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9123 \t0.8465 \t0.9891 \t0.8387 \t433 \t364 \t0 \t66 \t4 \n",
"management_fee \t0.9284 \t0.8744 \t0.9895 \t0.8664 \t433 \t376 \t0 \t54 \t4 \n",
"performance_fee_costs \t0.9217 \t0.8691 \t0.9811 \t0.8986 \t291 \t259 \t131 \t39 \t5 \n",
"interposed_vehicle_performance_fee_cost \t0.9536 \t0.9114 \t1.0000 \t0.9839 \t73 \t72 \t355 \t7 \t0 \n",
"administration_fees \t0.9857 \t0.9857 \t0.9857 \t0.9954 \t70 \t69 \t363 \t1 \t1 \n",
"total_annual_dollar_based_charges \t0.9920 \t0.9841 \t1.0000 \t0.9977 \t62 \t62 \t371 \t1 \t0 \n",
"buy_spread \t0.9483 \t0.9187 \t0.9798 \t0.9147 \t370 \t339 \t58 \t30 \t7 \n",
"sell_spread \t0.9526 \t0.9268 \t0.9799 \t0.9217 \t370 \t342 \t58 \t27 \t7 \n",
"minimum_initial_investment \t0.9593 \t0.9641 \t0.9547 \t0.9424 \t309 \t295 \t114 \t11 \t14 \n",
"benchmark_name \t0.8738 \t0.8084 \t0.9507 \t0.9101 \t157 \t135 \t260 \t32 \t7 \n",
"TOTAL \t0.9428 \t0.9089 \t0.9810 \t0.9270 \t2568 \t2313 \t1710 \t268 \t49 \n",
"Total Funds Matched - 434\n",
"Total Funds Not Matched - 131\n",
"Percentage of Funds Matched - 76.8141592920354\n",
"All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9462 \t0.9027 \t0.9940 \t0.8978 \t185 \t167 \t0 \t18 \t1 \n",
"management_fee \t0.9724 \t0.9514 \t0.9944 \t0.9462 \t185 \t176 \t0 \t9 \t1 \n",
"performance_fee_costs \t0.9239 \t0.8750 \t0.9785 \t0.9194 \t99 \t91 \t80 \t13 \t2 \n",
"interposed_vehicle_performance_fee_cost \t0.9369 \t0.8814 \t1.0000 \t0.9624 \t53 \t52 \t127 \t7 \t0 \n",
"administration_fees \t0.9412 \t1.0000 \t0.8889 \t0.9946 \t9 \t8 \t177 \t0 \t1 \n",
"buy_spread \t0.9779 \t0.9672 \t0.9888 \t0.9570 \t183 \t177 \t1 \t6 \t2 \n",
"sell_spread \t0.9835 \t0.9781 \t0.9890 \t0.9677 \t183 \t179 \t1 \t4 \t2 \n",
"minimum_initial_investment \t0.9306 \t0.9571 \t0.9054 \t0.8925 \t148 \t134 \t32 \t6 \t14 \n",
"benchmark_name \t0.9206 \t0.8878 \t0.9560 \t0.9194 \t99 \t87 \t84 \t11 \t4 \n",
"TOTAL \t0.9481 \t0.9334 \t0.9661 \t0.9397 \t1144 \t1071 \t502 \t74 \t76 \n",
"Total Funds Matched - 186\n",
"Total Funds Not Matched - 10\n",
"Percentage of Funds Matched - 94.89795918367348\n",
"All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.8854 \t0.8041 \t0.9850 \t0.7944 \t248 \t197 \t0 \t48 \t3 \n",
"management_fee \t0.8929 \t0.8163 \t0.9852 \t0.8065 \t248 \t200 \t0 \t45 \t3 \n",
"performance_fee_costs \t0.9205 \t0.8660 \t0.9825 \t0.8831 \t192 \t168 \t51 \t26 \t3 \n",
"interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t228 \t0 \t0 \n",
"administration_fees \t0.9919 \t0.9839 \t1.0000 \t0.9960 \t61 \t61 \t186 \t1 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t62 \t62 \t186 \t0 \t0 \n",
"buy_spread \t0.9178 \t0.8710 \t0.9701 \t0.8831 \t187 \t162 \t57 \t24 \t5 \n",
"sell_spread \t0.9209 \t0.8763 \t0.9702 \t0.8871 \t187 \t163 \t57 \t23 \t5 \n",
"minimum_initial_investment \t0.9847 \t0.9699 \t1.0000 \t0.9798 \t161 \t161 \t82 \t5 \t0 \n",
"benchmark_name \t0.8000 \t0.6957 \t0.9412 \t0.9032 \t58 \t48 \t176 \t21 \t3 \n",
"TOTAL \t0.9314 \t0.8883 \t0.9834 \t0.9133 \t1424 \t1242 \t1023 \t193 \t98 \n",
"Total Funds Matched - 248\n",
"Total Funds Not Matched - 121\n",
"Percentage of Funds Matched - 67.20867208672087\n"
"management_fee_and_costs \t0.7470 \t0.6739 \t0.8378 \t0.5962 \t52 \t31 \t0 \t15 \t6 \n",
"management_fee \t0.8046 \t0.7609 \t0.8537 \t0.6731 \t52 \t35 \t0 \t11 \t6 \n",
"performance_fee_costs \t0.7805 \t0.9697 \t0.6531 \t0.6538 \t50 \t32 \t2 \t1 \t17 \n",
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t52 \t52 \t0 \t0 \t0 \n",
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t16 \t16 \t36 \t0 \t0 \n",
"buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n",
"sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n",
"TOTAL \t0.9046 \t0.9149 \t0.9064 \t0.8462 \t258 \t202 \t106 \t27 \t29 \n",
"Total Funds Matched - 52\n",
"Total Funds Not Matched - 28\n",
"Percentage of Funds Matched - 65.0\n"
]
}
],
@ -434,9 +412,10 @@
"\n",
"print(\"\\n\")\n",
"print(\"\\n\")\n",
"document_list_file_list = [None, \n",
" \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n",
" \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n",
"# document_list_file_list = [None, \n",
"# \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n",
"# \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n",
"document_list_file_list = [None]\n",
"for document_list_file in document_list_file_list:\n",
" document_list = None\n",
" if document_list_file is not None:\n",
@ -637,7 +616,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "blade",
"display_name": "emea_ar_test",
"language": "python",
"name": "python3"
},

View File

@ -1483,13 +1483,21 @@ def set_mapping_to_data_side_documents_data():
# mapping_sheet = "document_mapping"
# output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx"
data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx"
# data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx"
# data_sheet = "ground_truth"
# raw_name_column = "raw_share_name"
# mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# mapping_sheet = "document_mapping"
# raw_name_mapping_column = None
# output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth.xlsx"
data_sheet = "ground_truth"
raw_name_column = "raw_share_name"
mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
mapping_file_path = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx"
mapping_sheet = "document_mapping"
raw_name_mapping_column = None
output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx"
set_mapping_to_raw_name_data(data_file_path=data_file_path,
data_sheet=data_sheet,
raw_name_column=raw_name_column,
@ -1582,8 +1590,7 @@ def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/ou
"administration_fees",
"minimum_initial_investment",
"benchmark_name",
"performance_fee",
"performance_fee_charged",
"performance_fee_costs",
"buy_spread",
"sell_spread",
"total_annual_dollar_based_charges",
@ -1593,9 +1600,7 @@ def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/ou
"withdrawal_fee",
"exit_fee",
"switching_fee",
"activity_fee",
"hurdle_rate",
"analyst_name"
"activity_fee"
]]
except Exception as e:
print(e)
@ -1733,7 +1738,7 @@ def update_data_by_latest_ground_truth():
if __name__ == "__main__":
update_data_by_latest_ground_truth()
# update_data_by_latest_ground_truth()
# set_provider_to_ground_truth(
# groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx",
# ground_truth_sheet="Sheet1",
@ -1741,7 +1746,7 @@ if __name__ == "__main__":
# document_mapping_sheet="document_mapping"
# )
# set_mapping_to_data_side_documents_data()
set_mapping_to_data_side_documents_data()
# source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx"
# target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"

View File

@ -0,0 +1,6 @@
553449169
539791362
573372424
448906722
462780211
563608192