A. Metrics score
Blade's updates: 1. Set the secondary key to the share class name instead of the fund name. 2. Exclude data points whose support is 0 when calculating the metrics. 3. Add a message list to store error messages. 4. Support saving metrics and error messages to an Excel file. 5. Support statistics for different document lists. 6. Make F1-Score the first column in the metrics table. B. Optimize the instructions for benchmark_name.
This commit is contained in:
parent
a090b5cc9e
commit
a48af9ddf0
|
|
@ -271,8 +271,49 @@ class DataExtraction:
|
|||
data_list = self.post_adjust_management_fee_costs(data_list)
|
||||
|
||||
data_list = self.check_administration_fees(data_list)
|
||||
data_list = self.check_benchmark(data_list)
|
||||
return data_list
|
||||
|
||||
def check_benchmark(self, data_list: list):
    """
    Remove illegal benchmark names from the extracted data.

    Examples of illegal values:
        annual growth in dividends received from the underlying companies
        The fund's composite benchmark is shown on page 11
        benchmark
        A range of published indices
        composite benchmark
        fund’s composite benchmark

    A "benchmark_name" value is dropped when it:
    1. starts with a letter that is not upper case,
    2. starts with "A range",
    3. starts with "The fund", or
    4. is empty or not a string (it cannot be checked or used as a name).

    An item whose benchmark name was dropped and which is left with no keys
    other than "fund_name" / "share_name" is removed from its "data" list.

    Args:
        data_list: list of dicts; each one is read through
            data_dict["extract_data"]["data"], a list of item dicts.
            Missing "extract_data" / "data" entries are treated as empty.

    Returns:
        The same data_list object, cleaned in place.
    """
    illegal_prefixes = ("A range", "The fund")
    identity_keys = ("fund_name", "share_name")

    for data_dict in data_list:
        extract_data = data_dict.get("extract_data", {})
        data = extract_data.get("data", [])
        kept_items = []
        for data_item in data:
            if "benchmark_name" in data_item:
                benchmark_name = data_item["benchmark_name"]
                if isinstance(benchmark_name, str) and benchmark_name:
                    first_char = benchmark_name[0]
                    is_illegal = benchmark_name.startswith(illegal_prefixes) or (
                        first_char.isalpha() and not first_char.isupper()
                    )
                else:
                    # None, "" and non-string values would crash the checks
                    # above and cannot be a usable benchmark name.
                    is_illegal = True

                if is_illegal:
                    data_item.pop("benchmark_name")
                    # Inspect the keys AFTER the pop: an item that now only
                    # identifies a fund/share carries no data and is dropped.
                    if all(key in identity_keys for key in data_item):
                        continue
            kept_items.append(data_item)

        if len(kept_items) != len(data):
            # Slice assignment keeps the original list object, so every other
            # reference to extract_data["data"] sees the cleaned content.
            data[:] = kept_items

    # The caller re-assigns data_list from this call, so it must be returned.
    return data_list
|
||||
|
||||
def align_fund_share_name(self, data_list: list):
|
||||
"""
|
||||
Align the fund name and share name to be the same format
|
||||
|
|
|
|||
|
|
@ -350,7 +350,7 @@
|
|||
],
|
||||
"total_annual_dollar_based_charges": [
|
||||
"Total annual dollar-based charges are share class level data.",
|
||||
"Its value corresponds to the administration fees and costs that are charged on a weekly basis.",
|
||||
"A. Its value corresponds to the administration fees and costs that are charged on a weekly basis.",
|
||||
"----Example 1 Start----",
|
||||
"MLC MasterKey Super & Pension Fundamentals\nType of fee or cost \nOngoing annual fees and costs 1 \nAmount \nHow and when paid \nOther administration costs paid from \nreserves of 0.00% pa of your account \nbalance. \nPlus \nA fixed fee of $1.50 per week \nThis fee is deducted monthly if your account balance is below $50,000 \nwhen the percentage administration fee is deducted. \nInvestment fees and \ncosts 2 \nInvestment fees and estimated costs \nfor MLC Horizon 4 Balanced Portfolio, \n1.20% pa. \nYou won ’ t see these fees and costs as direct charges to your account. \nThey're reflected in the daily unit price of each investment option and will \nreduce the net return on your investment \nInvestment fees and estimated costs \nfor other investment options, ranges \nfrom 0.00% pa to 2.84% pa \n(estimated). \nTransaction costs \nMLC Horizon 4 Balanced Portfolio, \n0.06% pa (estimated). \nOther investment options, ranges \nfrom 0.00% pa to 0.24% pa \n(estimated). \nYou won ’ t see these costs as direct charges to your account. They're \nreflected in the daily unit price of each investment option and will reduce \nthe net return on your investment. \nMember activity related fees and costs \nBuy-sell spread \nYou won ’ t see this fee as a direct charge to your account. It ’ s reflected in \nthe buy and sell unit price of each investment option when there ’ s a \ntransaction on your account. \nMLC Horizon 4 Balanced Portfolio, \n0.10%/0.10% \nOther investment options, ranges \nfrom 0.00%/0.00% to 0.30%/0.30% \nThe current buy-sell spreads of an investment option are available at \nmlc.com.au/buysellspreads \n",
|
||||
"----Example 1 End----",
|
||||
|
|
@ -358,18 +358,36 @@
|
|||
"In the context, also with management fees and costs, management fee, buy_spread and sell_spread for specific fund: MLC Horizon 4 Balanced Portfolio.",
|
||||
"Please output the relevant values based on specific fund name.",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
|
||||
"\n",
|
||||
"B. Please identify cases that do not belong to total_annual_dollar_based_charges, and output empty for them.",
|
||||
"----Example Start----",
|
||||
"Cost of product information \n\nCost of product for 1 year \n\nThe cost of product gives a summary calculation about \nhow ongoing annual fees and costs can affect your \nsuperannuation investment over a 1-year period for all \ninvestment options. It is calculated in the manner \nshown in the 'Example of annual fees and costs'. \n\nThe cost of product information assumes a balance of \n$50,000 at the beginning of the year. (Additional fees \nsuch as a buy/sell spread may apply – refer to the ‘Fees \nand costs summary’ table for the relevant investment \noption.) \n\nYou should use this figure to help compare \nsuperannuation products and investment options. \n\nInvestment option \nCash \nCost of product \nPerpetual Cash \n$60.00 \nFixed income and credit \nBentham Global Income \n$485.00 \n",
|
||||
"----Example End----",
|
||||
"Explanation:",
|
||||
"The values provided in the example are not total annual dollar-based charges; ",
|
||||
"they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ",
|
||||
"This figure includes ongoing annual fees and costs, but it may not encompass all possible charges, such as additional fees like buy/sell spreads. ",
|
||||
"Therefore, it serves as a comparative tool rather than a comprehensive total of all annual charges.",
|
||||
"The output should be empty:",
|
||||
"{\"data\": []}"
|
||||
],
|
||||
"buy_spread": [
|
||||
"A. Exclude reported name",
|
||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), ",
|
||||
"Estimated transaction costs offset by buy/sell spreads (% pa), ",
|
||||
"---Example Start---",
|
||||
"---Example 1 Start---",
|
||||
"Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
|
||||
"---Example End---",
|
||||
"---Example 1 End---",
|
||||
"The data should be excluded, the output should be:",
|
||||
"{\"data\": []}",
|
||||
"\n",
|
||||
"---Example 2 Start---",
|
||||
"Transaction costs \nRetirement and TTR income streams \n0.06% p.a. for Defensive Growth, 0.04% p.a. for International \nShares, 0.08% p.a. for Australian Shares, 0.19% p.a. for Property",
|
||||
"---Example 2 End---",
|
||||
"The data is about Transaction costs, should be excluded, the output for buy_spread and sell_spread should be:",
|
||||
"{\"data\": []}",
|
||||
"B. Simple case with simple table structure:",
|
||||
"---Example 1 Start---",
|
||||
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
|
||||
|
|
@ -445,7 +463,7 @@
|
|||
"benchmark_name": [
|
||||
"Benchmark is fund level data, usually as index fund name, e.g. S&P/ASX 300 A-REIT Total Return Index ",
|
||||
"Sometimes, there are multiple benchmark names with weightings in the context, please extract them all including weightings and benchmark names.",
|
||||
"Example for single benchmark name",
|
||||
"A. Examples for single benchmark name",
|
||||
"---Example 1 Start---",
|
||||
"MLC Property Securities Fund \nInvestment objective \nAims to outperform the Benchmark (after fees and before tax) over 5 year periods. \nBenchmark \nS&P/ASX 300 A-REIT Total Return Index \n",
|
||||
"---Example 1 End---",
|
||||
|
|
@ -461,8 +479,39 @@
|
|||
"---Example 3 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Horizon 5 Growth Portfolio\", \"benchmark_name\": \"Consumer Price Index\"}]}",
|
||||
"---Example 4 Start---",
|
||||
"Benchmark returns over 25 years by Traditional asset class \n\nPast performance is not a reliable indicator of future performance.\n\nMarket indices: Australian shares – S&P/ASX300 Accumulation Index, International shares – MSCI World Ex-Australia \nIndex (Unhedged)",
|
||||
"---Example 4 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Australian shares\", \"benchmark_name\": \"S&P/ASX300 Accumulation Index\"}, {\"fund name\": \"International shares\", \"benchmark_name\": \"MSCI World Ex-Australia Index (Unhedged)\"}]}",
|
||||
"---Example 5 Start---",
|
||||
"Dimensional Global Small Company Trust – Active ETF\nFund name \nregistered with ASIC \nDimensional Global Small Company Trust \n\nInvestment objective \nThe investment objective of the Fund is to provide long term capital growth by gaining\nexposure to a diversified portfolio of small companies associated with approved developed \nmarkets (excluding Australia). \nThe Fund is not managed with the objective of achieving a particular return relative to a \nbenchmark index. However, to compare the performance of the Fund with a broad measure \nof market performance, reference may be made to the MSCI World ex Australia Small Cap\nIndex (net div.). \nThe index is referred to for comparison purposes only. The index is not intended to \nrepresent the current or targeted asset allocation of the Fund. The performance of the Fund \nmay differ significantly from the index. \n",
|
||||
"---Example 5 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Dimensional Global Small Company Trust – Active ETF\", \"benchmark_name\": \"MSCI World ex Australia Small Cap Index (net div.)\"}]}",
|
||||
"---Example 6 Start---",
|
||||
"MLC Inflation \nPlus portfolios \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nMLC Inflation Plus - Conservative Portfolio \n\nAims to deliver a return of: \n\nSuper 1.7% pa above inflation (after fees and tax), \nPension (Pre-retirement phase) 1.7% pa above inflation (after fees and tax), or \nPension (Retirement phase) 2% pa above inflation (after fees and tax), \nsubject to limiting the risk of negative returns over 3 year periods. \n\nThis careful risk management approach means there may be times, such as when interest rates are \nunusually low, when the portfolio requires an extended time period to achieve its return objective. \n\nIn most circumstances the portfolio is expected to provide positive returns over 3 year periods, \nalthough there will sometimes be negative returns over shorter periods. \n\nInflation is measured by the Consumer Price Index, calculated by the Australian Bureau of Statistics. \n\n",
|
||||
"---Example 6 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Inflation Plus - Conservative Portfolio\", \"benchmark_name\": \"Consumer Price Index\"}]}",
|
||||
"---Example 7 Start---",
|
||||
"\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nPerpetual SHARE-PLUS Long-Short \n13.98% \n(maximum 15%) \nBenchmark S&P/ASX 300 Accumulation Index plus 2% pa \nHalf-yearly \n",
|
||||
"---Example 7 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Perpetual SHARE-PLUS Long-Short\", \"benchmark_name\": \"S&P/ASX 300 Accumulation Index plus 2% pa\"}]}",
|
||||
"---Example 8 Start---",
|
||||
"Australian Shares \n\nBT Australian Share Fund BT Imputation Fund \n\nBenchmark S&P/ASX 300 (TR) Index S&P/ASX 300 (TR) Index \n\nBT Geared Imputation Fund BT Smaller Companies Fund \n\nBenchmark S&P/ASX 300 (TR) Index \n\nS&P/ASX Small Ordinaries \nAccumulation Index \n\n",
|
||||
"---Example 8 End---",
|
||||
"Description:",
|
||||
"This is a complex example with multiple fund names, each of which has a single benchmark name.",
|
||||
"The fund name line is with 2 funds.",
|
||||
"e.g. \"\n\nBT Australian Share Fund BT Imputation Fund \n\n\", with 2 fund names: \"BT Australian Share Fund\" and \"BT Imputation Fund\".",
|
||||
"The benchmark name line is with 2 benchmark names.",
|
||||
"e.g. \"\n\nBenchmark S&P/ASX 300 (TR) Index S&P/ASX 300 (TR) Index \n\n\", with 2 benchmark names: \"S&P/ASX 300 (TR) Index\" and \"S&P/ASX 300 (TR) Index\".",
|
||||
"Therefore, the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"BT Australian Share Fund\", \"benchmark_name\": \"S&P/ASX 300 (TR) Index\"}, {\"fund name\": \"BT Imputation Fund\", \"benchmark_name\": \"S&P/ASX 300 (TR) Index\"}, {\"fund name\": \"BT Geared Imputation Fund\", \"benchmark_name\": \"S&P/ASX 300 (TR) Index\"}, {\"fund name\": \"BT Smaller Companies Fund\", \"benchmark_name\": \"S&P/ASX Small Ordinaries Accumulation Index\"}]}",
|
||||
"\n",
|
||||
"Example for multiple benchmark names",
|
||||
"B. Example for multiple benchmark names",
|
||||
"---Example 1 Start---",
|
||||
"Investment options other \nthan MLC portfolios \n\nFixed income \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nThe investment option may be \nsuited to you if... \n\nMinimum suggested time to \ninvest \n\nAsset allocation \n\nStandard Risk Measure \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nThe investment option may be \nsuited to you if... \n\nMinimum suggested time to \ninvest \n\nAsset allocation \n\nStandard Risk Measure \n\nMacquarie Income Opportunities Fund \n\nThe fund aims to outperform the Benchmark over the medium term (before fees). It aims to provide \nhigher income returns than traditional cash investments at all stages of interest rate and economic \ncycles. \n\nBloomberg AusBond Bank Bill Index \n\nThe fund predominantly provides exposure to a wide range of domestic and global investment grade \nfloating and fixed rate instruments, asset-backed securities, and cash. The fund may also have \nopportunistic exposure to other fixed income sectors and instruments such as, high yield and emerging \n\nmarkets debt as well as other fixed income instruments. Interest rate risk will generally be hedged \nthrough the use of derivatives such as swaps and futures. \n\nThe investment process aims to reduce the risk of the fund being adversely affected by unexpected \nevents or downgrades in the credit rating of the fund ’ s investments. A disciplined framework is used \nto analyse each sector and proposed investment to assess its risk. \n\nThe fund may be exposed to derivatives to implement its investment strategy. For example, protection \nmay be purchased on issuers that are believed to be over-valued or at risk of downgrade. These \npositions increase in value when the underlying instrument falls in value and decrease in value when \nthe underlying instrument rises in value. \n\nThe portfolio is generally hedged to Australian dollars. 
However, any exposure to emerging markets \ndebt issued in the local currency of the debt will generally be unhedged. Small active currency positions \nmay also be taken when the investment manager believes that there are opportunities to add value \nor hedge risks in the portfolio. \n\nyou want a medium term investment horizon, seeking a steady and reliable income stream. \n\n3 years \n\nAsset class \n\nInvestment grade credit* \n\nHigh yield \n\nEmerging markets debt** \n\nCash \n\n* Includes Australian and global investment grade credit. \n** May include holdings of sub-investment grade instruments. \n\nRanges \n\n0 – 100% \n\n0 – 25% \n\n0 – 25% \n\n0 – 100% \n\nMedium to high (estimate of 3 to 4 negative annual returns in any 20 year period) \n\nPIMCO Diversified Fixed Interest Fund - Wholesale Class \n\nTo achieve maximum total return by investing in underlying funds that invest in Australian and \nglobal bonds, and to seek to preserve capital through prudent investment management. \n\n50% Bloomberg Barclays Global Aggregate Index (Hedged in Australian dollars) and 50% Bloomberg \nAusBond Composite 0+ Yr Index \n\n",
|
||||
"---Example 1 End---",
|
||||
|
|
@ -472,7 +521,28 @@
|
|||
"Australian shares continued \n\nAusbil Australian Emerging Leaders Fund \nInvestment objective \nTo provide returns above the Benchmark over the medium to long term (before fees and tax). \nBenchmark \n70% S&P/ASX Midcap 50 Accumulation Index \n30% S&P/ASX Small Ordinaries Accumulation Index \nHow the investment option is \nmanaged \nThe fund predominantly invests in a portfolio of mid and small cap Australian equities primarily \nchosen from the S&P/ASX 300 Index, but generally excludes securities from the S&P/ASX 50 Index. \nAt all times the fund will favour sectors and specific companies which it believes will experience \npositive earnings revisions. \n",
|
||||
"---Example 2 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}"
|
||||
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
|
||||
"---Example 3 Start---",
|
||||
"\n\nFund name \nComposite benchmark \nCFS Select High \nGrowth \n1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% \nMSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% \nMSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small \nOrdinaries Index. ",
|
||||
"---Example 3 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"CFS Select High Growth\", \"benchmark_name\": \"1.0% Bloomberg AusBond Bank Bill Index, 31.0% MSCI All Country World ex Australia Net Index, 18.0% MSCI All Country World ex Australia Net Index Hedged AUD, 4.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 37.0% S&P/ASX 300 Accumulation Index, 4.0% S&P/ASX Small Ordinaries Index\"}]}",
|
||||
"---Example 4 Start---",
|
||||
"\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nAusbil Australian \nEmerging Leaders \n15.375% \n(maximum 15% ex-GST) \nComposite benchmark (70% S&P/ASX Midcap 50 \nAccumulation Index and 30% S&P/ASX Small Ordinaries \nAccumulation Index) plus 0.85% pa \nMonthly \n",
|
||||
"---Example 4 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
|
||||
"\n",
|
||||
"C. Example to exclude the benchmark name",
|
||||
"---Example 1 Start---",
|
||||
"A closer look at our sector investment options \n\nCash¹\nDiversified Fixed Interest\nReturn target CPI minus 0.5% per annum on average over 20 years.",
|
||||
"---Example 1 End---",
|
||||
"Explanation:",
|
||||
"The CPI minus 0.5% target is not suitable for Cash and Diversified Fixed Interest funds. ",
|
||||
"Because these funds have different objectives: Cash funds focus on capital preservation and liquidity, aligning with short-term interest rates, ",
|
||||
"while Diversified Fixed Interest funds aim to reflect bond market performance, influenced by interest rates and credit risk, not inflation.",
|
||||
"The output should be:",
|
||||
"{\"data\": []}"
|
||||
]
|
||||
},
|
||||
"sepcial_rule_by_keywords":
|
||||
|
|
|
|||
6
main.py
6
main.py
|
|
@ -1526,8 +1526,8 @@ if __name__ == "__main__":
|
|||
|
||||
# special_doc_id_list = ["553242411"]
|
||||
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
force_save_total_data = True
|
||||
doc_source = "aus_prospectus"
|
||||
# doc_source = "emea_ar"
|
||||
|
|
@ -1560,7 +1560,7 @@ if __name__ == "__main__":
|
|||
# "544886057",
|
||||
# "550769189",
|
||||
# "553449663"]
|
||||
special_doc_id_list = ["420339794", "401212184"]
|
||||
special_doc_id_list = ["521606755"]
|
||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -30,328 +30,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
|
||||
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n",
|
||||
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"All Providers Results: \n",
|
||||
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"Management Fee and Costs \t0.8790 \t0.9250 \t0.8213 \t0.9014 \t494 \t407 \t2 \t56 \t33 \n",
|
||||
"Management Fee \t0.8985 \t0.9265 \t0.8394 \t0.9123 \t494 \t416 \t2 \t47 \t33 \n",
|
||||
"Performance fee and cost \t0.7871 \t0.8472 \t0.7791 \t0.8161 \t327 \t244 \t144 \t66 \t44 \n",
|
||||
"Interposed vehicle Performance fee and Costs \t0.5000 \t1.0000 \t0.9237 \t0.6667 \t39 \t38 \t422 \t38 \t0 \n",
|
||||
"Administration Fee and costs \t0.9787 \t0.9388 \t0.9839 \t0.9583 \t98 \t92 \t398 \t2 \t6 \n",
|
||||
"Total Annual Dollar Based Charges \t0.8165 \t1.0000 \t0.9598 \t0.8990 \t90 \t89 \t389 \t20 \t0 \n",
|
||||
"Buy Spread \t0.8957 \t0.8910 \t0.8394 \t0.8933 \t405 \t335 \t83 \t39 \t41 \n",
|
||||
"Sell Spread \t0.9064 \t0.8921 \t0.8474 \t0.8992 \t405 \t339 \t83 \t35 \t41 \n",
|
||||
"Minimum Initial Investment \t0.8571 \t0.9671 \t0.8815 \t0.9088 \t310 \t294 \t145 \t49 \t10 \n",
|
||||
"Benchmark \t0.6402 \t0.8582 \t0.8233 \t0.7333 \t173 \t121 \t289 \t68 \t20 \n",
|
||||
"TOTAL \t0.8159 \t0.9246 \t0.8699 \t0.8588 \t2835 \t2375 \t1957 \t420 \t228 \n",
|
||||
"Total Funds Matched - 498\n",
|
||||
"Total Funds Not Matched - 28\n",
|
||||
"Percentage of Funds Matched - 94.67680608365019\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import openpyxl\n",
|
||||
"from collections import defaultdict\n",
|
||||
"import pandas as pd\n",
|
||||
"import statistics\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"funds_matched = 0\n",
|
||||
"funds_not_matched = 0\n",
|
||||
"def load_excel(filepath, header_row_index):\n",
|
||||
" \"\"\"Load an Excel file and use the specified row as the header.\"\"\"\n",
|
||||
" wb = openpyxl.load_workbook(filepath, data_only=True)\n",
|
||||
" sheet = wb.active\n",
|
||||
" headers = []\n",
|
||||
" data = []\n",
|
||||
"\n",
|
||||
" for index, row in enumerate(sheet.iter_rows(values_only=True)):\n",
|
||||
" if index == header_row_index:\n",
|
||||
" headers = [cell if cell is not None else \"\" for cell in row]\n",
|
||||
" elif index > header_row_index:\n",
|
||||
" data.append([cell if cell is not None else \"\" for cell in row])\n",
|
||||
"\n",
|
||||
" return headers, data\n",
|
||||
"\n",
|
||||
"def index_data_by_key(data, key_index, secondary_key_index, header):\n",
|
||||
" \"\"\"Index data by primary and secondary keys (doc_id and fund_name).\"\"\"\n",
|
||||
" indexed_data = defaultdict(dict)\n",
|
||||
" \n",
|
||||
" for row in data:\n",
|
||||
" row_data = {}\n",
|
||||
" # Store the entire row, which will be useful for full row comparison\n",
|
||||
" for i in range(len(row)):\n",
|
||||
" if header[i] == \"doc_id\":\n",
|
||||
" primary_key = int(row[i])\n",
|
||||
" elif header[i] == \"fund_name\":\n",
|
||||
" secondary_key = str(row[i])\n",
|
||||
" else:\n",
|
||||
" row_data[header[i]] = convert_if_number(row[i])\n",
|
||||
" indexed_data[primary_key][secondary_key] = row_data\n",
|
||||
" return indexed_data\n",
|
||||
"\n",
|
||||
"def convert_if_number(value):\n",
|
||||
" \"\"\"Attempt to convert value to a float or int, otherwise return as string.\"\"\"\n",
|
||||
" try:\n",
|
||||
" float_value = round(float(value), 2)\n",
|
||||
" int_value = int(float_value)\n",
|
||||
" return int_value if int_value == float_value else float_value\n",
|
||||
" except (ValueError, TypeError):\n",
|
||||
" return value\n",
|
||||
"\n",
|
||||
"def compare_values(value1, value2):\n",
|
||||
" \"\"\"Convert values to numbers if possible and compare, otherwise compare as strings.\"\"\"\n",
|
||||
" value1 = convert_if_number(value1)\n",
|
||||
" value2 = convert_if_number(value2)\n",
|
||||
" return value1 == value2\n",
|
||||
"\n",
|
||||
"def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n",
|
||||
" \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n",
|
||||
" results = {}\n",
|
||||
" funds_matched, funds_not_matched = 0, 0\n",
|
||||
" # Initialize result dictionaries for each column except 'doc_id'\n",
|
||||
" for keys in headers:\n",
|
||||
" if keys != \"doc_id\":\n",
|
||||
" results[keys] = {}\n",
|
||||
" results[keys][\"TP\"] = 0\n",
|
||||
" results[keys][\"TN\"] = 0\n",
|
||||
" results[keys][\"FP\"] = 0\n",
|
||||
" results[keys][\"FN\"] = 0\n",
|
||||
" results[keys][\"SUPPORT\"] = 0\n",
|
||||
"\n",
|
||||
" # Iterate over the ground truth and look up each fund in the generated results\n",
|
||||
" for doc_id, funds in ground_truth.items():\n",
|
||||
" if doc_id in generated_results:\n",
|
||||
" for fund_name, truth_values in funds.items():\n",
|
||||
" if fund_name in generated_results[doc_id]:\n",
|
||||
" generated_values = generated_results[doc_id][fund_name]\n",
|
||||
" # Compare all other columns\n",
|
||||
" for i in intersection_list:\n",
|
||||
" for keys in imp_datapoints:\n",
|
||||
" if i == imp_datapoints_mapping[keys]:\n",
|
||||
" if truth_values[i] == \"\":\n",
|
||||
" if truth_values[i] == generated_values[i]:\n",
|
||||
" results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
|
||||
" else:\n",
|
||||
" results[i][\"FP\"] = results[i][\"FP\"] + 1 \n",
|
||||
" else:\n",
|
||||
" if truth_values[i] == generated_values[i]:\n",
|
||||
" results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
|
||||
" elif generated_values[i] != \"\":\n",
|
||||
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
|
||||
" else:\n",
|
||||
" results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
|
||||
" results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" # if truth_values[i] == generated_values[i] and truth_values[i] == \"\":\n",
|
||||
" # results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
|
||||
" # elif truth_values[i] == generated_values[i]:\n",
|
||||
" # results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
|
||||
" # elif truth_values[i] != \"\" and generated_values[i] == \"\":\n",
|
||||
" # results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
|
||||
" # elif truth_values[i] == \"\" and generated_values[i] != \"\":\n",
|
||||
" # results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
|
||||
" # else:\n",
|
||||
" # results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
|
||||
" # if truth_values[i] != \"\":\n",
|
||||
" # results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
|
||||
" funds_matched += 1\n",
|
||||
" else:\n",
|
||||
" funds_not_matched += 1\n",
|
||||
" # for keys in headers:\n",
|
||||
" # if keys != \"doc_id\":\n",
|
||||
" # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
|
||||
" else:\n",
|
||||
" # If the entire document is not found, count all funds as not matched\n",
|
||||
" funds_not_matched += len(funds)\n",
|
||||
" # for fund_name in funds:\n",
|
||||
" # for keys in headers:\n",
|
||||
" # if keys != \"doc_id\":\n",
|
||||
" # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n",
|
||||
"\n",
|
||||
" return results, funds_matched, funds_not_matched\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Load the files\n",
|
||||
"headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n",
|
||||
"headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n",
|
||||
"\n",
|
||||
"# Assuming doc_id is the first column and fund_name is the second column\n",
|
||||
"doc_id_index = 0\n",
|
||||
"fund_name_index = 1\n",
|
||||
"\n",
|
||||
"# Index the data\n",
|
||||
"ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n",
|
||||
"generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n",
|
||||
"\n",
|
||||
"intersection = set(headers_gen).intersection(headers_gt)\n",
|
||||
"\n",
|
||||
"# Convert the result back to a list (if you need it as a list)\n",
|
||||
"intersection_list = list(intersection)\n",
|
||||
"\n",
|
||||
"total_fn = []\n",
|
||||
"def calculate_metrics(tp, tn, fp, fn):\n",
|
||||
" \"\"\"Calculate precision, recall, accuracy, and F1-score.\"\"\"\n",
|
||||
" precision = tp / (tp + fp) if (tp + fp) != 0 else 0\n",
|
||||
" recall = tp / (tp + fn) if (tp + fn) != 0 else 0\n",
|
||||
" accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0\n",
|
||||
" f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0\n",
|
||||
" return precision, recall, accuracy, f1_score\n",
|
||||
"\n",
|
||||
"def print_metrics_table(data):\n",
|
||||
" # Print table headers\n",
|
||||
" print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
|
||||
" total_precision, total_recall, total_accuracy, total_f1_score, total_support= [],[],[],[],[]\n",
|
||||
" \n",
|
||||
" total_tp = []\n",
|
||||
" total_tn = []\n",
|
||||
" total_fp = []\n",
|
||||
" #total_fn = []\n",
|
||||
" # Calculate and print metrics for each item\n",
|
||||
" for keys in imp_datapoints:\n",
|
||||
" try:\n",
|
||||
" key = imp_datapoints_mapping[keys]\n",
|
||||
" values = data[key]\n",
|
||||
" tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
|
||||
" precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
|
||||
" total_precision.append(precision)\n",
|
||||
" total_recall.append(recall)\n",
|
||||
" total_accuracy.append(accuracy)\n",
|
||||
" total_f1_score.append(f1_score)\n",
|
||||
" total_support.append(values[\"SUPPORT\"])\n",
|
||||
" total_tp.append(tp)\n",
|
||||
" total_tn.append(tn)\n",
|
||||
" total_fp.append(fp)\n",
|
||||
" total_fn.append(fn)\n",
|
||||
"\n",
|
||||
" if values[\"SUPPORT\"] > 0 and key > \"\":\n",
|
||||
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(keys, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", statistics.mean(total_precision), statistics.mean(total_recall), statistics.mean(total_accuracy), statistics.mean(total_f1_score), sum(total_support), sum(total_tp), sum(total_tn), sum(total_fp), sum(total_fn)))\n",
|
||||
" \n",
|
||||
"def create_metrics_df(data):\n",
|
||||
" # Define a list to hold data for DataFrame\n",
|
||||
" rows = []\n",
|
||||
" \n",
|
||||
" # Iterate through each metric item\n",
|
||||
" for key in imp_datapoints:\n",
|
||||
" try:\n",
|
||||
" mapped_key = imp_datapoints_mapping[key]\n",
|
||||
" values = data[mapped_key]\n",
|
||||
" tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
|
||||
" precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
|
||||
" \n",
|
||||
" # Only add rows where SUPPORT > 0\n",
|
||||
" if values[\"SUPPORT\"] > 0:\n",
|
||||
" row = {\n",
|
||||
" \"Metric\": key,\n",
|
||||
" \"Precision\": precision,\n",
|
||||
" \"Recall\": recall,\n",
|
||||
" \"Accuracy\": accuracy,\n",
|
||||
" \"F1-Score\": f1_score,\n",
|
||||
" \"SUPPORT\": values[\"SUPPORT\"]\n",
|
||||
" }\n",
|
||||
" rows.append(row)\n",
|
||||
" except KeyError as e:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # Create a DataFrame from the list of rows\n",
|
||||
" df_metrics = pd.DataFrame(rows)\n",
|
||||
" df_metrics.reset_index(inplace=True)\n",
|
||||
" df_metrics.drop(columns=[\"index\"], inplace=True)\n",
|
||||
" print(df_metrics)\n",
|
||||
" return df_metrics\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_provider_mapping(file_path):\n",
|
||||
" df = pd.read_excel(file_path)\n",
|
||||
" df = (df.groupby([\"Docid\", \"ProviderName\"]).first())\n",
|
||||
" df.reset_index(inplace = True)\n",
|
||||
" return df[[\"Docid\", \"ProviderName\"]]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_provider_names(generated_results_indexed, df_provider_mapping):\n",
|
||||
" providers_dict = {}\n",
|
||||
" for doc_id in generated_results_indexed:\n",
|
||||
" try:\n",
|
||||
" provider_name = (df_provider_mapping[df_provider_mapping[\"Docid\"] == doc_id][\"ProviderName\"].values)[0]\n",
|
||||
" if provider_name in providers_dict:\n",
|
||||
" providers_dict[provider_name].append(doc_id)\n",
|
||||
" else:\n",
|
||||
" providers_dict[provider_name] = []\n",
|
||||
" providers_dict[provider_name].append(doc_id)\n",
|
||||
"\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" return providers_dict\n",
|
||||
"\n",
|
||||
"def get_specified_doc_data(results, doc_list):\n",
|
||||
" provider_res = {}\n",
|
||||
" for doc_id in doc_list:\n",
|
||||
" if doc_id in results:\n",
|
||||
" provider_res[doc_id] = results[doc_id]\n",
|
||||
" return provider_res\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df_provider_mapping = get_provider_mapping(provider_mapping_file_path)\n",
|
||||
"\n",
|
||||
"all_provider_dict = get_provider_names(generated_results_indexed, df_provider_mapping)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# for provider_name in all_provider_dict:\n",
|
||||
"# provider_vise_generated_results = get_specified_doc_data(generated_results_indexed, all_provider_dict[provider_name])\n",
|
||||
"# comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, provider_vise_generated_results, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
|
||||
"# print(\"\\n\")\n",
|
||||
"# print(\"\\n\")\n",
|
||||
"# print(\"Provider Name - \" + provider_name + \"\\t Number of Docs - \" + str(len(all_provider_dict[provider_name])))\n",
|
||||
"# #create_metrics_df(comparison_results)\n",
|
||||
"# print_metrics_table(comparison_results)\n",
|
||||
"# print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
|
||||
"# print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\"\\n\")\n",
|
||||
"print(\"\\n\")\n",
|
||||
"print(\"All Providers Results: \")\n",
|
||||
"comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n",
|
||||
"\n",
|
||||
"print_metrics_table(comparison_results)\n",
|
||||
"print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n",
|
||||
"print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -364,54 +43,54 @@
|
|||
"\n",
|
||||
"All Providers Results: \n",
|
||||
"Document List File - None\n",
|
||||
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.8907 \t0.9513 \t0.8525 \t0.9200 \t457 \t391 \t2 \t48 \t20 \n",
|
||||
"management_fee \t0.9043 \t0.9520 \t0.8655 \t0.9276 \t457 \t397 \t2 \t42 \t20 \n",
|
||||
"performance_fee_costs \t0.8408 \t0.8556 \t0.8113 \t0.8482 \t303 \t243 \t131 \t46 \t41 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.6316 \t1.0000 \t0.9393 \t0.7742 \t49 \t48 \t385 \t28 \t0 \n",
|
||||
"administration_fees \t0.9767 \t0.9655 \t0.9892 \t0.9711 \t87 \t84 \t372 \t2 \t3 \n",
|
||||
"total_annual_dollar_based_charges \t0.8350 \t1.0000 \t0.9631 \t0.9101 \t87 \t86 \t358 \t17 \t0 \n",
|
||||
"buy_spread \t0.9059 \t0.9258 \t0.8655 \t0.9158 \t391 \t337 \t62 \t35 \t27 \n",
|
||||
"sell_spread \t0.9113 \t0.9262 \t0.8698 \t0.9187 \t391 \t339 \t62 \t33 \t27 \n",
|
||||
"minimum_initial_investment \t0.9463 \t0.9814 \t0.9479 \t0.9635 \t329 \t317 \t120 \t18 \t6 \n",
|
||||
"benchmark_name \t0.7444 \t0.8701 \t0.8568 \t0.8024 \t172 \t134 \t261 \t46 \t20 \n",
|
||||
"TOTAL \t0.8587 \t0.9428 \t0.8961 \t0.8951 \t2723 \t2376 \t1755 \t315 \t164 \n",
|
||||
"Total Funds Matched - 461\n",
|
||||
"Total Funds Not Matched - 125\n",
|
||||
"Percentage of Funds Matched - 78.66894197952219\n",
|
||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.9463 \t0.9087 \t0.9873 \t0.8986 \t431 \t388 \t2 \t39 \t5 \n",
|
||||
"management_fee \t0.9502 \t0.9157 \t0.9874 \t0.9055 \t431 \t391 \t2 \t36 \t5 \n",
|
||||
"performance_fee_costs \t0.8614 \t0.8473 \t0.8759 \t0.8272 \t281 \t233 \t126 \t42 \t33 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9726 \t0.9467 \t1.0000 \t0.9908 \t72 \t71 \t359 \t4 \t0 \n",
|
||||
"administration_fees \t0.9935 \t0.9872 \t1.0000 \t0.9977 \t77 \t77 \t356 \t1 \t0 \n",
|
||||
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t72 \t72 \t362 \t0 \t0 \n",
|
||||
"buy_spread \t0.9322 \t0.9066 \t0.9593 \t0.8894 \t370 \t330 \t56 \t34 \t14 \n",
|
||||
"sell_spread \t0.9352 \t0.9121 \t0.9595 \t0.8940 \t370 \t332 \t56 \t32 \t14 \n",
|
||||
"minimum_initial_investment \t0.9577 \t0.9474 \t0.9684 \t0.9378 \t322 \t306 \t101 \t17 \t10 \n",
|
||||
"benchmark_name \t0.8067 \t0.7562 \t0.8643 \t0.8664 \t154 \t121 \t255 \t39 \t19 \n",
|
||||
"TOTAL \t0.9356 \t0.9128 \t0.9602 \t0.9207 \t2580 \t2321 \t1675 \t244 \t100 \n",
|
||||
"Total Funds Matched - 434\n",
|
||||
"Total Funds Not Matched - 131\n",
|
||||
"Percentage of Funds Matched - 76.8141592920354\n",
|
||||
"All Providers Results: \n",
|
||||
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
|
||||
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.8960 \t0.9451 \t0.8516 \t0.9199 \t180 \t155 \t0 \t18 \t9 \n",
|
||||
"management_fee \t0.9017 \t0.9455 \t0.8571 \t0.9231 \t180 \t156 \t0 \t17 \t9 \n",
|
||||
"performance_fee_costs \t0.8000 \t0.8261 \t0.8077 \t0.8128 \t94 \t76 \t71 \t19 \t16 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.5273 \t1.0000 \t0.8571 \t0.6905 \t30 \t29 \t127 \t26 \t0 \n",
|
||||
"administration_fees \t1.0000 \t0.3333 \t0.9890 \t0.5000 \t3 \t1 \t179 \t0 \t2 \n",
|
||||
"buy_spread \t0.9643 \t0.9419 \t0.9121 \t0.9529 \t176 \t162 \t4 \t6 \t10 \n",
|
||||
"sell_spread \t0.9702 \t0.9422 \t0.9176 \t0.9560 \t176 \t163 \t4 \t5 \t10 \n",
|
||||
"minimum_initial_investment \t0.9137 \t0.9549 \t0.9011 \t0.9338 \t139 \t127 \t37 \t12 \t6 \n",
|
||||
"benchmark_name \t0.7188 \t0.8734 \t0.7967 \t0.7886 \t91 \t69 \t76 \t27 \t10 \n",
|
||||
"TOTAL \t0.7692 \t0.7762 \t0.8885 \t0.7478 \t1069 \t938 \t679 \t131 \t236 \n",
|
||||
"Total Funds Matched - 182\n",
|
||||
"Total Funds Not Matched - 24\n",
|
||||
"Percentage of Funds Matched - 88.3495145631068\n",
|
||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.9499 \t0.9096 \t0.9938 \t0.9045 \t177 \t161 \t0 \t16 \t1 \n",
|
||||
"management_fee \t0.9529 \t0.9153 \t0.9939 \t0.9101 \t177 \t162 \t0 \t15 \t1 \n",
|
||||
"performance_fee_costs \t0.8197 \t0.7979 \t0.8427 \t0.8146 \t91 \t75 \t70 \t19 \t14 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9811 \t0.9630 \t1.0000 \t0.9888 \t53 \t52 \t124 \t2 \t0 \n",
|
||||
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t170 \t0 \t0 \n",
|
||||
"buy_spread \t0.9738 \t0.9653 \t0.9824 \t0.9494 \t174 \t167 \t2 \t6 \t3 \n",
|
||||
"sell_spread \t0.9767 \t0.9711 \t0.9825 \t0.9551 \t174 \t168 \t2 \t5 \t3 \n",
|
||||
"minimum_initial_investment \t0.9185 \t0.9118 \t0.9254 \t0.8764 \t140 \t124 \t32 \t12 \t10 \n",
|
||||
"benchmark_name \t0.8121 \t0.7528 \t0.8816 \t0.8258 \t86 \t67 \t80 \t22 \t9 \n",
|
||||
"TOTAL \t0.9316 \t0.9096 \t0.9558 \t0.9139 \t1080 \t984 \t480 \t97 \t141 \n",
|
||||
"Total Funds Matched - 178\n",
|
||||
"Total Funds Not Matched - 18\n",
|
||||
"Percentage of Funds Matched - 90.81632653061224\n",
|
||||
"All Providers Results: \n",
|
||||
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
|
||||
"Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.8872 \t0.9555 \t0.8530 \t0.9201 \t277 \t236 \t2 \t30 \t11 \n",
|
||||
"management_fee \t0.9060 \t0.9563 \t0.8710 \t0.9305 \t277 \t241 \t2 \t25 \t11 \n",
|
||||
"performance_fee_costs \t0.8608 \t0.8698 \t0.8136 \t0.8653 \t209 \t167 \t60 \t27 \t25 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9048 \t1.0000 \t0.9928 \t0.9500 \t19 \t19 \t258 \t2 \t0 \n",
|
||||
"administration_fees \t0.9765 \t0.9881 \t0.9892 \t0.9822 \t84 \t83 \t193 \t2 \t1 \n",
|
||||
"total_annual_dollar_based_charges \t0.8431 \t1.0000 \t0.9427 \t0.9149 \t87 \t86 \t177 \t16 \t0 \n",
|
||||
"buy_spread \t0.8578 \t0.9115 \t0.8351 \t0.8838 \t215 \t175 \t58 \t29 \t17 \n",
|
||||
"sell_spread \t0.8627 \t0.9119 \t0.8387 \t0.8866 \t215 \t176 \t58 \t28 \t17 \n",
|
||||
"minimum_initial_investment \t0.9694 \t1.0000 \t0.9785 \t0.9845 \t190 \t190 \t83 \t6 \t0 \n",
|
||||
"benchmark_name \t0.7738 \t0.8667 \t0.8961 \t0.8176 \t81 \t65 \t185 \t19 \t10 \n",
|
||||
"TOTAL \t0.8842 \t0.9460 \t0.9011 \t0.9136 \t1654 \t1438 \t1076 \t184 \t328 \n",
|
||||
"Total Funds Matched - 279\n",
|
||||
"Total Funds Not Matched - 101\n",
|
||||
"Percentage of Funds Matched - 73.42105263157895\n"
|
||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.9439 \t0.9080 \t0.9827 \t0.8945 \t254 \t227 \t2 \t23 \t4 \n",
|
||||
"management_fee \t0.9482 \t0.9160 \t0.9828 \t0.9023 \t254 \t229 \t2 \t21 \t4 \n",
|
||||
"performance_fee_costs \t0.8827 \t0.8729 \t0.8927 \t0.8359 \t190 \t158 \t56 \t23 \t19 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9500 \t0.9048 \t1.0000 \t0.9922 \t19 \t19 \t235 \t2 \t0 \n",
|
||||
"administration_fees \t0.9928 \t0.9857 \t1.0000 \t0.9961 \t69 \t69 \t186 \t1 \t0 \n",
|
||||
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t72 \t72 \t184 \t0 \t0 \n",
|
||||
"buy_spread \t0.8932 \t0.8534 \t0.9368 \t0.8477 \t196 \t163 \t54 \t28 \t11 \n",
|
||||
"sell_spread \t0.8962 \t0.8586 \t0.9371 \t0.8516 \t196 \t164 \t54 \t27 \t11 \n",
|
||||
"minimum_initial_investment \t0.9864 \t0.9733 \t1.0000 \t0.9805 \t182 \t182 \t69 \t5 \t0 \n",
|
||||
"benchmark_name \t0.8000 \t0.7606 \t0.8438 \t0.8945 \t68 \t54 \t175 \t17 \t10 \n",
|
||||
"TOTAL \t0.9293 \t0.9033 \t0.9576 \t0.9195 \t1500 \t1337 \t1017 \t147 \t200 \n",
|
||||
"Total Funds Matched - 256\n",
|
||||
"Total Funds Not Matched - 113\n",
|
||||
"Percentage of Funds Matched - 69.37669376693766\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -424,6 +103,20 @@
|
|||
"import re\n",
|
||||
"from utils.similarity import Similarity\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"Blade's updates\n",
|
||||
"1. Set the secondary key to be the share class name, instead of the fund name\n",
|
||||
"2. Remove the data point which support is 0 to calculate the metrics\n",
|
||||
"3. Add the message list to store the error message\n",
|
||||
"4. Support save metrics/ error message to excel file\n",
|
||||
"5. Support statistics for different document list\n",
|
||||
"6. Set F1-Score to the first column in the metrics table\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
|
||||
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313153441.xlsx\"\n",
|
||||
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
|
||||
"\n",
|
||||
"funds_matched = 0\n",
|
||||
"funds_not_matched = 0\n",
|
||||
"def load_excel(filepath, header_row_index):\n",
|
||||
|
|
@ -456,6 +149,8 @@
|
|||
" secondary_key = str(row[i])\n",
|
||||
" else:\n",
|
||||
" row_data[header[i]] = convert_if_number(row[i])\n",
|
||||
" if secondary_key is None or (isinstance(secondary_key, str) and len(secondary_key) == 0):\n",
|
||||
" continue\n",
|
||||
" indexed_data[primary_key][secondary_key] = row_data\n",
|
||||
" return indexed_data\n",
|
||||
"\n",
|
||||
|
|
@ -588,7 +283,7 @@
|
|||
"\n",
|
||||
"def print_metrics_table(data):\n",
|
||||
" # Print table headers\n",
|
||||
" print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Precision\", \"Recall\", \"Accuracy\", \"F1-Score\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
|
||||
" print(\"{:<50}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"F1-Score\", \"Precision\", \"Recall\", \"Accuracy\", \"SUPPORT\", \"TP\", \"TN\", \"FP\", \"FN\"))\n",
|
||||
" total_precision, total_recall, total_accuracy, total_f1_score, total_support= [],[],[],[],[]\n",
|
||||
" \n",
|
||||
" total_tp = []\n",
|
||||
|
|
@ -601,9 +296,11 @@
|
|||
" try:\n",
|
||||
" key = imp_datapoints_mapping[keys]\n",
|
||||
" values = data[key]\n",
|
||||
" if values[\"SUPPORT\"] == 0:\n",
|
||||
" continue\n",
|
||||
" tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n",
|
||||
" precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n",
|
||||
" metrics = {\"Datapoint\": keys, \"F1-Score\": f1_score, \"Precision\": precision, \"Recall\": recall, \"Accuracy\": accuracy, \"SUPPORT\": values[\"SUPPORT\"], \"TP\": tp, \"TN\": tn, \"FP\": fp, \"FN\": fn}\n",
|
||||
" metrics = {\"Datapoint\": key, \"F1-Score\": f1_score, \"Precision\": precision, \"Recall\": recall, \"Accuracy\": accuracy, \"SUPPORT\": values[\"SUPPORT\"], \"TP\": tp, \"TN\": tn, \"FP\": fp, \"FN\": fn}\n",
|
||||
" metrics_list.append(metrics)\n",
|
||||
" total_precision.append(precision)\n",
|
||||
" total_recall.append(recall)\n",
|
||||
|
|
@ -616,7 +313,7 @@
|
|||
" total_fn.append(fn)\n",
|
||||
"\n",
|
||||
" if values[\"SUPPORT\"] > 0 and key > \"\":\n",
|
||||
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(key, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
|
||||
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(key, f1_score, precision, recall, accuracy, values[\"SUPPORT\"], tp, tn, fp, fn))\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" total_mean_precision = statistics.mean(total_precision)\n",
|
||||
|
|
@ -630,7 +327,7 @@
|
|||
" total_sum_fn = sum(total_fn)\n",
|
||||
" total_metrics = {\"Datapoint\": \"TOTAL\", \"F1-Score\": total_mean_f1_score, \"Precision\": total_mean_precision, \"Recall\": total_mean_recall, \"Accuracy\": total_mean_accuracy, \"SUPPORT\": total_sum_support, \"TP\": total_sum_tp, \"TN\": total_sum_tn, \"FP\": total_sum_fp, \"FN\": total_sum_fn}\n",
|
||||
" metrics_list.append(total_metrics)\n",
|
||||
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", total_mean_precision, total_mean_recall, total_mean_accuracy, total_mean_f1_score, total_sum_support, total_sum_tp, total_sum_tn, total_sum_fp, total_sum_fn))\n",
|
||||
" print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", total_mean_f1_score, total_mean_precision, total_mean_recall, total_mean_accuracy, total_sum_support, total_sum_tp, total_sum_tn, total_sum_fp, total_sum_fn))\n",
|
||||
" return metrics_list\n",
|
||||
" \n",
|
||||
"def create_metrics_df(data):\n",
|
||||
|
|
@ -746,6 +443,7 @@
|
|||
" message_df = pd.DataFrame(message_list)\n",
|
||||
"\n",
|
||||
" output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n",
|
||||
" os.makedirs(output_metrics_folder, exist_ok=True)\n",
|
||||
" if os.path.exists(output_metrics_folder):\n",
|
||||
" generated_file_base_name = os.path.basename(path_generated_results).replace(\".xlsx\", \"\")\n",
|
||||
" metrics_file_name = f\"metrics_{generated_file_base_name}\"\n",
|
||||
|
|
|
|||
Loading…
Reference in New Issue