diff --git a/calc_metrics.py b/calc_metrics.py index 3d521f8..b9a4038 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -567,11 +567,11 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros verify_data_df = pd.DataFrame() audit_fields = [ - "DocumentId", - "FundLegalName", - "FundId", - "FundClassLegalName", - "FundClassId", + "doc_id", + "fund_name", + "fund_id", + "sec_name", + "sec_id", "management_fee_and_costs", "management_fee", "administration_fees", @@ -590,11 +590,11 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros audit_data_df = pd.read_excel(audit_file_path, sheet_name=audit_data_sheet) audit_data_df = audit_data_df[audit_fields] audit_data_df = audit_data_df.drop_duplicates() - audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id", - "FundLegalName": "fund_name", - "FundId": "fund_id", - "FundClassLegalName": "sec_name", - "FundClassId": "sec_id"}) + # audit_data_df = audit_data_df.rename(columns={"DocumentId": "doc_id", + # "FundLegalName": "fund_name", + # "FundId": "fund_id", + # "FundClassLegalName": "sec_name", + # "FundClassId": "sec_id"}) audit_data_df.fillna("", inplace=True) audit_data_df.reset_index(drop=True, inplace=True) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index bd149b6..2c25dea 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -403,14 +403,15 @@ "---Example 1 End---", "The relevant values: 0.00 and 2.18, are in the range, so the output should be:", "{\"data\": []}", - "B If with pure performance fee in table, please extract relevant values", + "B. If with pure performance fee in table, please extract relevant values", "---Example Start---", "\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n", "---Example End---", "a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.", "b. This example mentioned share classes, please output according to share class.", "The output should be", - "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}" + "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", + "C. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0" ], "minimum_initial_investment": [ @@ -518,6 +519,7 @@ "For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ", "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", + "Identify the value of the column \"Estimated Performance fees\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0", "Therefore, the output should be:", "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}" ] @@ -597,6 +599,7 @@ "---Example Start---", "Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross) \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.04 \n0.10 / 0.10 \n0.09 \n", "---Example End---", + "Identify the value of the 1st column \"Performance fee\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0", "Please ignore the 3rd column: \"Equals investment fees and costs\" values!!", "Please read context carefully, don't miss any data row!!", "The output should be:", diff --git a/main.py b/main.py index c927a60..9f6e92d 100644 --- a/main.py +++ b/main.py @@ -1560,7 +1560,7 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - # special_doc_id_list = ["446324179"] + special_doc_id_list = ["420339794", "401212184"] # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" diff --git a/performance.ipynb b/performance.ipynb index 2264c54..becd91b 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -30,16 +30,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "\n", - "path_ground_truth = r\"C:\\data\\aus_prospectus\\output\\Performance\\46_documents_ground_truth_with_mapping.xlsx\"\n", - "path_generated_results = r\"C:\\data\\aus_prospectus\\output\\Performance\\mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n", - "provider_mapping_file_path = r\"C:\\Users\\rmahesh\\OneDrive - MORNINGSTAR INC\\Desktop\\NLP Transitions\\Project\\Exprs\\INO71\\dc-ml-dataextraction-llm-aus-nz-pro-AUS_NZ_EXE_COMBINED_PHASE1_PHASE2\\output_files\\ground_truth\\TopProvidersBiz.xlsx\"\n", - "\n", - "\n" + "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313024715.xlsx\"\n", + "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n" ] }, { @@ -353,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -365,132 +363,55 @@ "\n", "\n", "All Providers Results: \n", - "Performance fee and cost - 377377369 truth is null and generated - 0 SPDR® S&P Emerging Markets Carbon Control Fund\n", - "Performance fee and cost - 397107472 truth is null and generated - 0 AMP Capital Specialist Diversified Fixed Income Fund\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.11 OnePath OneAnswer Frontier Investment Portfolio-OnePath Multi Asset Income Trust\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.07 OA Frontier IP-OnePath Australian Share Trust\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.33 OA Frontier Investment Portfolio- BlackRock Tactical Growth\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.02 OA Frontier Investment Portfolio- Pendal Monthly Income Plus\n", - "Performance fee and cost - 401212184 truth - 0.41 and generated - 0.13 OnePath Alternatives Growth Trust\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.03 OA Frontier IP-Ausbil Australian Emerging Leaders Trust\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.15 OA Frontier IP-Perpetual Balanced Growth\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.03 OA Frontier IP-Perpetual Conservative Growth\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.06 OA Frontier IP-Platinum International\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.15 OnePath OneAnswer Investment Portfolio - BlackRock Diversified ESG Growth\n", - "Performance fee and cost - 401212184 truth - 0 and generated - 0.01 ANZ OneAnswer Investment Portfolio - OnePath Balanced Index\n", - "Performance fee and cost - 401212184 generated is null and truth is - 0 ANZ OneAnswer Investment Portfolio - OnePath Growth Index\n", - "Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard Index Diversified Bond\n", - "Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard International Shares Index\n", - "Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard Investor Short Term Fixed Interest Fund\n", - "Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard Index Hedged International Shares Fund\n", - "Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard LifeStrategy Growth\n", - "Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard LifeStrategy Conservative\n", - "Performance fee and cost - 409723592 truth is null and generated - 0 Vanguard LifeStrategy High Growth\n", - "Performance fee and cost - 411062815 truth is null and generated - 13.98 Perpetual WFP-Perpetual Share Plus L/S\n", - "Performance fee and cost - 411062815 truth - 0 and generated - 0.01 WFP Schroder Fixed Income\n", - "Performance fee and cost - 411062815 truth - 0 and generated - 15.38 Perpetual Ausbil Australian Emerg Ldrs\n", - "Performance fee and cost - 411062815 truth - 0.03 and generated - 0.12 WFP Macquarie Income Opportunities\n", - "Performance fee and cost - 411062815 generated is null and truth is - 0 WFP Diversified Income\n", - "Performance fee and cost - 412778803 generated is null and truth is - 0.14 \n", - "Performance fee and cost - 412778803 generated is null and truth is - 0.67 Telstra Property Pension\n", - "Performance fee and cost - 412778803 generated is null and truth is - 0.01 Telstra Cash Pension\n", - "Performance fee and cost - 412778803 generated is null and truth is - 0.01 Telstra Australian shares Pension\n", - "Performance fee and cost - 412778803 generated is null and truth is - 0.14 Telstra Defensive growth Pension\n", - "Performance fee and cost - 412778803 generated is null and truth is - 0.01 Telstra International shares Pension\n", - "Performance fee and cost - 414751292 truth - 0.24 and generated - 0 Platinum Global Fund (Long Only)\n", - "Performance fee and cost - 414751292 truth - 0.15 and generated - 0 \n", - "Performance fee and cost - 414751292 truth - 0.03 and generated - 0 Platinum International Brands Fund\n", - "Performance fee and cost - 414751292 truth - 0.86 and generated - 0 Platinum International Healthcare\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 \n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Ausbil Aus. Emrging Leaders\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Investors Mutual Aus. Shre\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Macquarie Inc Opportunities\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Cash\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Global Share Fund\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPF - Hedged Global Share Fund\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - Hedged Global Share Fund\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - IncomeBuilder\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPF - PIMCO Div. Fixed Interest\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPF - PIMCO Global Bond Fund\n", - "Performance fee and cost - 420339794 generated is null and truth is - 0 MLC MKPFPR - PIMCO Global Bond Fund\n", - "Performance fee and cost - 446324179 generated is null and truth is - 0.28 Lifeplan Investment Bond - Allan Gray Australian Equity Fund\n", - "Performance fee and cost - 446324179 generated is null and truth is - 0.05 Lifeplan MLC Horizon 2-Capital Stable Open\n", - "Performance fee and cost - 454036250 generated is null and truth is -   \n", - "Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Global Value Trust -Active ETF\n", - "Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Australia Core Equity Trust - Active ETF\n", - "Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Australian Value Trust - Active ETF\n", - "Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n", - "Performance fee and cost - 530101994 truth is null and generated - 0 Dimensional Global Core Equity Tr\n", - "Performance fee and cost - 550769189 truth is null and generated - 0 Acadian Global Managed Volatility Equity - Class A\n", - "Performance fee and cost - 550522985 truth is null and generated - 0 RQI Global Value – Class A\n", - "Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock Australian Fixed Interest Index\n", - "Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock Australian Equity Index\n", - "Performance fee and cost - 539266893 generated is null and truth is - AMP Generations - Alliance Capital Cash Management\n", - "Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock Property Securities Index\n", - "Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock International Equity Index (Unhedged)\n", - "Performance fee and cost - 539266893 generated is null and truth is - AMP - Generations - BlackRock International Equity Index (Hedged)\n", - "Performance fee and cost - 539241700 truth - 0.08 and generated - 0.05 North Professional Balanced\n", - "Performance fee and cost - 539241700 truth - 0.06 and generated - 0 North Professional High Growth\n", - "Performance fee and cost - 539241700 truth - 0.08 and generated - 0 North Professional Conservative\n", - "Performance fee and cost - 539241700 truth - 0.08 and generated - 0 North Professional Growth\n", - "Performance fee and cost - 539241700 truth - 0.09 and generated - 0 North Professional Moderately Conservative\n", - "Performance fee and cost - 539261734 truth - 0.01 and generated - 0 ipac life choices Income Generator\n", - "Performance fee and cost - 539261734 truth - 0.06 and generated - 0 ipac life choices Active 100\n", - "Performance fee and cost - 539261734 truth - 0.08 and generated - 0 ipac life choices Active 85\n", - "Performance fee and cost - 539261734 truth - 0.01 and generated - 0 ipac life choices Index 50\n", - "Performance fee and cost - 539261734 truth - 0.09 and generated - 0 ipac life choices Active 50\n", - "Performance fee and cost - 539261734 truth - 0.08 and generated - 0 ipac life choices Active 70\n", - "Performance fee and cost - 506913190 generated is null and truth is - 0.03 FC W Pen-CFS TTR Moderate\n", - "Performance fee and cost - 506913190 generated is null and truth is - 0.04 FC W Pen-CFS TTR Growth\n", - "Performance fee and cost - 506913190 generated is null and truth is - 0.47 \n", - "Performance fee and cost - 553449663 truth - 0 and generated - 0.07 AMP Capital Specialist International Share (Hedged) Fund\n", - "Performance fee and cost - 539266874 truth - 0.03 and generated - 0 SUMMIT Select - Active High Growth Units\n", - "Performance fee and cost - 539266874 truth - 0.05 and generated - 0 SUMMIT Select - Active Moderately Defensive\n", - "Performance fee and cost - 539266874 truth - 0.05 and generated - 0 SUMMIT Select - Active Growth Units\n", - "Performance fee and cost - 539266874 truth - 0.05 and generated - 0 SUMMIT Select - Active Balanced\n", - "Performance fee and cost - 539266874 truth - 0.06 and generated - 0 SUMMIT Select - Active Defensive Units\n", - "Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Active High Growth\n", - "Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Active Moderately Defensive\n", - "Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Active Growth\n", - "Performance fee and cost - 539266880 truth - 0.01 and generated - 0 North Multi-manager Balanced\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BT Future Goals BTFM\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BTFM Asian Share\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BT International Share BTFM\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BT Smaller Companies BTFM\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BT Investment Funds - BT TIME Fund\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BT European Share Growth\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BT American Share Growth\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 BT Imputation Share BTFM\n", - "Performance fee and cost - 526200514 generated is null and truth is - 0 \n", - "Performance fee and cost - 521606755 truth is null and generated - 0 CFS Index Diversified\n", - "Performance fee and cost - 557526129 truth is null and generated - 0 Fortlake Real-Income Fund\n", - "Performance fee and cost - 540028470 truth is null and generated - 0 CFS Wholesale Index Australian Share\n", - "Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Global Core Equity Trust (Unhedged Class) - Active ETF\n", - "Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Australian Value Trust - Active ETF\n", - "Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Global Value Trust -Active ETF\n", - "Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Australia Core Equity Trust - Active ETF\n", - "Performance fee and cost - 531373053 truth is null and generated - 0 Dimensional Global Small Company Trust\n", - "Performance fee and cost - 557362553 truth is null and generated - 0 JPMorgan Global Select Equity Fund\n", - "Performance fee and cost - 527969661 truth is null and generated - 0 JPMorgan Global Equity Premium Income (Hedged) Complex ETF\n", - "Performance fee and cost - 384508026 generated is null and truth is - 0 Mercer Multi-manager High Growth Fund\n", - "Performance fee and cost - 384508026 generated is null and truth is - 0 Mercer Multi-manager Growth Fund\n", - "Performance fee and cost - 384508026 generated is null and truth is - 0 \n", - "total - 452.72727272727275\n", + "Document List File - None\n", "Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n", - "Management Fee and Costs \t0.8790 \t0.9250 \t0.8213 \t0.9014 \t494 \t407 \t2 \t56 \t33 \n", - "Management Fee \t0.8985 \t0.9265 \t0.8394 \t0.9123 \t494 \t416 \t2 \t47 \t33 \n", - "Performance fee and cost \t0.7871 \t0.8472 \t0.7791 \t0.8161 \t327 \t244 \t144 \t66 \t44 \n", - "Interposed vehicle Performance fee and Costs \t0.5000 \t1.0000 \t0.9237 \t0.6667 \t39 \t38 \t422 \t38 \t0 \n", - "Administration Fee and costs \t0.9787 \t0.9388 \t0.9839 \t0.9583 \t98 \t92 \t398 \t2 \t6 \n", - "Total Annual Dollar Based Charges \t0.8165 \t1.0000 \t0.9598 \t0.8990 \t90 \t89 \t389 \t20 \t0 \n", - "Buy Spread \t0.8957 \t0.8910 \t0.8394 \t0.8933 \t405 \t335 \t83 \t39 \t41 \n", - "Sell Spread \t0.9064 \t0.8921 \t0.8474 \t0.8992 \t405 \t339 \t83 \t35 \t41 \n", - "Minimum Initial Investment \t0.8571 \t0.9671 \t0.8815 \t0.9088 \t310 \t294 \t145 \t49 \t10 \n", - "Benchmark \t0.6402 \t0.8582 \t0.8233 \t0.7333 \t173 \t121 \t289 \t68 \t20 \n", - "TOTAL \t0.8159 \t0.9246 \t0.8699 \t0.8588 \t2835 \t2375 \t1957 \t420 \t228 \n", - "Total Funds Matched - 498\n", - "Total Funds Not Matched - 28\n", - "Percentage of Funds Matched - 94.67680608365019\n" + "management_fee_and_costs \t0.8907 \t0.9513 \t0.8525 \t0.9200 \t457 \t391 \t2 \t48 \t20 \n", + "management_fee \t0.9043 \t0.9520 \t0.8655 \t0.9276 \t457 \t397 \t2 \t42 \t20 \n", + "performance_fee_costs \t0.8408 \t0.8556 \t0.8113 \t0.8482 \t303 \t243 \t131 \t46 \t41 \n", + "interposed_vehicle_performance_fee_cost \t0.6316 \t1.0000 \t0.9393 \t0.7742 \t49 \t48 \t385 \t28 \t0 \n", + "administration_fees \t0.9767 \t0.9655 \t0.9892 \t0.9711 \t87 \t84 \t372 \t2 \t3 \n", + "total_annual_dollar_based_charges \t0.8350 \t1.0000 \t0.9631 \t0.9101 \t87 \t86 \t358 \t17 \t0 \n", + "buy_spread \t0.9059 \t0.9258 \t0.8655 \t0.9158 \t391 \t337 \t62 \t35 \t27 \n", + "sell_spread \t0.9113 \t0.9262 \t0.8698 \t0.9187 \t391 \t339 \t62 \t33 \t27 \n", + "minimum_initial_investment \t0.9463 \t0.9814 \t0.9479 \t0.9635 \t329 \t317 \t120 \t18 \t6 \n", + "benchmark_name \t0.7444 \t0.8701 \t0.8568 \t0.8024 \t172 \t134 \t261 \t46 \t20 \n", + "TOTAL \t0.8587 \t0.9428 \t0.8961 \t0.8951 \t2723 \t2376 \t1755 \t315 \t164 \n", + "Total Funds Matched - 461\n", + "Total Funds Not Matched - 125\n", + "Percentage of Funds Matched - 78.66894197952219\n", + "All Providers Results: \n", + "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", + "Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n", + "management_fee_and_costs \t0.8960 \t0.9451 \t0.8516 \t0.9199 \t180 \t155 \t0 \t18 \t9 \n", + "management_fee \t0.9017 \t0.9455 \t0.8571 \t0.9231 \t180 \t156 \t0 \t17 \t9 \n", + "performance_fee_costs \t0.8000 \t0.8261 \t0.8077 \t0.8128 \t94 \t76 \t71 \t19 \t16 \n", + "interposed_vehicle_performance_fee_cost \t0.5273 \t1.0000 \t0.8571 \t0.6905 \t30 \t29 \t127 \t26 \t0 \n", + "administration_fees \t1.0000 \t0.3333 \t0.9890 \t0.5000 \t3 \t1 \t179 \t0 \t2 \n", + "buy_spread \t0.9643 \t0.9419 \t0.9121 \t0.9529 \t176 \t162 \t4 \t6 \t10 \n", + "sell_spread \t0.9702 \t0.9422 \t0.9176 \t0.9560 \t176 \t163 \t4 \t5 \t10 \n", + "minimum_initial_investment \t0.9137 \t0.9549 \t0.9011 \t0.9338 \t139 \t127 \t37 \t12 \t6 \n", + "benchmark_name \t0.7188 \t0.8734 \t0.7967 \t0.7886 \t91 \t69 \t76 \t27 \t10 \n", + "TOTAL \t0.7692 \t0.7762 \t0.8885 \t0.7478 \t1069 \t938 \t679 \t131 \t236 \n", + "Total Funds Matched - 182\n", + "Total Funds Not Matched - 24\n", + "Percentage of Funds Matched - 88.3495145631068\n", + "All Providers Results: \n", + "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", + "Metric \tPrecision \tRecall \tAccuracy \tF1-Score \tSUPPORT \tTP \tTN \tFP \tFN \n", + "management_fee_and_costs \t0.8872 \t0.9555 \t0.8530 \t0.9201 \t277 \t236 \t2 \t30 \t11 \n", + "management_fee \t0.9060 \t0.9563 \t0.8710 \t0.9305 \t277 \t241 \t2 \t25 \t11 \n", + "performance_fee_costs \t0.8608 \t0.8698 \t0.8136 \t0.8653 \t209 \t167 \t60 \t27 \t25 \n", + "interposed_vehicle_performance_fee_cost \t0.9048 \t1.0000 \t0.9928 \t0.9500 \t19 \t19 \t258 \t2 \t0 \n", + "administration_fees \t0.9765 \t0.9881 \t0.9892 \t0.9822 \t84 \t83 \t193 \t2 \t1 \n", + "total_annual_dollar_based_charges \t0.8431 \t1.0000 \t0.9427 \t0.9149 \t87 \t86 \t177 \t16 \t0 \n", + "buy_spread \t0.8578 \t0.9115 \t0.8351 \t0.8838 \t215 \t175 \t58 \t29 \t17 \n", + "sell_spread \t0.8627 \t0.9119 \t0.8387 \t0.8866 \t215 \t176 \t58 \t28 \t17 \n", + "minimum_initial_investment \t0.9694 \t1.0000 \t0.9785 \t0.9845 \t190 \t190 \t83 \t6 \t0 \n", + "benchmark_name \t0.7738 \t0.8667 \t0.8961 \t0.8176 \t81 \t65 \t185 \t19 \t10 \n", + "TOTAL \t0.8842 \t0.9460 \t0.9011 \t0.9136 \t1654 \t1438 \t1076 \t184 \t328 \n", + "Total Funds Matched - 279\n", + "Total Funds Not Matched - 101\n", + "Percentage of Funds Matched - 73.42105263157895\n" ] } ], @@ -499,7 +420,9 @@ "from collections import defaultdict\n", "import pandas as pd\n", "import statistics\n", - "\n", + "import os\n", + "import re\n", + "from utils.similarity import Similarity\n", "\n", "funds_matched = 0\n", "funds_not_matched = 0\n", @@ -519,7 +442,7 @@ " return headers, data\n", "\n", "def index_data_by_key(data, key_index, secondary_key_index, header):\n", - " \"\"\"Index data by primary and secondary keys (doc_id and fund_name).\"\"\"\n", + " \"\"\"Index data by primary and secondary keys (doc_id and sec_name).\"\"\"\n", " indexed_data = defaultdict(dict)\n", " \n", " for row in data:\n", @@ -528,7 +451,8 @@ " for i in range(len(row)):\n", " if header[i] == \"doc_id\":\n", " primary_key = int(row[i])\n", - " elif header[i] == \"fund_name\":\n", + " elif header[i] == \"sec_name\":\n", + " # share class should be the comparison level and key\n", " secondary_key = str(row[i])\n", " else:\n", " row_data[header[i]] = convert_if_number(row[i])\n", @@ -549,7 +473,7 @@ " value1 = convert_if_number(value1)\n", " value2 = convert_if_number(value2)\n", " return value1 == value2\n", - "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched):\n", + "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):\n", " \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n", " results = {}\n", " funds_matched, funds_not_matched = 0, 0\n", @@ -566,11 +490,15 @@ " # Iterate over the generated results instead of the ground truth\n", " \n", " total = 0\n", - " for doc_id, funds in ground_truth.items():\n", + " message_list = []\n", + " # print(document_list)\n", + " for doc_id, secs in ground_truth.items():\n", + " if document_list is not None and str(doc_id) not in document_list:\n", + " continue\n", " if doc_id in generated_results:\n", - " for fund_name, truth_values in funds.items():\n", - " if fund_name in generated_results[doc_id]:\n", - " generated_values = generated_results[doc_id][fund_name]\n", + " for sec_name, truth_values in secs.items():\n", + " if sec_name in generated_results[doc_id]:\n", + " generated_values = generated_results[doc_id][sec_name]\n", " # Compare all other columns\n", " for i in intersection_list:\n", " for keys in imp_datapoints:\n", @@ -581,52 +509,56 @@ " results[i][\"TN\"] = results[i][\"TN\"] + 1\n", " else:\n", " results[i][\"FP\"] = results[i][\"FP\"] + 1\n", - " if \"Performance fee and cost\" in keys:\n", - " debug = 0\n", - " print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], fund_name) \n", + " # if \"Performance fee and cost\" in keys:\n", + " debug = 0\n", + " # print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], sec_name) \n", + " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is null and generated is not null\"}\n", + " message_list.append(message) \n", " else:\n", " if truth_values[i] == generated_values[i]:\n", " results[i][\"TP\"] = results[i][\"TP\"] + 1\n", " elif generated_values[i] != \"\":\n", - " results[i][\"FP\"] = results[i][\"FP\"] + 1\n", - " if \"Performance fee and cost\" in keys:\n", + " if i == \"benchmark_name\" and compare_text(truth_values[i], generated_values[i]):\n", + " results[i][\"TP\"] = results[i][\"TP\"] + 1\n", + " else:\n", + " results[i][\"FP\"] = results[i][\"FP\"] + 1\n", + " # if \"Performance fee and cost\" in keys:\n", " debug = 0\n", - " print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", fund_name)\n", + " # print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", sec_name)\n", + " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is not equal with generated\"}\n", + " message_list.append(message)\n", " else:\n", " results[i][\"FN\"] = results[i][\"FN\"] + 1\n", - " if \"Performance fee and cost\" in keys:\n", - " debug = 0\n", - " print(keys, \" - \" , doc_id, \" generated is null and truth is - \", truth_values[i], fund_name)\n", + " # if \"Performance fee and cost\" in keys:\n", + " debug = 0\n", + " # print(keys, \" - \" , doc_id, \" generated is null and truth is - \", truth_values[i], sec_name)\n", + " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Generated is null and truth is not null\"}\n", + " message_list.append(message)\n", " results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n", - "\n", - "\n", - " # if truth_values[i] == generated_values[i] and truth_values[i] == \"\":\n", - " # results[i][\"TN\"] = results[i][\"TN\"] + 1\n", - " # elif truth_values[i] == generated_values[i]:\n", - " # results[i][\"TP\"] = results[i][\"TP\"] + 1\n", - " # elif truth_values[i] != \"\" and generated_values[i] == \"\":\n", - " # results[i][\"FN\"] = results[i][\"FN\"] + 1\n", - " # elif truth_values[i] == \"\" and generated_values[i] != \"\":\n", - " # results[i][\"FP\"] = results[i][\"FP\"] + 1\n", - " # else:\n", - " # results[i][\"FP\"] = results[i][\"FP\"] + 1\n", - " # if truth_values[i] != \"\":\n", - " # results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n", " funds_matched += 1\n", " else:\n", " funds_not_matched += 1\n", - " # for keys in headers:\n", - " # if keys != \"doc_id\":\n", - " # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n", " else:\n", " # If the entire document is not found, count all funds as not matched\n", - " funds_not_matched += len(funds)\n", - " # for fund_name in funds:\n", - " # for keys in headers:\n", - " # if keys != \"doc_id\":\n", - " # results[keys][\"FN\"] = results[keys][\"FN\"] + 1\n", - " return results, funds_matched, funds_not_matched\n", + " funds_not_matched += len(secs)\n", + " return results, message_list, funds_matched, funds_not_matched\n", "\n", + "def clean_text(text: str):\n", + " if text is None or len(text) == 0:\n", + " return text\n", + " text = re.sub(r\"\\W\", \" \", text)\n", + " text = re.sub(r\"\\s+\", \" \", text)\n", + " return text\n", + "\n", + "def compare_text(source_text, target_text):\n", + " source_text = clean_text(source_text)\n", + " target_text = clean_text(target_text)\n", + " if source_text == target_text or source_text in target_text or target_text in source_text:\n", + " return True\n", + " similarity = Similarity()\n", + " jacard_score = similarity.jaccard_similarity(source_text.lower().split(), target_text.lower().split())\n", + " if jacard_score > 0.8:\n", + " return True\n", "\n", "# Load the files\n", "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n", @@ -664,12 +596,15 @@ " total_fp = []\n", " #total_fn = []\n", " # Calculate and print metrics for each item\n", + " metrics_list = []\n", " for keys in imp_datapoints:\n", " try:\n", " key = imp_datapoints_mapping[keys]\n", " values = data[key]\n", " tp, tn, fp, fn = values['TP'], values['TN'], values['FP'], values['FN']\n", " precision, recall, accuracy, f1_score = calculate_metrics(tp, tn, fp, fn)\n", + " metrics = {\"Datapoint\": keys, \"F1-Score\": f1_score, \"Precision\": precision, \"Recall\": recall, \"Accuracy\": accuracy, \"SUPPORT\": values[\"SUPPORT\"], \"TP\": tp, \"TN\": tn, \"FP\": fp, \"FN\": fn}\n", + " metrics_list.append(metrics)\n", " total_precision.append(precision)\n", " total_recall.append(recall)\n", " total_accuracy.append(accuracy)\n", @@ -681,10 +616,22 @@ " total_fn.append(fn)\n", "\n", " if values[\"SUPPORT\"] > 0 and key > \"\":\n", - " print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(keys, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n", + " print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(key, precision, recall, accuracy, f1_score, values[\"SUPPORT\"], tp, tn, fp, fn))\n", " except:\n", " pass\n", - " print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", statistics.mean(total_precision), statistics.mean(total_recall), statistics.mean(total_accuracy), statistics.mean(total_f1_score), sum(total_support), sum(total_tp), sum(total_tn), sum(total_fp), sum(total_fn)))\n", + " total_mean_precision = statistics.mean(total_precision)\n", + " total_mean_recall = statistics.mean(total_recall)\n", + " total_mean_accuracy = statistics.mean(total_accuracy)\n", + " total_mean_f1_score = statistics.mean(total_f1_score)\n", + " total_sum_support = sum(total_support)\n", + " total_sum_tp = sum(total_tp)\n", + " total_sum_tn = sum(total_tn)\n", + " total_sum_fp = sum(total_fp)\n", + " total_sum_fn = sum(total_fn)\n", + " total_metrics = {\"Datapoint\": \"TOTAL\", \"F1-Score\": total_mean_f1_score, \"Precision\": total_mean_precision, \"Recall\": total_mean_recall, \"Accuracy\": total_mean_accuracy, \"SUPPORT\": total_sum_support, \"TP\": total_sum_tp, \"TN\": total_sum_tn, \"FP\": total_sum_fp, \"FN\": total_sum_fn}\n", + " metrics_list.append(total_metrics)\n", + " print(\"{:<50}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.4f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\\t{:<10.0f}\".format(\"TOTAL\", total_mean_precision, total_mean_recall, total_mean_accuracy, total_mean_f1_score, total_sum_support, total_sum_tp, total_sum_tn, total_sum_fp, total_sum_fn))\n", + " return metrics_list\n", " \n", "def create_metrics_df(data):\n", " # Define a list to hold data for DataFrame\n", @@ -771,14 +718,45 @@ "\n", "print(\"\\n\")\n", "print(\"\\n\")\n", - "print(\"All Providers Results: \")\n", - "comparison_results, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, generated_results_indexed, headers_gt, doc_id_index, fund_name_index, intersection_list,funds_matched, funds_not_matched)\n", + "document_list_file_list = [None, \n", + " \"./sample_documents/aus_prospectus_29_documents_sample.txt\", \n", + " \"./sample_documents/aus_prospectus_17_documents_sample.txt\"]\n", + "for document_list_file in document_list_file_list:\n", + " document_list = None\n", + " if document_list_file is not None:\n", + " with open(document_list_file, \"r\", encoding=\"utf-8\") as f:\n", + " document_list = f.readlines()\n", + " document_list = [doc_id.strip() for doc_id in document_list]\n", + " \n", + " print(\"All Providers Results: \")\n", + " print(\"Document List File - \", document_list_file)\n", + " comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, \n", + " generated_results_indexed, \n", + " headers_gt, doc_id_index, \n", + " fund_name_index, \n", + " intersection_list,\n", + " funds_matched, \n", + " funds_not_matched,\n", + " document_list)\n", + " metrics_list = print_metrics_table(comparison_results)\n", + " print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n", + " print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n", "\n", - "print_metrics_table(comparison_results)\n", - "print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n", - "print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n", + " metrics_df = pd.DataFrame(metrics_list)\n", + " message_df = pd.DataFrame(message_list)\n", "\n", - "\n" + " output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n", + " if os.path.exists(output_metrics_folder):\n", + " generated_file_base_name = os.path.basename(path_generated_results).replace(\".xlsx\", \"\")\n", + " metrics_file_name = f\"metrics_{generated_file_base_name}\"\n", + " if document_list_file is not None:\n", + " metrics_file_name = f\"{metrics_file_name}_{len(document_list)}_documents.xlsx\"\n", + " else:\n", + " metrics_file_name = f\"{metrics_file_name}_all_documents.xlsx\"\n", + " metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)\n", + " with pd.ExcelWriter(metrics_file_path) as writer:\n", + " metrics_df.to_excel(writer, sheet_name=\"metrics_data\", index=False)\n", + " message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n" ] }, { @@ -833,7 +811,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.12.6" }, "orig_nbformat": 4 }, diff --git a/utils/pdf_download.py b/utils/pdf_download.py index 5191ecb..ed0acd0 100644 --- a/utils/pdf_download.py +++ b/utils/pdf_download.py @@ -41,11 +41,17 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str): ACCESS_KEY = os.getenv('ACCESS_KEY') SECRET_KEY = os.getenv('SECRET_KEY') AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN') - s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), + if AWS_SESSION_TOKEN: + s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, aws_session_token=AWS_SESSION_TOKEN ) + else: + s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY + ) else: s3 = boto3.client('s3')