diff --git a/calc_metrics.py b/calc_metrics.py index b9a4038..bb286fc 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1376,44 +1376,53 @@ def clean_text(text: str): def merge_inference_data(): - file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250308220117.xlsx" - file2 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_5_documents_by_text_20250311165607.xlsx" + file1 = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi.xlsx" + file2 = r"/data/aus_prospectus/output/merged_data/docs/excel/merged_420339794.xlsx" columns = [ "doc_id", + "effective_date", "raw_fund_name", + "raw_share_name", + "raw_name", "fund_id", "fund_name", - "raw_share_name", "sec_id", "sec_name", + "page_index", "management_fee_and_costs", "management_fee", "administration_fees", - "minimum_initial_investment", - "benchmark_name", "performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread", - "total_annual_dollar_based_charges" + "total_annual_dollar_based_charges", + "minimum_initial_investment", + "benchmark_name", + "indirect_costs", + "recoverable_expenses", + "change_recoverable_expenses" ] file1_data_df = pd.read_excel(file1, sheet_name="total_mapping_data") file1_data_df = file1_data_df[columns] - file2_data_df = pd.read_excel(file2, sheet_name="total_mapping_data") + # remove the rows which doc_id is 420339794 from file1_data_df + file1_data_df = file1_data_df[file1_data_df["doc_id"] != 420339794] + + file2_data_df = pd.read_excel(file2, sheet_name="merged_data") file2_data_df = file2_data_df[columns] total_data_df = pd.concat([file1_data_df, file2_data_df]) total_data_df.reset_index(drop=True, inplace=True) output_folder = r"/data/aus_prospectus/output/mapping_data/total/" - output_file = os.path.join(output_folder, "merged_mapping_data_info_46_documents_by_text.xlsx") + output_file = os.path.join(output_folder, "mapping_data_info_46_documents_by_text_20250317_Ravi_modified.xlsx") with pd.ExcelWriter(output_file) as f: total_data_df.to_excel(f, index=False, sheet_name="total_mapping_data") if __name__ == "__main__": - # merge_inference_data() + merge_inference_data() # adjust_column_order() # set_mapping_to_data_side_documents_data() @@ -1436,14 +1445,14 @@ if __name__ == "__main__": "./sample_documents/aus_prospectus_17_documents_sample.txt"] zero_equal_none = False is_for_all = True - for verify_document_list_file in verify_document_list_file_list: - calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, - audit_data_sheet=audit_data_sheet, - verify_file_path=verify_file_path, - verify_data_sheet=verify_data_sheet, - verify_document_list_file = verify_document_list_file, - is_for_all=is_for_all, - zero_equal_none=zero_equal_none) + # for verify_document_list_file in verify_document_list_file_list: + # calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, + # audit_data_sheet=audit_data_sheet, + # verify_file_path=verify_file_path, + # verify_data_sheet=verify_data_sheet, + # verify_document_list_file = verify_document_list_file, + # is_for_all=is_for_all, + # zero_equal_none=zero_equal_none) # for verify_document_list_file in verify_document_list_file_list: # calculate_metrics_by_provider(audit_file_path=audit_file_path, diff --git a/core/data_extraction.py b/core/data_extraction.py index 76472a6..60faba4 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -793,7 +793,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [25]: + # if page_num not in [4, 5]: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index f768996..f4e832a 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -351,12 +351,11 @@ "total_annual_dollar_based_charges": [ "Total annual dollar-based charges are share class level data.", "A. Its value corresponds to the administration fees and costs that are charged on a weekly basis.", - "----Example 1 Start----", + "----Example Start----", "MLC MasterKey Super & Pension Fundamentals\nType of fee or cost \nOngoing annual fees and costs 1 \nAmount \nHow and when paid \nOther administration costs paid from \nreserves of 0.00% pa of your account \nbalance. \nPlus \nA fixed fee of $1.50 per week \nThis fee is deducted monthly if your account balance is below $50,000 \nwhen the percentage administration fee is deducted. \nInvestment fees and \ncosts 2 \nInvestment fees and estimated costs \nfor MLC Horizon 4 Balanced Portfolio, \n1.20% pa. \nYou won ’ t see these fees and costs as direct charges to your account. \nThey're reflected in the daily unit price of each investment option and will \nreduce the net return on your investment \nInvestment fees and estimated costs \nfor other investment options, ranges \nfrom 0.00% pa to 2.84% pa \n(estimated). \nTransaction costs \nMLC Horizon 4 Balanced Portfolio, \n0.06% pa (estimated). \nOther investment options, ranges \nfrom 0.00% pa to 0.24% pa \n(estimated). \nYou won ’ t see these costs as direct charges to your account. They're \nreflected in the daily unit price of each investment option and will reduce \nthe net return on your investment. \nMember activity related fees and costs \nBuy-sell spread \nYou won ’ t see this fee as a direct charge to your account. It ’ s reflected in \nthe buy and sell unit price of each investment option when there ’ s a \ntransaction on your account. \nMLC Horizon 4 Balanced Portfolio, \n0.10%/0.10% \nOther investment options, ranges \nfrom 0.00%/0.00% to 0.30%/0.30% \nThe current buy-sell spreads of an investment option are available at \nmlc.com.au/buysellspreads \n", - "----Example 1 End----", + "----Example End----", "According to example, the fixed fee is $1.50 per week, so total_annual_dollar_based_charges is 1.50 * 52 = 78", - "In the context, also with management fees and costs, management fee, buy_spread and sell_spread for specific fund: MLC Horizon 4 Balanced Portfolio.", - "Please output the relevant values based on specific fund name.", + "In the example, also with management fees and costs, management fee, buy_spread and sell_spread for specific fund: MLC Horizon 4 Balanced Portfolio.", "The output should be:", "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", "\n", diff --git a/main.py b/main.py index 08aa17d..d7b0046 100644 --- a/main.py +++ b/main.py @@ -1538,7 +1538,7 @@ if __name__ == "__main__": with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list = ["441280757", "454036250"] + # special_doc_id_list = ["420339794"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/performance.ipynb b/performance.ipynb index 9b08bd5..4869a68 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 18, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -44,53 +44,53 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9169 \t0.8581 \t0.9843 \t0.8465 \t442 \t375 \t0 \t62 \t6 \n", - "management_fee \t0.9351 \t0.8902 \t0.9848 \t0.8781 \t442 \t389 \t0 \t48 \t6 \n", - "performance_fee_costs \t0.8653 \t0.8426 \t0.8893 \t0.8194 \t309 \t257 \t106 \t48 \t32 \n", - "interposed_vehicle_performance_fee_cost \t0.9412 \t0.8889 \t1.0000 \t0.9797 \t73 \t72 \t362 \t9 \t0 \n", - "administration_fees \t0.9811 \t0.9873 \t0.9750 \t0.9932 \t80 \t78 \t362 \t1 \t2 \n", - "total_annual_dollar_based_charges \t0.9857 \t0.9718 \t1.0000 \t0.9955 \t69 \t69 \t372 \t2 \t0 \n", - "buy_spread \t0.9129 \t0.8879 \t0.9392 \t0.8668 \t363 \t309 \t75 \t39 \t20 \n", - "sell_spread \t0.9176 \t0.8966 \t0.9398 \t0.8736 \t363 \t312 \t75 \t36 \t20 \n", - "minimum_initial_investment \t0.9532 \t0.9641 \t0.9425 \t0.9345 \t313 \t295 \t119 \t11 \t18 \n", - "benchmark_name \t0.8100 \t0.7847 \t0.8370 \t0.8804 \t148 \t113 \t277 \t31 \t22 \n", - "TOTAL \t0.9219 \t0.8972 \t0.9492 \t0.9068 \t2602 \t2269 \t1748 \t287 \t126 \n", - "Total Funds Matched - 443\n", - "Total Funds Not Matched - 122\n", - "Percentage of Funds Matched - 78.40707964601769\n", + "management_fee_and_costs \t0.9204 \t0.8852 \t0.9586 \t0.8533 \t406 \t347 \t2 \t45 \t15 \n", + "management_fee \t0.9415 \t0.9235 \t0.9602 \t0.8900 \t406 \t362 \t2 \t30 \t15 \n", + "performance_fee_costs \t0.8953 \t0.9277 \t0.8652 \t0.8680 \t281 \t231 \t124 \t18 \t36 \n", + "interposed_vehicle_performance_fee_cost \t0.9600 \t0.9231 \t1.0000 \t0.9853 \t73 \t72 \t331 \t6 \t0 \n", + "administration_fees \t0.8319 \t0.9592 \t0.7344 \t0.9535 \t64 \t47 \t343 \t2 \t17 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t66 \t66 \t343 \t0 \t0 \n", + "buy_spread \t0.9359 \t0.9235 \t0.9486 \t0.8949 \t349 \t314 \t52 \t26 \t17 \n", + "sell_spread \t0.9407 \t0.9324 \t0.9491 \t0.9022 \t349 \t317 \t52 \t23 \t17 \n", + "minimum_initial_investment \t0.9737 \t0.9642 \t0.9834 \t0.9609 \t301 \t296 \t97 \t11 \t5 \n", + "benchmark_name \t0.8047 \t0.8175 \t0.7923 \t0.8778 \t141 \t103 \t256 \t23 \t27 \n", + "TOTAL \t0.9204 \t0.9256 \t0.9192 \t0.9186 \t2436 \t2155 \t1602 \t184 \t149 \n", + "Total Funds Matched - 409\n", + "Total Funds Not Matched - 156\n", + "Percentage of Funds Matched - 72.38938053097344\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9412 \t0.9040 \t0.9816 \t0.8889 \t179 \t160 \t0 \t17 \t3 \n", - "management_fee \t0.9744 \t0.9661 \t0.9828 \t0.9500 \t179 \t171 \t0 \t6 \t3 \n", - "performance_fee_costs \t0.7876 \t0.8172 \t0.7600 \t0.7722 \t102 \t76 \t63 \t17 \t24 \n", - "interposed_vehicle_performance_fee_cost \t0.9286 \t0.8667 \t1.0000 \t0.9556 \t53 \t52 \t120 \t8 \t0 \n", - "administration_fees \t0.9231 \t1.0000 \t0.8571 \t0.9889 \t14 \t12 \t166 \t0 \t2 \n", - "buy_spread \t0.9217 \t0.9053 \t0.9387 \t0.8556 \t177 \t153 \t1 \t16 \t10 \n", - "sell_spread \t0.9281 \t0.9172 \t0.9394 \t0.8667 \t177 \t155 \t1 \t14 \t10 \n", - "minimum_initial_investment \t0.9118 \t0.9538 \t0.8732 \t0.8667 \t142 \t124 \t32 \t6 \t18 \n", - "benchmark_name \t0.8280 \t0.8333 \t0.8228 \t0.8500 \t87 \t65 \t88 \t13 \t14 \n", - "TOTAL \t0.9049 \t0.9071 \t0.9062 \t0.8883 \t1110 \t968 \t471 \t97 \t210 \n", - "Total Funds Matched - 180\n", - "Total Funds Not Matched - 16\n", - "Percentage of Funds Matched - 91.83673469387756\n", + "management_fee_and_costs \t0.9457 \t0.8970 \t1.0000 \t0.8970 \t164 \t148 \t0 \t17 \t0 \n", + "management_fee \t0.9783 \t0.9576 \t1.0000 \t0.9576 \t164 \t158 \t0 \t7 \t0 \n", + "performance_fee_costs \t0.8263 \t0.8846 \t0.7753 \t0.8242 \t95 \t69 \t67 \t9 \t20 \n", + "interposed_vehicle_performance_fee_cost \t0.9455 \t0.8966 \t1.0000 \t0.9636 \t53 \t52 \t107 \t6 \t0 \n", + "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t1 \t1 \t164 \t0 \t0 \n", + "buy_spread \t0.9812 \t0.9752 \t0.9874 \t0.9636 \t162 \t157 \t2 \t4 \t2 \n", + "sell_spread \t0.9876 \t0.9876 \t0.9876 \t0.9758 \t162 \t159 \t2 \t2 \t2 \n", + "minimum_initial_investment \t0.9569 \t0.9531 \t0.9606 \t0.9333 \t127 \t122 \t32 \t6 \t5 \n", + "benchmark_name \t0.7651 \t0.7808 \t0.7500 \t0.7879 \t85 \t57 \t73 \t16 \t19 \n", + "TOTAL \t0.9318 \t0.9258 \t0.9401 \t0.9226 \t1013 \t923 \t447 \t67 \t197 \n", + "Total Funds Matched - 165\n", + "Total Funds Not Matched - 31\n", + "Percentage of Funds Matched - 84.18367346938776\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.8996 \t0.8269 \t0.9862 \t0.8175 \t263 \t215 \t0 \t45 \t3 \n", - "management_fee \t0.9064 \t0.8385 \t0.9864 \t0.8289 \t263 \t218 \t0 \t42 \t3 \n", - "performance_fee_costs \t0.9027 \t0.8538 \t0.9577 \t0.8517 \t207 \t181 \t43 \t31 \t8 \n", - "interposed_vehicle_performance_fee_cost \t0.9756 \t0.9524 \t1.0000 \t0.9962 \t20 \t20 \t242 \t1 \t0 \n", - "administration_fees \t0.9925 \t0.9851 \t1.0000 \t0.9962 \t66 \t66 \t196 \t1 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t194 \t0 \t0 \n", - "buy_spread \t0.9043 \t0.8715 \t0.9398 \t0.8745 \t186 \t156 \t74 \t23 \t10 \n", - "sell_spread \t0.9075 \t0.8771 \t0.9401 \t0.8783 \t186 \t157 \t74 \t22 \t10 \n", - "minimum_initial_investment \t0.9856 \t0.9716 \t1.0000 \t0.9810 \t171 \t171 \t87 \t5 \t0 \n", - "benchmark_name \t0.7869 \t0.7273 \t0.8571 \t0.9011 \t61 \t48 \t189 \t18 \t8 \n", - "TOTAL \t0.9261 \t0.8904 \t0.9667 \t0.9125 \t1492 \t1301 \t1099 \t188 \t252 \n", - "Total Funds Matched - 263\n", - "Total Funds Not Matched - 106\n", - "Percentage of Funds Matched - 71.27371273712737\n" + "management_fee_and_costs \t0.9025 \t0.8767 \t0.9299 \t0.8238 \t242 \t199 \t2 \t28 \t15 \n", + "management_fee \t0.9148 \t0.8987 \t0.9315 \t0.8443 \t242 \t204 \t2 \t23 \t15 \n", + "performance_fee_costs \t0.9284 \t0.9474 \t0.9101 \t0.8975 \t186 \t162 \t57 \t9 \t16 \n", + "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t224 \t0 \t0 \n", + "administration_fees \t0.8288 \t0.9583 \t0.7302 \t0.9221 \t63 \t46 \t179 \t2 \t17 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t66 \t66 \t178 \t0 \t0 \n", + "buy_spread \t0.8946 \t0.8771 \t0.9128 \t0.8484 \t187 \t157 \t50 \t22 \t15 \n", + "sell_spread \t0.8977 \t0.8827 \t0.9133 \t0.8525 \t187 \t158 \t50 \t21 \t15 \n", + "minimum_initial_investment \t0.9858 \t0.9721 \t1.0000 \t0.9795 \t174 \t174 \t65 \t5 \t0 \n", + "benchmark_name \t0.8598 \t0.8679 \t0.8519 \t0.9385 \t56 \t46 \t183 \t7 \t8 \n", + "TOTAL \t0.9212 \t0.9281 \t0.9180 \t0.9107 \t1423 \t1232 \t990 \t117 \t298 \n", + "Total Funds Matched - 244\n", + "Total Funds Not Matched - 125\n", + "Percentage of Funds Matched - 66.12466124661248\n" ] } ], @@ -114,7 +114,8 @@ "\"\"\"\n", "\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250314113438.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi_modified.xlsx\"\n", + "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250317_Ravi.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n", "funds_matched = 0\n",