From dd15c1c48e7847cad4cc6441134634c739bce687 Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 14 Mar 2025 11:51:10 -0500 Subject: [PATCH] Optimize for benchmark name --- core/data_extraction.py | 6 +- .../data_extraction_prompts_config.json | 16 +++- main.py | 14 +-- performance.ipynb | 86 +++++++++---------- 4 files changed, 64 insertions(+), 58 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index f7d4609..76472a6 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -322,10 +322,14 @@ class DataExtraction: if "benchmark_name" not in keys: continue benchmark_name = data_item.get("benchmark_name", "") - if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund"): + if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \ + benchmark_name.startswith("CPI "): data_item.pop("benchmark_name") elif benchmark_name[0].isalpha() and not benchmark_name[0].isupper(): data_item.pop("benchmark_name") + elif benchmark_name.lower() in ["benchmark", "composite benchmark", + "fund’s composite benchmark", "long term benchmark"]: + data_item.pop("benchmark_name") else: pass diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 29c54f2..f03c51a 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -499,6 +499,8 @@ "---Example 4 Start---", "Benchmark returns over 25 years by Traditional asset class \n\nPast performance is not a reliable indicator of future performance.\n\nMarket indices: Australian shares – S&P/ASX300 Accumulation Index, International shares – MSCI World Ex-Australia \nIndex (Unhedged)", "---Example 4 End---", + "For this case, after keywords: \"Market indices\", there are multiple benchmark names with format: Fund name – Benchmark name,", + "please extract them all from the context 
contents.", "The output should be:", "{\"data\": [{\"fund name\": \"Australian shares\", \"benchmark_name\": \"S&P/ASX300 Accumulation Index\"}, {\"fund name\": \"International shares\", \"benchmark_name\": \"MSCI World Ex-Australia Index (Unhedged)\"}]}", "---Example 5 Start---", @@ -550,7 +552,9 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}", "\n", - "C. Example to exclude the benchmark name", + "C. Don't extract the benchmark name from the context in the cases below.", + "1. Exclude the benchmark name when its reported name is \"Return target\".", + "2. Exclude benchmark names that start with \"CPI minus\" or \"CPI plus\" or \"CPI +\" or \"CPI -\".", "---Example 1 Start---", "A closer look at our sector investment options \n\nCash¹\nDiversified Fixed Interest\nReturn target CPI minus 0.5% per annum on average over 20 years.", "---Example 1 End---", @@ -559,6 +563,16 @@ "Because these funds have different objectives: Cash funds focus on capital preservation and liquidity, aligning with short-term interest rates, ", "while Diversified Fixed Interest funds aim to reflect bond market performance, influenced by interest rates and credit risk, not inflation.", "The output should be:", + "{\"data\": []}", + "---Example 2 Start---", + "Infrastructure1 Australian Shares \n\nReturn target \n\nCPI plus 2.0% per annum on average over 20 years. CPI plus 4.0% per annum on average over 20 years. \n\n", + "---Example 2 End---", + "Explanation:", + "The terms \"CPI plus 2.0% per annum\" and \"CPI plus 4.0% per annum\" are return targets, not benchmarks. ", + "A benchmark is typically a specific index or standard used to measure the performance of an investment, such as the S&P/ASX 200 for Australian shares. 
", + "Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ", + "They indicate the desired outcome rather than serving as a comparative measure against market performance.", + "The output should be:", "{\"data\": []}" ] }, diff --git a/main.py b/main.py index 01dc940..389aa18 100644 --- a/main.py +++ b/main.py @@ -1524,31 +1524,19 @@ if __name__ == "__main__": # get_aus_prospectus_document_category() - # special_doc_id_list = ["553242411"] - re_run_extract_data = True re_run_mapping_data = True force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" if doc_source == "aus_prospectus": - # document_sample_file = ( - # r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" - # ) - # document_sample_file = ( - # r"./sample_documents/aus_prospectus_17_documents_sample.txt" - # ) document_sample_file = ( r"./sample_documents/aus_prospectus_46_documents_sample.txt" ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] - # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" - # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" - # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list = ["420339794"] - # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"] + # special_doc_id_list = ["441280757", "454036250"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str 
= ( diff --git a/performance.ipynb b/performance.ipynb index bd2aed5..9b08bd5 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -44,53 +44,53 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9395 \t0.9002 \t0.9823 \t0.8858 \t437 \t388 \t0 \t43 \t7 \n", - "management_fee \t0.9496 \t0.9188 \t0.9826 \t0.9041 \t437 \t396 \t0 \t35 \t7 \n", - "performance_fee_costs \t0.8597 \t0.8755 \t0.8445 \t0.8219 \t298 \t239 \t121 \t34 \t44 \n", - "interposed_vehicle_performance_fee_cost \t0.9362 \t0.9429 \t0.9296 \t0.9795 \t72 \t66 \t363 \t4 \t5 \n", - "administration_fees \t0.7862 \t0.9828 \t0.6552 \t0.9292 \t87 \t57 \t350 \t1 \t30 \n", - "total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9772 \t82 \t72 \t356 \t10 \t0 \n", - "buy_spread \t0.9374 \t0.9208 \t0.9547 \t0.8973 \t375 \t337 \t56 \t29 \t16 \n", - "sell_spread \t0.9418 \t0.9290 \t0.9551 \t0.9041 \t375 \t340 \t56 \t26 \t16 \n", - "minimum_initial_investment \t0.9518 \t0.9457 \t0.9579 \t0.9315 \t315 \t296 \t112 \t17 \t13 \n", - "benchmark_name \t0.8553 \t0.8418 \t0.8693 \t0.8973 \t166 \t133 \t260 \t25 \t20 \n", - "TOTAL \t0.9093 \t0.9135 \t0.9131 \t0.9128 \t2644 \t2324 \t1674 \t224 \t158 \n", - "Total Funds Matched - 438\n", - "Total Funds Not Matched - 127\n", - "Percentage of Funds Matched - 77.5221238938053\n", + "management_fee_and_costs \t0.9169 \t0.8581 \t0.9843 \t0.8465 \t442 \t375 \t0 \t62 \t6 \n", + "management_fee \t0.9351 \t0.8902 \t0.9848 \t0.8781 \t442 \t389 \t0 \t48 \t6 \n", + "performance_fee_costs \t0.8653 \t0.8426 \t0.8893 \t0.8194 \t309 \t257 \t106 \t48 \t32 \n", + "interposed_vehicle_performance_fee_cost \t0.9412 \t0.8889 \t1.0000 \t0.9797 \t73 \t72 \t362 \t9 \t0 \n", + "administration_fees \t0.9811 \t0.9873 \t0.9750 \t0.9932 \t80 
\t78 \t362 \t1 \t2 \n", + "total_annual_dollar_based_charges \t0.9857 \t0.9718 \t1.0000 \t0.9955 \t69 \t69 \t372 \t2 \t0 \n", + "buy_spread \t0.9129 \t0.8879 \t0.9392 \t0.8668 \t363 \t309 \t75 \t39 \t20 \n", + "sell_spread \t0.9176 \t0.8966 \t0.9398 \t0.8736 \t363 \t312 \t75 \t36 \t20 \n", + "minimum_initial_investment \t0.9532 \t0.9641 \t0.9425 \t0.9345 \t313 \t295 \t119 \t11 \t18 \n", + "benchmark_name \t0.8100 \t0.7847 \t0.8370 \t0.8804 \t148 \t113 \t277 \t31 \t22 \n", + "TOTAL \t0.9219 \t0.8972 \t0.9492 \t0.9068 \t2602 \t2269 \t1748 \t287 \t126 \n", + "Total Funds Matched - 443\n", + "Total Funds Not Matched - 122\n", + "Percentage of Funds Matched - 78.40707964601769\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9419 \t0.9059 \t0.9809 \t0.8902 \t172 \t154 \t0 \t16 \t3 \n", - "management_fee \t0.9547 \t0.9294 \t0.9814 \t0.9133 \t172 \t158 \t0 \t12 \t3 \n", - "performance_fee_costs \t0.8315 \t0.9024 \t0.7708 \t0.8266 \t97 \t74 \t69 \t8 \t22 \n", - "interposed_vehicle_performance_fee_cost \t0.9630 \t0.9286 \t1.0000 \t0.9769 \t53 \t52 \t117 \t4 \t0 \n", - "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t165 \t0 \t0 \n", - "buy_spread \t0.9699 \t0.9699 \t0.9699 \t0.9422 \t169 \t161 \t2 \t5 \t5 \n", - "sell_spread \t0.9760 \t0.9819 \t0.9702 \t0.9538 \t169 \t163 \t2 \t3 \t5 \n", - "minimum_initial_investment \t0.9027 \t0.9062 \t0.8992 \t0.8555 \t135 \t116 \t32 \t12 \t13 \n", - "benchmark_name \t0.8333 \t0.8025 \t0.8667 \t0.8497 \t85 \t65 \t82 \t16 \t10 \n", - "TOTAL \t0.9303 \t0.9252 \t0.9377 \t0.9120 \t1060 \t951 \t469 \t76 \t219 \n", - "Total Funds Matched - 173\n", - "Total Funds Not Matched - 23\n", - "Percentage of Funds Matched - 88.26530612244898\n", + "management_fee_and_costs \t0.9412 \t0.9040 \t0.9816 \t0.8889 \t179 \t160 \t0 \t17 \t3 \n", + 
"management_fee \t0.9744 \t0.9661 \t0.9828 \t0.9500 \t179 \t171 \t0 \t6 \t3 \n", + "performance_fee_costs \t0.7876 \t0.8172 \t0.7600 \t0.7722 \t102 \t76 \t63 \t17 \t24 \n", + "interposed_vehicle_performance_fee_cost \t0.9286 \t0.8667 \t1.0000 \t0.9556 \t53 \t52 \t120 \t8 \t0 \n", + "administration_fees \t0.9231 \t1.0000 \t0.8571 \t0.9889 \t14 \t12 \t166 \t0 \t2 \n", + "buy_spread \t0.9217 \t0.9053 \t0.9387 \t0.8556 \t177 \t153 \t1 \t16 \t10 \n", + "sell_spread \t0.9281 \t0.9172 \t0.9394 \t0.8667 \t177 \t155 \t1 \t14 \t10 \n", + "minimum_initial_investment \t0.9118 \t0.9538 \t0.8732 \t0.8667 \t142 \t124 \t32 \t6 \t18 \n", + "benchmark_name \t0.8280 \t0.8333 \t0.8228 \t0.8500 \t87 \t65 \t88 \t13 \t14 \n", + "TOTAL \t0.9049 \t0.9071 \t0.9062 \t0.8883 \t1110 \t968 \t471 \t97 \t210 \n", + "Total Funds Matched - 180\n", + "Total Funds Not Matched - 16\n", + "Percentage of Funds Matched - 91.83673469387756\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9379 \t0.8966 \t0.9832 \t0.8830 \t265 \t234 \t0 \t27 \t4 \n", - "management_fee \t0.9463 \t0.9119 \t0.9835 \t0.8981 \t265 \t238 \t0 \t23 \t4 \n", - "performance_fee_costs \t0.8730 \t0.8639 \t0.8824 \t0.8189 \t201 \t165 \t52 \t26 \t22 \n", - "interposed_vehicle_performance_fee_cost \t0.8485 \t1.0000 \t0.7368 \t0.9811 \t19 \t14 \t246 \t0 \t5 \n", - "administration_fees \t0.7597 \t0.9800 \t0.6203 \t0.8830 \t79 \t49 \t185 \t1 \t30 \n", - "total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9623 \t82 \t72 \t183 \t10 \t0 \n", - "buy_spread \t0.9096 \t0.8800 \t0.9412 \t0.8679 \t206 \t176 \t54 \t24 \t11 \n", - "sell_spread \t0.9124 \t0.8850 \t0.9415 \t0.8717 \t206 \t177 \t54 \t23 \t11 \n", - "minimum_initial_investment \t0.9863 \t0.9730 \t1.0000 \t0.9811 \t180 \t180 \t80 \t5 \t0 \n", - "benchmark_name \t0.8774 \t0.8831 \t0.8718 
\t0.9283 \t81 \t68 \t178 \t9 \t10 \n", - "TOTAL \t0.8986 \t0.9151 \t0.8961 \t0.9075 \t1584 \t1373 \t1032 \t148 \t316 \n", - "Total Funds Matched - 265\n", - "Total Funds Not Matched - 104\n", - "Percentage of Funds Matched - 71.81571815718158\n" + "management_fee_and_costs \t0.8996 \t0.8269 \t0.9862 \t0.8175 \t263 \t215 \t0 \t45 \t3 \n", + "management_fee \t0.9064 \t0.8385 \t0.9864 \t0.8289 \t263 \t218 \t0 \t42 \t3 \n", + "performance_fee_costs \t0.9027 \t0.8538 \t0.9577 \t0.8517 \t207 \t181 \t43 \t31 \t8 \n", + "interposed_vehicle_performance_fee_cost \t0.9756 \t0.9524 \t1.0000 \t0.9962 \t20 \t20 \t242 \t1 \t0 \n", + "administration_fees \t0.9925 \t0.9851 \t1.0000 \t0.9962 \t66 \t66 \t196 \t1 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t194 \t0 \t0 \n", + "buy_spread \t0.9043 \t0.8715 \t0.9398 \t0.8745 \t186 \t156 \t74 \t23 \t10 \n", + "sell_spread \t0.9075 \t0.8771 \t0.9401 \t0.8783 \t186 \t157 \t74 \t22 \t10 \n", + "minimum_initial_investment \t0.9856 \t0.9716 \t1.0000 \t0.9810 \t171 \t171 \t87 \t5 \t0 \n", + "benchmark_name \t0.7869 \t0.7273 \t0.8571 \t0.9011 \t61 \t48 \t189 \t18 \t8 \n", + "TOTAL \t0.9261 \t0.8904 \t0.9667 \t0.9125 \t1492 \t1301 \t1099 \t188 \t252 \n", + "Total Funds Matched - 263\n", + "Total Funds Not Matched - 106\n", + "Percentage of Funds Matched - 71.27371273712737\n" ] } ], @@ -114,7 +114,7 @@ "\"\"\"\n", "\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250314032024.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250314113438.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n", "funds_matched = 0\n",