Optimize for benchmark name

This commit is contained in:
Blade He 2025-03-14 11:51:10 -05:00
parent bceff71fa4
commit dd15c1c48e
4 changed files with 64 additions and 58 deletions

View File

@ -322,10 +322,14 @@ class DataExtraction:
if "benchmark_name" not in keys: if "benchmark_name" not in keys:
continue continue
benchmark_name = data_item.get("benchmark_name", "") benchmark_name = data_item.get("benchmark_name", "")
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund"): if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
benchmark_name.startswith("CPI "):
data_item.pop("benchmark_name") data_item.pop("benchmark_name")
elif benchmark_name[0].isalpha() and not benchmark_name[0].isupper(): elif benchmark_name[0].isalpha() and not benchmark_name[0].isupper():
data_item.pop("benchmark_name") data_item.pop("benchmark_name")
elif benchmark_name.lower() in ["benchmark", "composite benchmark",
"funds composite benchmark", "long term benchmark"]:
data_item.pop("benchmark_name")
else: else:
pass pass

View File

@ -499,6 +499,8 @@
"---Example 4 Start---", "---Example 4 Start---",
"Benchmark returns over 25 years by Traditional asset class \n\nPast performance is not a reliable indicator of future performance.\n\nMarket indices: Australian shares S&P/ASX300 Accumulation Index, International shares MSCI World Ex-Australia \nIndex (Unhedged)", "Benchmark returns over 25 years by Traditional asset class \n\nPast performance is not a reliable indicator of future performance.\n\nMarket indices: Australian shares S&P/ASX300 Accumulation Index, International shares MSCI World Ex-Australia \nIndex (Unhedged)",
"---Example 4 End---", "---Example 4 End---",
"For this case, after keywords: \"Market indices\", there are multiple benchmark names with format: Fund name Benchmark name,",
"please extract them all from the context contents.",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Australian shares\", \"benchmark_name\": \"S&P/ASX300 Accumulation Index\"}, {\"fund name\": \"International shares\", \"benchmark_name\": \"MSCI World Ex-Australia Index (Unhedged)\"}]}", "{\"data\": [{\"fund name\": \"Australian shares\", \"benchmark_name\": \"S&P/ASX300 Accumulation Index\"}, {\"fund name\": \"International shares\", \"benchmark_name\": \"MSCI World Ex-Australia Index (Unhedged)\"}]}",
"---Example 5 Start---", "---Example 5 Start---",
@ -550,7 +552,9 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}", "{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
"\n", "\n",
"C. Example to exclude the benchmark name", "C. Don't extract benchmark name from context when fit below cases.",
"1. Exclude benchmark name when its reported name is \"Return target\".",
"2. Exclude benchmark name which start with \"CPI minus\" or \"CPI plus\" or \"CPI +\" or \"CPI -\".",
"---Example 1 Start---", "---Example 1 Start---",
"A closer look at our sector investment options \n\nCash¹\nDiversified Fixed Interest\nReturn target CPI minus 0.5% per annum on average over 20 years.", "A closer look at our sector investment options \n\nCash¹\nDiversified Fixed Interest\nReturn target CPI minus 0.5% per annum on average over 20 years.",
"---Example 1 End---", "---Example 1 End---",
@ -559,6 +563,16 @@
"Because these funds have different objectives: Cash funds focus on capital preservation and liquidity, aligning with short-term interest rates, ", "Because these funds have different objectives: Cash funds focus on capital preservation and liquidity, aligning with short-term interest rates, ",
"while Diversified Fixed Interest funds aim to reflect bond market performance, influenced by interest rates and credit risk, not inflation.", "while Diversified Fixed Interest funds aim to reflect bond market performance, influenced by interest rates and credit risk, not inflation.",
"The output should be:", "The output should be:",
"{\"data\": []}",
"---Example 2 Start---",
"Infrastructure1 Australian Shares \n\nReturn target \n\nCPI plus 2.0% per annum on average over 20 years. CPI plus 4.0% per annum on average over 20 years. \n\n",
"---Example 2 End---",
"Explanation:",
"The terms \"CPI plus 2.0% per annum\" and \"CPI plus 4.0% per annum\" are return targets, not benchmarks. ",
"A benchmark is typically a specific index or standard used to measure the performance of an investment, such as the S&P/ASX 200 for Australian shares. ",
"Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ",
"They indicate the desired outcome rather than serving as a comparative measure against market performance.",
"The output should be:",
"{\"data\": []}" "{\"data\": []}"
] ]
}, },

14
main.py
View File

@ -1524,31 +1524,19 @@ if __name__ == "__main__":
# get_aus_prospectus_document_category() # get_aus_prospectus_document_category()
# special_doc_id_list = ["553242411"]
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = True force_save_total_data = True
doc_source = "aus_prospectus" doc_source = "aus_prospectus"
# doc_source = "emea_ar" # doc_source = "emea_ar"
if doc_source == "aus_prospectus": if doc_source == "aus_prospectus":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
# )
# document_sample_file = (
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
# )
document_sample_file = ( document_sample_file = (
r"./sample_documents/aus_prospectus_46_documents_sample.txt" r"./sample_documents/aus_prospectus_46_documents_sample.txt"
) )
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# special_doc_id_list = ["420339794"] # special_doc_id_list = ["441280757", "454036250"]
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (

View File

@ -30,7 +30,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -44,53 +44,53 @@
"All Providers Results: \n", "All Providers Results: \n",
"Document List File - None\n", "Document List File - None\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9395 \t0.9002 \t0.9823 \t0.8858 \t437 \t388 \t0 \t43 \t7 \n", "management_fee_and_costs \t0.9169 \t0.8581 \t0.9843 \t0.8465 \t442 \t375 \t0 \t62 \t6 \n",
"management_fee \t0.9496 \t0.9188 \t0.9826 \t0.9041 \t437 \t396 \t0 \t35 \t7 \n", "management_fee \t0.9351 \t0.8902 \t0.9848 \t0.8781 \t442 \t389 \t0 \t48 \t6 \n",
"performance_fee_costs \t0.8597 \t0.8755 \t0.8445 \t0.8219 \t298 \t239 \t121 \t34 \t44 \n", "performance_fee_costs \t0.8653 \t0.8426 \t0.8893 \t0.8194 \t309 \t257 \t106 \t48 \t32 \n",
"interposed_vehicle_performance_fee_cost \t0.9362 \t0.9429 \t0.9296 \t0.9795 \t72 \t66 \t363 \t4 \t5 \n", "interposed_vehicle_performance_fee_cost \t0.9412 \t0.8889 \t1.0000 \t0.9797 \t73 \t72 \t362 \t9 \t0 \n",
"administration_fees \t0.7862 \t0.9828 \t0.6552 \t0.9292 \t87 \t57 \t350 \t1 \t30 \n", "administration_fees \t0.9811 \t0.9873 \t0.9750 \t0.9932 \t80 \t78 \t362 \t1 \t2 \n",
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9772 \t82 \t72 \t356 \t10 \t0 \n", "total_annual_dollar_based_charges \t0.9857 \t0.9718 \t1.0000 \t0.9955 \t69 \t69 \t372 \t2 \t0 \n",
"buy_spread \t0.9374 \t0.9208 \t0.9547 \t0.8973 \t375 \t337 \t56 \t29 \t16 \n", "buy_spread \t0.9129 \t0.8879 \t0.9392 \t0.8668 \t363 \t309 \t75 \t39 \t20 \n",
"sell_spread \t0.9418 \t0.9290 \t0.9551 \t0.9041 \t375 \t340 \t56 \t26 \t16 \n", "sell_spread \t0.9176 \t0.8966 \t0.9398 \t0.8736 \t363 \t312 \t75 \t36 \t20 \n",
"minimum_initial_investment \t0.9518 \t0.9457 \t0.9579 \t0.9315 \t315 \t296 \t112 \t17 \t13 \n", "minimum_initial_investment \t0.9532 \t0.9641 \t0.9425 \t0.9345 \t313 \t295 \t119 \t11 \t18 \n",
"benchmark_name \t0.8553 \t0.8418 \t0.8693 \t0.8973 \t166 \t133 \t260 \t25 \t20 \n", "benchmark_name \t0.8100 \t0.7847 \t0.8370 \t0.8804 \t148 \t113 \t277 \t31 \t22 \n",
"TOTAL \t0.9093 \t0.9135 \t0.9131 \t0.9128 \t2644 \t2324 \t1674 \t224 \t158 \n", "TOTAL \t0.9219 \t0.8972 \t0.9492 \t0.9068 \t2602 \t2269 \t1748 \t287 \t126 \n",
"Total Funds Matched - 438\n", "Total Funds Matched - 443\n",
"Total Funds Not Matched - 127\n", "Total Funds Not Matched - 122\n",
"Percentage of Funds Matched - 77.5221238938053\n", "Percentage of Funds Matched - 78.40707964601769\n",
"All Providers Results: \n", "All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9419 \t0.9059 \t0.9809 \t0.8902 \t172 \t154 \t0 \t16 \t3 \n", "management_fee_and_costs \t0.9412 \t0.9040 \t0.9816 \t0.8889 \t179 \t160 \t0 \t17 \t3 \n",
"management_fee \t0.9547 \t0.9294 \t0.9814 \t0.9133 \t172 \t158 \t0 \t12 \t3 \n", "management_fee \t0.9744 \t0.9661 \t0.9828 \t0.9500 \t179 \t171 \t0 \t6 \t3 \n",
"performance_fee_costs \t0.8315 \t0.9024 \t0.7708 \t0.8266 \t97 \t74 \t69 \t8 \t22 \n", "performance_fee_costs \t0.7876 \t0.8172 \t0.7600 \t0.7722 \t102 \t76 \t63 \t17 \t24 \n",
"interposed_vehicle_performance_fee_cost \t0.9630 \t0.9286 \t1.0000 \t0.9769 \t53 \t52 \t117 \t4 \t0 \n", "interposed_vehicle_performance_fee_cost \t0.9286 \t0.8667 \t1.0000 \t0.9556 \t53 \t52 \t120 \t8 \t0 \n",
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t165 \t0 \t0 \n", "administration_fees \t0.9231 \t1.0000 \t0.8571 \t0.9889 \t14 \t12 \t166 \t0 \t2 \n",
"buy_spread \t0.9699 \t0.9699 \t0.9699 \t0.9422 \t169 \t161 \t2 \t5 \t5 \n", "buy_spread \t0.9217 \t0.9053 \t0.9387 \t0.8556 \t177 \t153 \t1 \t16 \t10 \n",
"sell_spread \t0.9760 \t0.9819 \t0.9702 \t0.9538 \t169 \t163 \t2 \t3 \t5 \n", "sell_spread \t0.9281 \t0.9172 \t0.9394 \t0.8667 \t177 \t155 \t1 \t14 \t10 \n",
"minimum_initial_investment \t0.9027 \t0.9062 \t0.8992 \t0.8555 \t135 \t116 \t32 \t12 \t13 \n", "minimum_initial_investment \t0.9118 \t0.9538 \t0.8732 \t0.8667 \t142 \t124 \t32 \t6 \t18 \n",
"benchmark_name \t0.8333 \t0.8025 \t0.8667 \t0.8497 \t85 \t65 \t82 \t16 \t10 \n", "benchmark_name \t0.8280 \t0.8333 \t0.8228 \t0.8500 \t87 \t65 \t88 \t13 \t14 \n",
"TOTAL \t0.9303 \t0.9252 \t0.9377 \t0.9120 \t1060 \t951 \t469 \t76 \t219 \n", "TOTAL \t0.9049 \t0.9071 \t0.9062 \t0.8883 \t1110 \t968 \t471 \t97 \t210 \n",
"Total Funds Matched - 173\n", "Total Funds Matched - 180\n",
"Total Funds Not Matched - 23\n", "Total Funds Not Matched - 16\n",
"Percentage of Funds Matched - 88.26530612244898\n", "Percentage of Funds Matched - 91.83673469387756\n",
"All Providers Results: \n", "All Providers Results: \n",
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
"management_fee_and_costs \t0.9379 \t0.8966 \t0.9832 \t0.8830 \t265 \t234 \t0 \t27 \t4 \n", "management_fee_and_costs \t0.8996 \t0.8269 \t0.9862 \t0.8175 \t263 \t215 \t0 \t45 \t3 \n",
"management_fee \t0.9463 \t0.9119 \t0.9835 \t0.8981 \t265 \t238 \t0 \t23 \t4 \n", "management_fee \t0.9064 \t0.8385 \t0.9864 \t0.8289 \t263 \t218 \t0 \t42 \t3 \n",
"performance_fee_costs \t0.8730 \t0.8639 \t0.8824 \t0.8189 \t201 \t165 \t52 \t26 \t22 \n", "performance_fee_costs \t0.9027 \t0.8538 \t0.9577 \t0.8517 \t207 \t181 \t43 \t31 \t8 \n",
"interposed_vehicle_performance_fee_cost \t0.8485 \t1.0000 \t0.7368 \t0.9811 \t19 \t14 \t246 \t0 \t5 \n", "interposed_vehicle_performance_fee_cost \t0.9756 \t0.9524 \t1.0000 \t0.9962 \t20 \t20 \t242 \t1 \t0 \n",
"administration_fees \t0.7597 \t0.9800 \t0.6203 \t0.8830 \t79 \t49 \t185 \t1 \t30 \n", "administration_fees \t0.9925 \t0.9851 \t1.0000 \t0.9962 \t66 \t66 \t196 \t1 \t0 \n",
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9623 \t82 \t72 \t183 \t10 \t0 \n", "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t194 \t0 \t0 \n",
"buy_spread \t0.9096 \t0.8800 \t0.9412 \t0.8679 \t206 \t176 \t54 \t24 \t11 \n", "buy_spread \t0.9043 \t0.8715 \t0.9398 \t0.8745 \t186 \t156 \t74 \t23 \t10 \n",
"sell_spread \t0.9124 \t0.8850 \t0.9415 \t0.8717 \t206 \t177 \t54 \t23 \t11 \n", "sell_spread \t0.9075 \t0.8771 \t0.9401 \t0.8783 \t186 \t157 \t74 \t22 \t10 \n",
"minimum_initial_investment \t0.9863 \t0.9730 \t1.0000 \t0.9811 \t180 \t180 \t80 \t5 \t0 \n", "minimum_initial_investment \t0.9856 \t0.9716 \t1.0000 \t0.9810 \t171 \t171 \t87 \t5 \t0 \n",
"benchmark_name \t0.8774 \t0.8831 \t0.8718 \t0.9283 \t81 \t68 \t178 \t9 \t10 \n", "benchmark_name \t0.7869 \t0.7273 \t0.8571 \t0.9011 \t61 \t48 \t189 \t18 \t8 \n",
"TOTAL \t0.8986 \t0.9151 \t0.8961 \t0.9075 \t1584 \t1373 \t1032 \t148 \t316 \n", "TOTAL \t0.9261 \t0.8904 \t0.9667 \t0.9125 \t1492 \t1301 \t1099 \t188 \t252 \n",
"Total Funds Matched - 265\n", "Total Funds Matched - 263\n",
"Total Funds Not Matched - 104\n", "Total Funds Not Matched - 106\n",
"Percentage of Funds Matched - 71.81571815718158\n" "Percentage of Funds Matched - 71.27371273712737\n"
] ]
} }
], ],
@ -114,7 +114,7 @@
"\"\"\"\n", "\"\"\"\n",
"\n", "\n",
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250314032024.xlsx\"\n", "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250314113438.xlsx\"\n",
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
"\n", "\n",
"funds_matched = 0\n", "funds_matched = 0\n",