Optimize for benchmark name
This commit is contained in:
parent
bceff71fa4
commit
dd15c1c48e
|
|
@ -322,10 +322,14 @@ class DataExtraction:
|
|||
if "benchmark_name" not in keys:
|
||||
continue
|
||||
benchmark_name = data_item.get("benchmark_name", "")
|
||||
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund"):
|
||||
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
|
||||
benchmark_name.startswith("CPI "):
|
||||
data_item.pop("benchmark_name")
|
||||
elif benchmark_name[0].isalpha() and not benchmark_name[0].isupper():
|
||||
data_item.pop("benchmark_name")
|
||||
elif benchmark_name.lower() in ["benchmark", "composite benchmark",
|
||||
"fund’s composite benchmark", "long term benchmark"]:
|
||||
data_item.pop("benchmark_name")
|
||||
else:
|
||||
pass
|
||||
|
||||
|
|
|
|||
|
|
@ -499,6 +499,8 @@
|
|||
"---Example 4 Start---",
|
||||
"Benchmark returns over 25 years by Traditional asset class \n\nPast performance is not a reliable indicator of future performance.\n\nMarket indices: Australian shares – S&P/ASX300 Accumulation Index, International shares – MSCI World Ex-Australia \nIndex (Unhedged)",
|
||||
"---Example 4 End---",
|
||||
"For this case, after keywords: \"Market indices\", there are multiple benchmark names with format: Fund name – Benchmark name,",
|
||||
"please extract them all from the context contents.",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Australian shares\", \"benchmark_name\": \"S&P/ASX300 Accumulation Index\"}, {\"fund name\": \"International shares\", \"benchmark_name\": \"MSCI World Ex-Australia Index (Unhedged)\"}]}",
|
||||
"---Example 5 Start---",
|
||||
|
|
@ -550,7 +552,9 @@
|
|||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
|
||||
"\n",
|
||||
"C. Example to exclude the benchmark name",
|
||||
"C. Don't extract benchmark name from context when fit below cases.",
|
||||
"1. Exclude benchmark name when its reported name is \"Return target\".",
|
||||
"2. Exclude benchmark name which start with \"CPI minus\" or \"CPI plus\" or \"CPI +\" or \"CPI -\".",
|
||||
"---Example 1 Start---",
|
||||
"A closer look at our sector investment options \n\nCash¹\nDiversified Fixed Interest\nReturn target CPI minus 0.5% per annum on average over 20 years.",
|
||||
"---Example 1 End---",
|
||||
|
|
@ -559,6 +563,16 @@
|
|||
"Because these funds have different objectives: Cash funds focus on capital preservation and liquidity, aligning with short-term interest rates, ",
|
||||
"while Diversified Fixed Interest funds aim to reflect bond market performance, influenced by interest rates and credit risk, not inflation.",
|
||||
"The output should be:",
|
||||
"{\"data\": []}",
|
||||
"---Example 2 Start---",
|
||||
"Infrastructure1 Australian Shares \n\nReturn target \n\nCPI plus 2.0% per annum on average over 20 years. CPI plus 4.0% per annum on average over 20 years. \n\n",
|
||||
"---Example 2 End---",
|
||||
"Explanation:",
|
||||
"The terms \"CPI plus 2.0% per annum\" and \"CPI plus 4.0% per annum\" are return targets, not benchmarks. ",
|
||||
"A benchmark is typically a specific index or standard used to measure the performance of an investment, such as the S&P/ASX 200 for Australian shares. ",
|
||||
"Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ",
|
||||
"They indicate the desired outcome rather than serving as a comparative measure against market performance.",
|
||||
"The output should be:",
|
||||
"{\"data\": []}"
|
||||
]
|
||||
},
|
||||
|
|
|
|||
14
main.py
14
main.py
|
|
@ -1524,31 +1524,19 @@ if __name__ == "__main__":
|
|||
|
||||
# get_aus_prospectus_document_category()
|
||||
|
||||
# special_doc_id_list = ["553242411"]
|
||||
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = True
|
||||
doc_source = "aus_prospectus"
|
||||
# doc_source = "emea_ar"
|
||||
if doc_source == "aus_prospectus":
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
# )
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
||||
)
|
||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
|
||||
# special_doc_id_list = ["420339794"]
|
||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
|
||||
# special_doc_id_list = ["441280757", "454036250"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -44,53 +44,53 @@
|
|||
"All Providers Results: \n",
|
||||
"Document List File - None\n",
|
||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.9395 \t0.9002 \t0.9823 \t0.8858 \t437 \t388 \t0 \t43 \t7 \n",
|
||||
"management_fee \t0.9496 \t0.9188 \t0.9826 \t0.9041 \t437 \t396 \t0 \t35 \t7 \n",
|
||||
"performance_fee_costs \t0.8597 \t0.8755 \t0.8445 \t0.8219 \t298 \t239 \t121 \t34 \t44 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9362 \t0.9429 \t0.9296 \t0.9795 \t72 \t66 \t363 \t4 \t5 \n",
|
||||
"administration_fees \t0.7862 \t0.9828 \t0.6552 \t0.9292 \t87 \t57 \t350 \t1 \t30 \n",
|
||||
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9772 \t82 \t72 \t356 \t10 \t0 \n",
|
||||
"buy_spread \t0.9374 \t0.9208 \t0.9547 \t0.8973 \t375 \t337 \t56 \t29 \t16 \n",
|
||||
"sell_spread \t0.9418 \t0.9290 \t0.9551 \t0.9041 \t375 \t340 \t56 \t26 \t16 \n",
|
||||
"minimum_initial_investment \t0.9518 \t0.9457 \t0.9579 \t0.9315 \t315 \t296 \t112 \t17 \t13 \n",
|
||||
"benchmark_name \t0.8553 \t0.8418 \t0.8693 \t0.8973 \t166 \t133 \t260 \t25 \t20 \n",
|
||||
"TOTAL \t0.9093 \t0.9135 \t0.9131 \t0.9128 \t2644 \t2324 \t1674 \t224 \t158 \n",
|
||||
"Total Funds Matched - 438\n",
|
||||
"Total Funds Not Matched - 127\n",
|
||||
"Percentage of Funds Matched - 77.5221238938053\n",
|
||||
"management_fee_and_costs \t0.9169 \t0.8581 \t0.9843 \t0.8465 \t442 \t375 \t0 \t62 \t6 \n",
|
||||
"management_fee \t0.9351 \t0.8902 \t0.9848 \t0.8781 \t442 \t389 \t0 \t48 \t6 \n",
|
||||
"performance_fee_costs \t0.8653 \t0.8426 \t0.8893 \t0.8194 \t309 \t257 \t106 \t48 \t32 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9412 \t0.8889 \t1.0000 \t0.9797 \t73 \t72 \t362 \t9 \t0 \n",
|
||||
"administration_fees \t0.9811 \t0.9873 \t0.9750 \t0.9932 \t80 \t78 \t362 \t1 \t2 \n",
|
||||
"total_annual_dollar_based_charges \t0.9857 \t0.9718 \t1.0000 \t0.9955 \t69 \t69 \t372 \t2 \t0 \n",
|
||||
"buy_spread \t0.9129 \t0.8879 \t0.9392 \t0.8668 \t363 \t309 \t75 \t39 \t20 \n",
|
||||
"sell_spread \t0.9176 \t0.8966 \t0.9398 \t0.8736 \t363 \t312 \t75 \t36 \t20 \n",
|
||||
"minimum_initial_investment \t0.9532 \t0.9641 \t0.9425 \t0.9345 \t313 \t295 \t119 \t11 \t18 \n",
|
||||
"benchmark_name \t0.8100 \t0.7847 \t0.8370 \t0.8804 \t148 \t113 \t277 \t31 \t22 \n",
|
||||
"TOTAL \t0.9219 \t0.8972 \t0.9492 \t0.9068 \t2602 \t2269 \t1748 \t287 \t126 \n",
|
||||
"Total Funds Matched - 443\n",
|
||||
"Total Funds Not Matched - 122\n",
|
||||
"Percentage of Funds Matched - 78.40707964601769\n",
|
||||
"All Providers Results: \n",
|
||||
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
|
||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.9419 \t0.9059 \t0.9809 \t0.8902 \t172 \t154 \t0 \t16 \t3 \n",
|
||||
"management_fee \t0.9547 \t0.9294 \t0.9814 \t0.9133 \t172 \t158 \t0 \t12 \t3 \n",
|
||||
"performance_fee_costs \t0.8315 \t0.9024 \t0.7708 \t0.8266 \t97 \t74 \t69 \t8 \t22 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9630 \t0.9286 \t1.0000 \t0.9769 \t53 \t52 \t117 \t4 \t0 \n",
|
||||
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t165 \t0 \t0 \n",
|
||||
"buy_spread \t0.9699 \t0.9699 \t0.9699 \t0.9422 \t169 \t161 \t2 \t5 \t5 \n",
|
||||
"sell_spread \t0.9760 \t0.9819 \t0.9702 \t0.9538 \t169 \t163 \t2 \t3 \t5 \n",
|
||||
"minimum_initial_investment \t0.9027 \t0.9062 \t0.8992 \t0.8555 \t135 \t116 \t32 \t12 \t13 \n",
|
||||
"benchmark_name \t0.8333 \t0.8025 \t0.8667 \t0.8497 \t85 \t65 \t82 \t16 \t10 \n",
|
||||
"TOTAL \t0.9303 \t0.9252 \t0.9377 \t0.9120 \t1060 \t951 \t469 \t76 \t219 \n",
|
||||
"Total Funds Matched - 173\n",
|
||||
"Total Funds Not Matched - 23\n",
|
||||
"Percentage of Funds Matched - 88.26530612244898\n",
|
||||
"management_fee_and_costs \t0.9412 \t0.9040 \t0.9816 \t0.8889 \t179 \t160 \t0 \t17 \t3 \n",
|
||||
"management_fee \t0.9744 \t0.9661 \t0.9828 \t0.9500 \t179 \t171 \t0 \t6 \t3 \n",
|
||||
"performance_fee_costs \t0.7876 \t0.8172 \t0.7600 \t0.7722 \t102 \t76 \t63 \t17 \t24 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9286 \t0.8667 \t1.0000 \t0.9556 \t53 \t52 \t120 \t8 \t0 \n",
|
||||
"administration_fees \t0.9231 \t1.0000 \t0.8571 \t0.9889 \t14 \t12 \t166 \t0 \t2 \n",
|
||||
"buy_spread \t0.9217 \t0.9053 \t0.9387 \t0.8556 \t177 \t153 \t1 \t16 \t10 \n",
|
||||
"sell_spread \t0.9281 \t0.9172 \t0.9394 \t0.8667 \t177 \t155 \t1 \t14 \t10 \n",
|
||||
"minimum_initial_investment \t0.9118 \t0.9538 \t0.8732 \t0.8667 \t142 \t124 \t32 \t6 \t18 \n",
|
||||
"benchmark_name \t0.8280 \t0.8333 \t0.8228 \t0.8500 \t87 \t65 \t88 \t13 \t14 \n",
|
||||
"TOTAL \t0.9049 \t0.9071 \t0.9062 \t0.8883 \t1110 \t968 \t471 \t97 \t210 \n",
|
||||
"Total Funds Matched - 180\n",
|
||||
"Total Funds Not Matched - 16\n",
|
||||
"Percentage of Funds Matched - 91.83673469387756\n",
|
||||
"All Providers Results: \n",
|
||||
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
|
||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||
"management_fee_and_costs \t0.9379 \t0.8966 \t0.9832 \t0.8830 \t265 \t234 \t0 \t27 \t4 \n",
|
||||
"management_fee \t0.9463 \t0.9119 \t0.9835 \t0.8981 \t265 \t238 \t0 \t23 \t4 \n",
|
||||
"performance_fee_costs \t0.8730 \t0.8639 \t0.8824 \t0.8189 \t201 \t165 \t52 \t26 \t22 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.8485 \t1.0000 \t0.7368 \t0.9811 \t19 \t14 \t246 \t0 \t5 \n",
|
||||
"administration_fees \t0.7597 \t0.9800 \t0.6203 \t0.8830 \t79 \t49 \t185 \t1 \t30 \n",
|
||||
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9623 \t82 \t72 \t183 \t10 \t0 \n",
|
||||
"buy_spread \t0.9096 \t0.8800 \t0.9412 \t0.8679 \t206 \t176 \t54 \t24 \t11 \n",
|
||||
"sell_spread \t0.9124 \t0.8850 \t0.9415 \t0.8717 \t206 \t177 \t54 \t23 \t11 \n",
|
||||
"minimum_initial_investment \t0.9863 \t0.9730 \t1.0000 \t0.9811 \t180 \t180 \t80 \t5 \t0 \n",
|
||||
"benchmark_name \t0.8774 \t0.8831 \t0.8718 \t0.9283 \t81 \t68 \t178 \t9 \t10 \n",
|
||||
"TOTAL \t0.8986 \t0.9151 \t0.8961 \t0.9075 \t1584 \t1373 \t1032 \t148 \t316 \n",
|
||||
"Total Funds Matched - 265\n",
|
||||
"Total Funds Not Matched - 104\n",
|
||||
"Percentage of Funds Matched - 71.81571815718158\n"
|
||||
"management_fee_and_costs \t0.8996 \t0.8269 \t0.9862 \t0.8175 \t263 \t215 \t0 \t45 \t3 \n",
|
||||
"management_fee \t0.9064 \t0.8385 \t0.9864 \t0.8289 \t263 \t218 \t0 \t42 \t3 \n",
|
||||
"performance_fee_costs \t0.9027 \t0.8538 \t0.9577 \t0.8517 \t207 \t181 \t43 \t31 \t8 \n",
|
||||
"interposed_vehicle_performance_fee_cost \t0.9756 \t0.9524 \t1.0000 \t0.9962 \t20 \t20 \t242 \t1 \t0 \n",
|
||||
"administration_fees \t0.9925 \t0.9851 \t1.0000 \t0.9962 \t66 \t66 \t196 \t1 \t0 \n",
|
||||
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t69 \t69 \t194 \t0 \t0 \n",
|
||||
"buy_spread \t0.9043 \t0.8715 \t0.9398 \t0.8745 \t186 \t156 \t74 \t23 \t10 \n",
|
||||
"sell_spread \t0.9075 \t0.8771 \t0.9401 \t0.8783 \t186 \t157 \t74 \t22 \t10 \n",
|
||||
"minimum_initial_investment \t0.9856 \t0.9716 \t1.0000 \t0.9810 \t171 \t171 \t87 \t5 \t0 \n",
|
||||
"benchmark_name \t0.7869 \t0.7273 \t0.8571 \t0.9011 \t61 \t48 \t189 \t18 \t8 \n",
|
||||
"TOTAL \t0.9261 \t0.8904 \t0.9667 \t0.9125 \t1492 \t1301 \t1099 \t188 \t252 \n",
|
||||
"Total Funds Matched - 263\n",
|
||||
"Total Funds Not Matched - 106\n",
|
||||
"Percentage of Funds Matched - 71.27371273712737\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -114,7 +114,7 @@
|
|||
"\"\"\"\n",
|
||||
"\n",
|
||||
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
|
||||
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250314032024.xlsx\"\n",
|
||||
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250314113438.xlsx\"\n",
|
||||
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
|
||||
"\n",
|
||||
"funds_matched = 0\n",
|
||||
|
|
|
|||
Loading…
Reference in New Issue