1. Optimize instructions
Only load the relevant fund name for the investment objective, instead of the full page text of the most recent investment objective. 2. Exclude tables that have only one numeric column: Cost of product
This commit is contained in:
parent
551f754379
commit
f539340d04
|
|
@ -109,6 +109,29 @@ class DataExtraction:
|
||||||
|
|
||||||
return document_category, document_production
|
return document_category, document_production
|
||||||
|
|
||||||
|
def get_objective_fund_name(self, page_text: str) -> str:
    """Return the fund name tied to the last investment objective in *page_text*.

    Only runs when ``self.doc_source`` is ``"aus_prospectus"``. The
    instruction lines are read from ``objective_fund_name_prompts.json``
    inside ``self.instruction_folder``, joined with the page text into a
    single prompt and sent through the ``chat`` helper, whose response is
    expected to be a JSON object carrying a ``fund_name`` key.

    Args:
        page_text: Text of the page that contains the investment objective.

    Returns:
        The extracted fund name, or an empty string when the document
        source does not match, the prompt file is missing or empty, the
        chat call reports an error, or the response cannot be parsed into
        a JSON object with a string ``fund_name``.
    """
    if self.doc_source != "aus_prospectus":
        return ""

    prompts_file = os.path.join(self.instruction_folder, "objective_fund_name_prompts.json")
    if not os.path.exists(prompts_file):
        return ""

    with open(prompts_file, "r", encoding="utf-8") as f:
        instructions = "\n".join(json.load(f).get("prompts", []))
    if not instructions:
        return ""

    # The original literal contained "\I", an invalid escape sequence that
    # left a stray backslash in the prompt; a blank line before the
    # "Instructions" header is assumed to be the intent.
    prompt = f"Context: \n{page_text}\n\nInstructions: \n{instructions}"
    result, with_error = chat(
        prompt=prompt, response_format={"type": "json_object"}, max_tokens=1000
    )
    # NOTE(review): assumes `result` is a dict-like object (or falsy) as
    # implied by the original `result.get("response", "")` -- confirm.
    if with_error or not result:
        return ""

    try:
        data = json.loads(result.get("response", ""))
    except (json.JSONDecodeError, TypeError):
        # Best effort: an unparseable response simply yields no fund name.
        return ""

    # Guard against valid JSON that is not an object (e.g. a list) and
    # against null / non-string values so the annotated return type holds.
    fund_name = data.get("fund_name", "") if isinstance(data, dict) else ""
    return fund_name if isinstance(fund_name, str) else ""
|
||||||
|
|
||||||
|
|
||||||
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
|
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
If document source is aus_propectus and document category is MIS
|
If document source is aus_propectus and document category is MIS
|
||||||
|
|
@ -647,7 +670,7 @@ class DataExtraction:
|
||||||
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
|
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
|
||||||
if exist_complex_rule_keywords and \
|
if exist_complex_rule_keywords and \
|
||||||
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
|
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
|
||||||
mfc["management_fee"] = management_fee
|
mf["management_fee"] = management_fee
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|
@ -766,7 +789,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num not in [74]:
|
# if page_num not in [25]:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
@ -1092,12 +1115,16 @@ class DataExtraction:
|
||||||
diff_pages = [page_num - investment_objective_page for investment_objective_page
|
diff_pages = [page_num - investment_objective_page for investment_objective_page
|
||||||
in self.investment_objective_pages
|
in self.investment_objective_pages
|
||||||
if investment_objective_page <= page_num]
|
if investment_objective_page <= page_num]
|
||||||
if len(diff_pages) > 0 and diff_pages[-1] < 5:
|
if len(diff_pages) > 0 and diff_pages[-1] < 5 and diff_pages[-1] > 0:
|
||||||
top_nearest_investment_objective_page = self.investment_objective_pages[len(diff_pages) - 1]
|
top_nearest_investment_objective_page = self.investment_objective_pages[len(diff_pages) - 1]
|
||||||
top_nearest_investment_objective_text = self.page_text_dict.get(top_nearest_investment_objective_page, "")
|
top_nearest_investment_objective_text = self.page_text_dict.get(top_nearest_investment_objective_page, "")
|
||||||
if top_nearest_investment_objective_text in page_text:
|
|
||||||
|
if top_nearest_investment_objective_text in page_text and \
|
||||||
|
top_nearest_investment_objective_text != page_text:
|
||||||
page_text = page_text.replace(top_nearest_investment_objective_text, "").strip()
|
page_text = page_text.replace(top_nearest_investment_objective_text, "").strip()
|
||||||
pre_context = f"\nThe most recent investment objective page text which maybe with fund name is: \n{top_nearest_investment_objective_text}.\n"
|
pre_context_fund_name = self.get_objective_fund_name(top_nearest_investment_objective_text)
|
||||||
|
if pre_context_fund_name is not None and len(pre_context_fund_name) > 0:
|
||||||
|
pre_context = f"\nThe fund name for most recent investment objective page text is: \n{pre_context_fund_name}.\n"
|
||||||
# If can't find previous investment objective text, add the fund names to be the pre-fix of page text
|
# If can't find previous investment objective text, add the fund names to be the pre-fix of page text
|
||||||
page_text = f"{pre_context}\n{page_text}".strip()
|
page_text = f"{pre_context}\n{page_text}".strip()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -375,7 +375,7 @@
|
||||||
"buy_spread": [
|
"buy_spread": [
|
||||||
"A. Exclude reported name",
|
"A. Exclude reported name",
|
||||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||||
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), ",
|
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ",
|
||||||
"Estimated transaction costs offset by buy/sell spreads (% pa), ",
|
"Estimated transaction costs offset by buy/sell spreads (% pa), ",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
|
"Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
|
||||||
|
|
@ -388,6 +388,12 @@
|
||||||
"---Example 2 End---",
|
"---Example 2 End---",
|
||||||
"The data is about Transaction costs, should be excluded, the output for buy_spread and sell_spread should be:",
|
"The data is about Transaction costs, should be excluded, the output for buy_spread and sell_spread should be:",
|
||||||
"{\"data\": []}",
|
"{\"data\": []}",
|
||||||
|
"\n",
|
||||||
|
"---Example 3 Start---",
|
||||||
|
"Fund name \nCost of product \nCFS Index Australian Bond Fund \n$155 \n",
|
||||||
|
"---Example 3 End---",
|
||||||
|
"The data is about Cost of product, should be excluded, the output for buy_spread and sell_spread should be:",
|
||||||
|
"{\"data\": []}",
|
||||||
"B. Simple case with simple table structure:",
|
"B. Simple case with simple table structure:",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
|
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
|
||||||
|
|
@ -416,12 +422,18 @@
|
||||||
"performance_fee_costs": [
|
"performance_fee_costs": [
|
||||||
"Performance fees is share class level data.",
|
"Performance fees is share class level data.",
|
||||||
"A. If the performance fees is with the range, please ignore and output empty.",
|
"A. If the performance fees is with the range, please ignore and output empty.",
|
||||||
"---Example 1 Start---",
|
"---Example Start---",
|
||||||
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
|
"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
|
||||||
"---Example 1 End---",
|
"---Example End---",
|
||||||
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
|
"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
|
||||||
"{\"data\": []}",
|
"{\"data\": []}",
|
||||||
"B. If with pure performance fee in table, please extract relevant values",
|
"B. If the table is only about Cost of product, should be excluded, ",
|
||||||
|
"---Example Start---",
|
||||||
|
"Fund name \nCost of product \nCFS Index Australian Bond Fund \n$155 \n",
|
||||||
|
"---Example End---",
|
||||||
|
"The data is about Cost of product, should be excluded, the output for Performance fees should be:",
|
||||||
|
"{\"data\": []}",
|
||||||
|
"C. If with pure performance fee in table, please extract relevant values",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
|
"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
|
|
@ -429,8 +441,7 @@
|
||||||
"b. This example mentioned share classes, please output according to share class.",
|
"b. This example mentioned share classes, please output according to share class.",
|
||||||
"The output should be",
|
"The output should be",
|
||||||
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}",
|
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}",
|
||||||
"C. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0"
|
"D. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0"
|
||||||
|
|
||||||
],
|
],
|
||||||
"minimum_initial_investment": [
|
"minimum_initial_investment": [
|
||||||
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
|
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
{
|
||||||
|
"prompts": [
|
||||||
|
"Get the fund name from document context. \n",
|
||||||
|
"The document context contains fund investment objective(s).\n",
|
||||||
|
"1. Please locate the last investment objective in the document context.\n",
|
||||||
|
"2. Please provide the relevant fund name for the last investment objective.\n",
|
||||||
|
"3. Usually, the fund name can be found in the several upon lines of the last investment objective.\n",
|
||||||
|
"----Example context start----",
|
||||||
|
"\n\nMLC Horizon 4 Balanced Portfolio \n\nThis option invests in a wide range of asset classes with a strong bias towards shares and other growth assets. It ’ s designed for members who \nare focused on higher returns and are willing to take on exposure to more volatile investments. \n\nMLC Horizon 4 Balanced Portfolio \nInvestment objective \nAims to grow by more than inflation +3% pa (after fees and tax) over 10 years. \nBenchmark \nInflation is measured by the Consumer Price Index, calculated by the Australian Bureau of Statistics. \nThe investment option may be \nsuited to you if... \nyou want your investment to exceed changes in the costs of living, over the long term \nyou want a higher emphasis on growth than stability \nyou understand returns may be higher or lower than its objective, and \nyou value active management. \n\n3 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement",
|
||||||
|
"----Example context end----",
|
||||||
|
"The output should be as JSON format:",
|
||||||
|
"{\"fund_name\": \"MLC Horizon 4 Balanced Portfolio\"}\n",
|
||||||
|
"Answer:\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
21
main.py
21
main.py
|
|
@ -1452,7 +1452,7 @@ def get_aus_prospectus_document_category():
|
||||||
|
|
||||||
|
|
||||||
def test_post_adjust_extract_data():
|
def test_post_adjust_extract_data():
|
||||||
doc_id = "539266814"
|
doc_id = "397107472"
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
@ -1526,8 +1526,8 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
|
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = False
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
doc_source = "aus_prospectus"
|
doc_source = "aus_prospectus"
|
||||||
# doc_source = "emea_ar"
|
# doc_source = "emea_ar"
|
||||||
|
|
@ -1547,20 +1547,7 @@ if __name__ == "__main__":
|
||||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||||
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
|
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
|
||||||
# special_doc_id_list: list = ["410899007", "539266880", "539266817",
|
# special_doc_id_list = ["420339794"]
|
||||||
# "539261734", "539266893"]
|
|
||||||
# special_doc_id_list: list = ["530101994",
|
|
||||||
# "539241700",
|
|
||||||
# "539261734",
|
|
||||||
# "539266814",
|
|
||||||
# "539266817",
|
|
||||||
# "539266874",
|
|
||||||
# "539266880",
|
|
||||||
# "539266893",
|
|
||||||
# "544886057",
|
|
||||||
# "550769189",
|
|
||||||
# "553449663"]
|
|
||||||
# special_doc_id_list = ["521606755"]
|
|
||||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
|
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 28,
|
"execution_count": 33,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
|
@ -44,53 +44,53 @@
|
||||||
"All Providers Results: \n",
|
"All Providers Results: \n",
|
||||||
"Document List File - None\n",
|
"Document List File - None\n",
|
||||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||||
"management_fee_and_costs \t0.9463 \t0.9087 \t0.9873 \t0.8986 \t431 \t388 \t2 \t39 \t5 \n",
|
"management_fee_and_costs \t0.9395 \t0.9002 \t0.9823 \t0.8858 \t437 \t388 \t0 \t43 \t7 \n",
|
||||||
"management_fee \t0.9502 \t0.9157 \t0.9874 \t0.9055 \t431 \t391 \t2 \t36 \t5 \n",
|
"management_fee \t0.9496 \t0.9188 \t0.9826 \t0.9041 \t437 \t396 \t0 \t35 \t7 \n",
|
||||||
"performance_fee_costs \t0.8614 \t0.8473 \t0.8759 \t0.8272 \t281 \t233 \t126 \t42 \t33 \n",
|
"performance_fee_costs \t0.8597 \t0.8755 \t0.8445 \t0.8219 \t298 \t239 \t121 \t34 \t44 \n",
|
||||||
"interposed_vehicle_performance_fee_cost \t0.9726 \t0.9467 \t1.0000 \t0.9908 \t72 \t71 \t359 \t4 \t0 \n",
|
"interposed_vehicle_performance_fee_cost \t0.9362 \t0.9429 \t0.9296 \t0.9795 \t72 \t66 \t363 \t4 \t5 \n",
|
||||||
"administration_fees \t0.9935 \t0.9872 \t1.0000 \t0.9977 \t77 \t77 \t356 \t1 \t0 \n",
|
"administration_fees \t0.7862 \t0.9828 \t0.6552 \t0.9292 \t87 \t57 \t350 \t1 \t30 \n",
|
||||||
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t72 \t72 \t362 \t0 \t0 \n",
|
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9772 \t82 \t72 \t356 \t10 \t0 \n",
|
||||||
"buy_spread \t0.9322 \t0.9066 \t0.9593 \t0.8894 \t370 \t330 \t56 \t34 \t14 \n",
|
"buy_spread \t0.9374 \t0.9208 \t0.9547 \t0.8973 \t375 \t337 \t56 \t29 \t16 \n",
|
||||||
"sell_spread \t0.9352 \t0.9121 \t0.9595 \t0.8940 \t370 \t332 \t56 \t32 \t14 \n",
|
"sell_spread \t0.9418 \t0.9290 \t0.9551 \t0.9041 \t375 \t340 \t56 \t26 \t16 \n",
|
||||||
"minimum_initial_investment \t0.9577 \t0.9474 \t0.9684 \t0.9378 \t322 \t306 \t101 \t17 \t10 \n",
|
"minimum_initial_investment \t0.9518 \t0.9457 \t0.9579 \t0.9315 \t315 \t296 \t112 \t17 \t13 \n",
|
||||||
"benchmark_name \t0.8067 \t0.7562 \t0.8643 \t0.8664 \t154 \t121 \t255 \t39 \t19 \n",
|
"benchmark_name \t0.8553 \t0.8418 \t0.8693 \t0.8973 \t166 \t133 \t260 \t25 \t20 \n",
|
||||||
"TOTAL \t0.9356 \t0.9128 \t0.9602 \t0.9207 \t2580 \t2321 \t1675 \t244 \t100 \n",
|
"TOTAL \t0.9093 \t0.9135 \t0.9131 \t0.9128 \t2644 \t2324 \t1674 \t224 \t158 \n",
|
||||||
"Total Funds Matched - 434\n",
|
"Total Funds Matched - 438\n",
|
||||||
"Total Funds Not Matched - 131\n",
|
"Total Funds Not Matched - 127\n",
|
||||||
"Percentage of Funds Matched - 76.8141592920354\n",
|
"Percentage of Funds Matched - 77.5221238938053\n",
|
||||||
"All Providers Results: \n",
|
"All Providers Results: \n",
|
||||||
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
|
"Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n",
|
||||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||||
"management_fee_and_costs \t0.9499 \t0.9096 \t0.9938 \t0.9045 \t177 \t161 \t0 \t16 \t1 \n",
|
"management_fee_and_costs \t0.9419 \t0.9059 \t0.9809 \t0.8902 \t172 \t154 \t0 \t16 \t3 \n",
|
||||||
"management_fee \t0.9529 \t0.9153 \t0.9939 \t0.9101 \t177 \t162 \t0 \t15 \t1 \n",
|
"management_fee \t0.9547 \t0.9294 \t0.9814 \t0.9133 \t172 \t158 \t0 \t12 \t3 \n",
|
||||||
"performance_fee_costs \t0.8197 \t0.7979 \t0.8427 \t0.8146 \t91 \t75 \t70 \t19 \t14 \n",
|
"performance_fee_costs \t0.8315 \t0.9024 \t0.7708 \t0.8266 \t97 \t74 \t69 \t8 \t22 \n",
|
||||||
"interposed_vehicle_performance_fee_cost \t0.9811 \t0.9630 \t1.0000 \t0.9888 \t53 \t52 \t124 \t2 \t0 \n",
|
"interposed_vehicle_performance_fee_cost \t0.9630 \t0.9286 \t1.0000 \t0.9769 \t53 \t52 \t117 \t4 \t0 \n",
|
||||||
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t170 \t0 \t0 \n",
|
"administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t165 \t0 \t0 \n",
|
||||||
"buy_spread \t0.9738 \t0.9653 \t0.9824 \t0.9494 \t174 \t167 \t2 \t6 \t3 \n",
|
"buy_spread \t0.9699 \t0.9699 \t0.9699 \t0.9422 \t169 \t161 \t2 \t5 \t5 \n",
|
||||||
"sell_spread \t0.9767 \t0.9711 \t0.9825 \t0.9551 \t174 \t168 \t2 \t5 \t3 \n",
|
"sell_spread \t0.9760 \t0.9819 \t0.9702 \t0.9538 \t169 \t163 \t2 \t3 \t5 \n",
|
||||||
"minimum_initial_investment \t0.9185 \t0.9118 \t0.9254 \t0.8764 \t140 \t124 \t32 \t12 \t10 \n",
|
"minimum_initial_investment \t0.9027 \t0.9062 \t0.8992 \t0.8555 \t135 \t116 \t32 \t12 \t13 \n",
|
||||||
"benchmark_name \t0.8121 \t0.7528 \t0.8816 \t0.8258 \t86 \t67 \t80 \t22 \t9 \n",
|
"benchmark_name \t0.8333 \t0.8025 \t0.8667 \t0.8497 \t85 \t65 \t82 \t16 \t10 \n",
|
||||||
"TOTAL \t0.9316 \t0.9096 \t0.9558 \t0.9139 \t1080 \t984 \t480 \t97 \t141 \n",
|
"TOTAL \t0.9303 \t0.9252 \t0.9377 \t0.9120 \t1060 \t951 \t469 \t76 \t219 \n",
|
||||||
"Total Funds Matched - 178\n",
|
"Total Funds Matched - 173\n",
|
||||||
"Total Funds Not Matched - 18\n",
|
"Total Funds Not Matched - 23\n",
|
||||||
"Percentage of Funds Matched - 90.81632653061224\n",
|
"Percentage of Funds Matched - 88.26530612244898\n",
|
||||||
"All Providers Results: \n",
|
"All Providers Results: \n",
|
||||||
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
|
"Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n",
|
||||||
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
"Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n",
|
||||||
"management_fee_and_costs \t0.9439 \t0.9080 \t0.9827 \t0.8945 \t254 \t227 \t2 \t23 \t4 \n",
|
"management_fee_and_costs \t0.9379 \t0.8966 \t0.9832 \t0.8830 \t265 \t234 \t0 \t27 \t4 \n",
|
||||||
"management_fee \t0.9482 \t0.9160 \t0.9828 \t0.9023 \t254 \t229 \t2 \t21 \t4 \n",
|
"management_fee \t0.9463 \t0.9119 \t0.9835 \t0.8981 \t265 \t238 \t0 \t23 \t4 \n",
|
||||||
"performance_fee_costs \t0.8827 \t0.8729 \t0.8927 \t0.8359 \t190 \t158 \t56 \t23 \t19 \n",
|
"performance_fee_costs \t0.8730 \t0.8639 \t0.8824 \t0.8189 \t201 \t165 \t52 \t26 \t22 \n",
|
||||||
"interposed_vehicle_performance_fee_cost \t0.9500 \t0.9048 \t1.0000 \t0.9922 \t19 \t19 \t235 \t2 \t0 \n",
|
"interposed_vehicle_performance_fee_cost \t0.8485 \t1.0000 \t0.7368 \t0.9811 \t19 \t14 \t246 \t0 \t5 \n",
|
||||||
"administration_fees \t0.9928 \t0.9857 \t1.0000 \t0.9961 \t69 \t69 \t186 \t1 \t0 \n",
|
"administration_fees \t0.7597 \t0.9800 \t0.6203 \t0.8830 \t79 \t49 \t185 \t1 \t30 \n",
|
||||||
"total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t72 \t72 \t184 \t0 \t0 \n",
|
"total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9623 \t82 \t72 \t183 \t10 \t0 \n",
|
||||||
"buy_spread \t0.8932 \t0.8534 \t0.9368 \t0.8477 \t196 \t163 \t54 \t28 \t11 \n",
|
"buy_spread \t0.9096 \t0.8800 \t0.9412 \t0.8679 \t206 \t176 \t54 \t24 \t11 \n",
|
||||||
"sell_spread \t0.8962 \t0.8586 \t0.9371 \t0.8516 \t196 \t164 \t54 \t27 \t11 \n",
|
"sell_spread \t0.9124 \t0.8850 \t0.9415 \t0.8717 \t206 \t177 \t54 \t23 \t11 \n",
|
||||||
"minimum_initial_investment \t0.9864 \t0.9733 \t1.0000 \t0.9805 \t182 \t182 \t69 \t5 \t0 \n",
|
"minimum_initial_investment \t0.9863 \t0.9730 \t1.0000 \t0.9811 \t180 \t180 \t80 \t5 \t0 \n",
|
||||||
"benchmark_name \t0.8000 \t0.7606 \t0.8438 \t0.8945 \t68 \t54 \t175 \t17 \t10 \n",
|
"benchmark_name \t0.8774 \t0.8831 \t0.8718 \t0.9283 \t81 \t68 \t178 \t9 \t10 \n",
|
||||||
"TOTAL \t0.9293 \t0.9033 \t0.9576 \t0.9195 \t1500 \t1337 \t1017 \t147 \t200 \n",
|
"TOTAL \t0.8986 \t0.9151 \t0.8961 \t0.9075 \t1584 \t1373 \t1032 \t148 \t316 \n",
|
||||||
"Total Funds Matched - 256\n",
|
"Total Funds Matched - 265\n",
|
||||||
"Total Funds Not Matched - 113\n",
|
"Total Funds Not Matched - 104\n",
|
||||||
"Percentage of Funds Matched - 69.37669376693766\n"
|
"Percentage of Funds Matched - 71.81571815718158\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -114,7 +114,7 @@
|
||||||
"\"\"\"\n",
|
"\"\"\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
|
"path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n",
|
||||||
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313153441.xlsx\"\n",
|
"path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313224747.xlsx\"\n",
|
||||||
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
|
"provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"funds_matched = 0\n",
|
"funds_matched = 0\n",
|
||||||
|
|
@ -198,36 +198,41 @@
|
||||||
" for i in intersection_list:\n",
|
" for i in intersection_list:\n",
|
||||||
" for keys in imp_datapoints:\n",
|
" for keys in imp_datapoints:\n",
|
||||||
" if i == imp_datapoints_mapping[keys]:\n",
|
" if i == imp_datapoints_mapping[keys]:\n",
|
||||||
|
" truth = str(truth_values[i]).strip()\n",
|
||||||
|
" generated = str(generated_values[i]).strip()\n",
|
||||||
" total = total +1\n",
|
" total = total +1\n",
|
||||||
" if truth_values[i] == \"\":\n",
|
" if truth == \"\":\n",
|
||||||
" if truth_values[i] == generated_values[i]:\n",
|
" if truth == generated:\n",
|
||||||
" results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
|
" results[i][\"TN\"] = results[i][\"TN\"] + 1\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
|
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
|
||||||
" # if \"Performance fee and cost\" in keys:\n",
|
" # if \"Performance fee and cost\" in keys:\n",
|
||||||
" debug = 0\n",
|
" debug = 0\n",
|
||||||
" # print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], sec_name) \n",
|
" # print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], sec_name) \n",
|
||||||
" message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is null and generated is not null\"}\n",
|
" message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n",
|
||||||
|
" \"truth\": truth, \"generated\": generated, \"error\": \"Truth is null and generated is not null\"}\n",
|
||||||
" message_list.append(message) \n",
|
" message_list.append(message) \n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" if truth_values[i] == generated_values[i]:\n",
|
" if truth == generated:\n",
|
||||||
" results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
|
" results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
|
||||||
" elif generated_values[i] != \"\":\n",
|
" elif generated != \"\":\n",
|
||||||
" if i == \"benchmark_name\" and compare_text(truth_values[i], generated_values[i]):\n",
|
" if i == \"benchmark_name\" and compare_text(truth, generated):\n",
|
||||||
" results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
|
" results[i][\"TP\"] = results[i][\"TP\"] + 1\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
|
" results[i][\"FP\"] = results[i][\"FP\"] + 1\n",
|
||||||
" # if \"Performance fee and cost\" in keys:\n",
|
" # if \"Performance fee and cost\" in keys:\n",
|
||||||
" debug = 0\n",
|
" debug = 0\n",
|
||||||
" # print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", sec_name)\n",
|
" # print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", sec_name)\n",
|
||||||
" message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is not equal with generated\"}\n",
|
" message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n",
|
||||||
|
" \"truth\": truth, \"generated\": generated, \"error\": \"Truth is not equal with generated\"}\n",
|
||||||
" message_list.append(message)\n",
|
" message_list.append(message)\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
|
" results[i][\"FN\"] = results[i][\"FN\"] + 1\n",
|
||||||
" # if \"Performance fee and cost\" in keys:\n",
|
" # if \"Performance fee and cost\" in keys:\n",
|
||||||
" debug = 0\n",
|
" debug = 0\n",
|
||||||
" # print(keys, \" - \" , doc_id, \" generated is null and truth is - \", truth_values[i], sec_name)\n",
|
" # print(keys, \" - \" , doc_id, \" generated is null and truth is - \", truth_values[i], sec_name)\n",
|
||||||
" message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Generated is null and truth is not null\"}\n",
|
" message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n",
|
||||||
|
" \"truth\": truth, \"generated\": generated, \"error\": \"Generated is null and truth is not null\"}\n",
|
||||||
" message_list.append(message)\n",
|
" message_list.append(message)\n",
|
||||||
" results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
|
" results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n",
|
||||||
" funds_matched += 1\n",
|
" funds_matched += 1\n",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue