align fund name based on production name
optimize performance relevant prompts
This commit is contained in:
parent
6f17c2253c
commit
c2c0b33015
|
|
@ -1295,6 +1295,21 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
|||
gt_list.append(0)
|
||||
else:
|
||||
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||
if data_point not in ["benchmark_name"]:
|
||||
try:
|
||||
pred_num = float(pred_value)
|
||||
# round to 4 decimal places
|
||||
pred_value = round(pred_num, 4)
|
||||
if pred_value == 0:
|
||||
gt_list.append(1)
|
||||
pred_list.append(1)
|
||||
else:
|
||||
gt_list.append(0)
|
||||
pred_list.append(1)
|
||||
message["error"] = "gt_value is empty, but pred_value is not empty"
|
||||
except Exception as e:
|
||||
pass
|
||||
else:
|
||||
gt_list.append(0)
|
||||
pred_list.append(1)
|
||||
message["error"] = "gt_value is empty, but pred_value is not empty"
|
||||
|
|
@ -1305,8 +1320,8 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
|||
|
||||
|
||||
def is_equal(gt_value, pred_value, data_point: str = ""):
|
||||
if gt_value is not None and len(str(gt_value).strip()) > 0 and \
|
||||
pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
||||
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||
if gt_value == "0.0":
|
||||
gt_value = "0"
|
||||
if pred_value == "0.0":
|
||||
|
|
@ -1334,6 +1349,16 @@ def is_equal(gt_value, pred_value, data_point: str = ""):
|
|||
jacard_score = similarity.jaccard_similarity(gt_value.lower().split(), pred_value.lower().split())
|
||||
if jacard_score > 0.8:
|
||||
return True
|
||||
else:
|
||||
if data_point not in ["benchmark_name"]:
|
||||
try:
|
||||
gt_num = float(gt_value)
|
||||
# round to 4 decimal places
|
||||
gt_value = round(gt_num, 4)
|
||||
except Exception as e:
|
||||
pass
|
||||
if gt_value == 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -36,6 +36,12 @@
|
|||
["\\nFund\\s*name\\s*Management\\s*fee\\s*Indirect\\s*costs\\s*Recoverable\\s*expenses[\\s\\S]*?performance.*\\s*fee\\s*Estimated\\s*other\\s*indirect\\s*costs\\s*\\n"],
|
||||
"replace_text": "\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \n",
|
||||
"comments": ["item 0: document 391080133, page 21"]
|
||||
},
|
||||
{
|
||||
"regex_all_list":
|
||||
["The\\s*investment\\s*fees\\s*and\\s*costs[\\s\\S]*?Performance\\s*fee\\s*Plus\\s*other\\s*investment\\s*fees\\s*and\\s*costs\\s*Equals\\s*investment\\s*fees\\s*and\\s*costs\\s*Transaction\\s*costs[\\s\\S]*?Buy\\-sell\\s*spreads\\s*Transaction[\\s\\S]*?Entry[\\s\\S]*pa\\s*\\n"],
|
||||
"replace_text": "Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross)\n",
|
||||
"comments": ["item 0: document 420339794, page 74"]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -263,6 +263,7 @@ class DataExtraction:
|
|||
data_dict["total_token"] = result.get("total_token", 0)
|
||||
"""
|
||||
data_list = self.supplement_ttr_pension(data_list)
|
||||
data_list = self.align_fund_share_name(data_list)
|
||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
||||
data_list = self.remove_duplicate_data(data_list)
|
||||
|
|
@ -272,6 +273,41 @@ class DataExtraction:
|
|||
data_list = self.check_administration_fees(data_list)
|
||||
return data_list
|
||||
|
||||
def align_fund_share_name(self, data_list: list):
    """
    Align short fund names with their longer, production-prefixed variants.

    Some extracted items carry a full fund name (e.g. "<production> <fund>")
    while others carry only the trailing "<fund>" part. When a longer fund
    name found anywhere in ``data_list`` ends with an item's fund name, and
    the leading remainder (stripped) is contained in
    ``self.document_production``, the item's ``fund_name`` is replaced by
    the longer name. If the item's ``share_name`` was equal to the old fund
    name, it is replaced as well.

    Args:
        data_list: list of dicts; each may hold ``extract_data`` -> ``data``,
            a list of item dicts with ``fund_name`` / ``share_name`` keys.

    Returns:
        The same ``data_list`` object, with items updated in place.

    NOTE(review): ``self.document_production`` is only used with truthiness
    and ``in``; presumably it is the production name string (substring
    match) -- confirm against where it is assigned.
    """
    if not self.document_production:
        return data_list

    # Unique, non-empty fund names in first-seen order. A dict keeps
    # insertion order while giving O(1) de-duplication.
    known_fund_names = {}
    for data_dict in data_list:
        extract_data = data_dict.get("extract_data") or {}
        for data_item in extract_data.get("data") or []:
            fund_name = data_item.get("fund_name")
            if isinstance(fund_name, str) and fund_name:
                known_fund_names.setdefault(fund_name, None)

    for data_dict in data_list:
        extract_data = data_dict.get("extract_data") or {}
        for data_item in extract_data.get("data") or []:
            fund_name = data_item.get("fund_name")
            share_name = data_item.get("share_name", "")
            # Skip missing / None / non-string names instead of crashing.
            if not isinstance(fund_name, str) or not fund_name:
                continue
            for c_fund_name in known_fund_names:
                # Only strictly longer names ending with fund_name qualify
                # (this also excludes the name itself).
                if len(fund_name) >= len(c_fund_name):
                    continue
                if not c_fund_name.endswith(fund_name):
                    continue
                # Cut off exactly the matched suffix. str.replace would
                # remove every occurrence of fund_name and corrupt the
                # prefix when the short name also appears earlier.
                prefix = c_fund_name[: -len(fund_name)].strip()
                # An empty prefix would match any production string
                # ("" in s is always True), so require real content.
                if prefix and prefix in self.document_production:
                    data_item["fund_name"] = c_fund_name
                    if share_name == fund_name:
                        data_item["share_name"] = c_fund_name
                    # First matching candidate wins.
                    break
    return data_list
|
||||
|
||||
def supplement_ttr_pension(self, data_list: list):
|
||||
"""
|
||||
If with fund name ends with "TTR" and "Pension", and exist same fund name without "TTR" or "Pension",
|
||||
|
|
@ -688,7 +724,7 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
# if page_num not in [40, 44]:
|
||||
# if page_num not in [74]:
|
||||
# continue
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -240,29 +240,23 @@
|
|||
"\n",
|
||||
"H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
|
||||
"---Example 1 Start---",
|
||||
"Fees and costs for \nyour investment options \n\nAdministration fees and costs apply in addition to the fees and costs shown in this table. Please refer to the PDS and Fee Brochure for \nfurther information about fees and costs, including how the figures shown below are calculated. \n\nThe investment fees and \ncosts are made up of \nPerformance \nfee \nPlus \nother \ninvestment \nfees and \ncosts \nEquals \ninvestment \nfees and \ncosts \nTransaction \ncosts (net) \nBuy-sell \nspreads \nTransaction \ncosts \n(gross) 1 \n% pa \n% pa \n% pa \nEntry %/ \nExit % \n% pa \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \n",
|
||||
"---Example 1 End---",
|
||||
"The column: \"Equals investment fees and costs\" is the sum of \"Performance fee\" and \"Plus other investment fees and costs\", we should ignore the \"Performance fee\" value, just output the \"Plus other investment fees and costs\" value.",
|
||||
"The \"Plus other investment fees and costs\" could be the values for both of \"management fee\" and \"management fee and costs\", so the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"performance_fee_costs\": 0.18, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
|
||||
"---Example 2 Start---",
|
||||
"MANAGEMENT COSTS AND TRANSACTION COSTS \n\nOption name Management costs \nEstimated \nperformance \nfee (pa) 1 \nTotal management\ncosts (including\nestimated performance\nfee) pa\nTransaction costs \nper transaction (%) \nMULTI-MANAGER MULTI-SECTOR (These investment options are located in the ‘Investment Options Menu’ on pages 18 to 19.) \nFirstChoice Wholesale Defensive 0.85% 0.85% 0.15\nFirstChoice Wholesale Conservative 0.90% 0.02%1 0.92% 1 0.15 \n",
|
||||
"---Example 2 End---",
|
||||
"---Example 1 End---",
|
||||
"The column: \"Total management costs (including estimated performance fee) pa\" is the sum of \"Management costs\" and \"Estimated performance fee (pa)\", we should ignore the \"Estimated performance fee (pa)\" value, just output the \"Management costs\" value.",
|
||||
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}",
|
||||
"---Example 3 Start---",
|
||||
"---Example 2 Start---",
|
||||
"Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n",
|
||||
"---Example 3 End---",
|
||||
"---Example 2 End---",
|
||||
"The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.",
|
||||
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\".",
|
||||
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.",
|
||||
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.",
|
||||
"So the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}",
|
||||
"---Example 4 Start---",
|
||||
"---Example 3 Start---",
|
||||
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
|
||||
"---Example 4 End---",
|
||||
"---Example 3 End---",
|
||||
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
|
||||
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
|
||||
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
|
||||
|
|
@ -550,6 +544,24 @@
|
|||
"The output should be: ",
|
||||
"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"keywords":["Plus other investment fees and costs \nEquals investment fees and costs"],
|
||||
"prompts": [
|
||||
"Complex management fee and costs rule:",
|
||||
"If the table with columns:",
|
||||
"\"Performance fee\", \"Plus other investment fees and costs\", \"Equals investment fees and costs\", \"Transaction costs(net)\", \"Buy-sell spreads\", \"Transaction costs(gross)\".",
|
||||
"Both of the management_fee and management_fee_and_costs are \"Plus other investment fees and costs\".",
|
||||
"The performance_fee_costs is \"Performance fee\".",
|
||||
"The buy_spread and sell_spread are \"Buy-sell spreads\".",
|
||||
"---Example Start---",
|
||||
"Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross) \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.04 \n0.10 / 0.10 \n0.09 \n",
|
||||
"---Example End---",
|
||||
"Please ignore the 3rd column: \"Equals investment fees and costs\" values!!",
|
||||
"Please read context carefully, don't miss any data row!!",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
|||
4
main.py
4
main.py
|
|
@ -1560,8 +1560,8 @@ if __name__ == "__main__":
|
|||
# "544886057",
|
||||
# "550769189",
|
||||
# "553449663"]
|
||||
special_doc_id_list = ["412778803"]
|
||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803"]
|
||||
special_doc_id_list = ["454036250"]
|
||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
Loading…
Reference in New Issue