optimize instructions for document 412778803
This commit is contained in:
parent
765772e5a8
commit
6f17c2253c
|
|
@ -1282,7 +1282,7 @@ def generate_message(message: dict,
|
|||
|
||||
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""):
|
||||
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
|
||||
if gt_value is not None and len(str(gt_value)) > 0:
|
||||
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
||||
gt_list.append(1)
|
||||
gt_equal_pred = is_equal(gt_value, pred_value, data_point)
|
||||
if gt_equal_pred:
|
||||
|
|
@ -1290,11 +1290,11 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
|||
else:
|
||||
pred_list.append(0)
|
||||
message["error"] = "pred_value is not equal to gt_value"
|
||||
if pred_value is not None and len(str(pred_value)) > 0:
|
||||
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||
pred_list.append(1)
|
||||
gt_list.append(0)
|
||||
else:
|
||||
if pred_value is not None and len(str(pred_value)) > 0:
|
||||
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||
gt_list.append(0)
|
||||
pred_list.append(1)
|
||||
message["error"] = "gt_value is empty, but pred_value is not empty"
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@
|
|||
"regex_all_list":
|
||||
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
|
||||
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
|
||||
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
|
||||
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \n",
|
||||
"comments": ["item 0: document 401212184, page 17",
|
||||
"item 1: document 401212184, page 18 - 20"]
|
||||
},
|
||||
|
|
|
|||
|
|
@ -262,6 +262,7 @@ class DataExtraction:
|
|||
data_dict["completion_token"] = result.get("completion_token", 0)
|
||||
data_dict["total_token"] = result.get("total_token", 0)
|
||||
"""
|
||||
data_list = self.supplement_ttr_pension(data_list)
|
||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
||||
data_list = self.remove_duplicate_data(data_list)
|
||||
|
|
@ -270,6 +271,79 @@ class DataExtraction:
|
|||
|
||||
data_list = self.check_administration_fees(data_list)
|
||||
return data_list
|
||||
|
||||
def supplement_ttr_pension(self, data_list: list) -> list:
    """
    Replace plain fund rows with "TTR" / "Pension" variants.

    The method works in two passes over ``data_list``, a list of dicts that
    are read through ``item["extract_data"]["data"]`` (a list of row dicts).

    Pass 1 collects every fund name whose last whitespace-separated token is
    "TTR" or "Pension", and remembers the name with " TTR" / " Pension"
    removed (the "pure" name).

    Pass 2 only runs when at least one "TTR" name and at least one "Pension"
    name were found.  Every row whose fund name equals a collected pure name
    is removed from its ``data`` list and replaced by copies named
    "<fund> TTR" and/or "<fund> Pension" that carry all of the row's other
    keys.  When the row's share name equals its fund name, the copy's share
    name follows the new fund name; otherwise the share name is kept as is.

    :param data_list: extraction results; the inner ``data`` lists are
        modified in place.
    :return: the same ``data_list`` object.
    """
    ttr_pure_names = set()
    pension_pure_names = set()
    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            # "or ''" guards against an explicit None value for fund_name.
            fund_name = data_item.get("fund_name", "") or ""
            fund_name_splits = fund_name.split()
            # An empty token list covers both "" and whitespace-only names,
            # which would otherwise fail on the [-1] lookup below.
            if not fund_name_splits:
                continue
            if fund_name_splits[-1] == "TTR":
                ttr_pure_names.add(fund_name.replace(" TTR", ""))
            if fund_name_splits[-1] == "Pension":
                pension_pure_names.add(fund_name.replace(" Pension", ""))

    # Supplementing is only done when both kinds of fund names are present.
    if not (ttr_pure_names and pension_pure_names):
        return data_list

    suffix_to_pure_names = (
        ("TTR", ttr_pure_names),
        ("Pension", pension_pure_names),
    )
    for data_dict in data_list:
        data = data_dict.get("extract_data", {}).get("data", [])
        new_item_list = []
        remove_item_list = []
        for data_item in data:
            fund_name = data_item.get("fund_name", "") or ""
            share_name = data_item.get("share_name", "")
            fund_name_splits = fund_name.split()
            if not fund_name_splits:
                continue
            # Rows that already are TTR / Pension variants stay untouched.
            if fund_name_splits[-1] in ("TTR", "Pension"):
                continue
            other_values = {
                key: value
                for key, value in data_item.items()
                if key not in ("fund_name", "share_name")
            }
            matched = False
            for suffix, pure_names in suffix_to_pure_names:
                if fund_name not in pure_names:
                    continue
                new_fund_name = f"{fund_name} {suffix}"
                # Decide the share name per variant from the ORIGINAL share
                # name, so the TTR rename cannot leak into the Pension row.
                if share_name == fund_name:
                    new_share_name = new_fund_name
                else:
                    new_share_name = share_name
                new_item = {"fund_name": new_fund_name,
                            "share_name": new_share_name}
                new_item.update(other_values)
                new_item_list.append(new_item)
                matched = True
            if matched and data_item not in remove_item_list:
                remove_item_list.append(data_item)
        # Mutate only after iteration so the list is not changed while looping.
        for remove_item in remove_item_list:
            if remove_item in data:
                data.remove(remove_item)
        data.extend(new_item_list)
    return data_list
|
||||
|
||||
def check_administration_fees(self, data_list: list):
|
||||
"""
|
||||
|
|
@ -614,7 +688,7 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
# if page_num not in [20]:
|
||||
# if page_num not in [40, 44]:
|
||||
# continue
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -360,8 +360,15 @@
|
|||
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||
],
|
||||
"buy_spread": [
|
||||
"A. Exclude reported name",
|
||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
|
||||
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)",
|
||||
"B. Simple case with simple table structure:",
|
||||
"---Example 1 Start---",
|
||||
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
|
||||
"---Example 1 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||
],
|
||||
"performance_fee_costs": [
|
||||
"Performance fees is share class level data.",
|
||||
|
|
@ -468,15 +475,15 @@
|
|||
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
||||
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
|
||||
"---Example 1 Start---",
|
||||
"Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
|
||||
"\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
|
||||
"---Example 1 End---",
|
||||
"The data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
|
||||
"The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.",
|
||||
"Please pay attention below information",
|
||||
"Assume the column sequence number is from 1.",
|
||||
"Assume the numeric column sequence number is from 1.",
|
||||
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.",
|
||||
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
|
||||
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
||||
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
||||
"For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ",
|
||||
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
||||
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
||||
"Therefore, the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}]}"
|
||||
]
|
||||
|
|
@ -495,10 +502,11 @@
|
|||
"it means each investment name is with only one fund name, it is for TTR.",
|
||||
"For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".",
|
||||
"---Example 1 Start---",
|
||||
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
|
||||
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares, 0.07% p.a. for Cash \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
|
||||
"---Example 1 End---",
|
||||
"Please read the context carefully, especially for \"Retirement and TTR income streams\" part, output all of fund names and relevant values",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
|
||||
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Cash Pension\", \"share name\": \"Cash Pension\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Cash TTR\", \"share name\": \"Cash TTR\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
|||
4
main.py
4
main.py
|
|
@ -1560,8 +1560,8 @@ if __name__ == "__main__":
|
|||
# "544886057",
|
||||
# "550769189",
|
||||
# "553449663"]
|
||||
special_doc_id_list = ["391080133"]
|
||||
# special_doc_id_list = ["391080133", ""]
|
||||
special_doc_id_list = ["412778803"]
|
||||
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
|
|
@ -1096,6 +1096,10 @@ def replace_special_table_header(replace_table_header_config: list, page_text: s
|
|||
break
|
||||
if updated_text:
|
||||
break
|
||||
|
||||
# split numbers like 1.320.00 to be 1.32 0.00 by regex
|
||||
if re.search(r'(\d)\.(\d{2})(\d)\.(\d{2})', page_text):
|
||||
page_text = re.sub(r'(\d)\.(\d{2})(\d)\.(\d{2})', r'\1.\2 \3.\4', page_text)
|
||||
return page_text
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue