optimize instructions for document 412778803
This commit is contained in:
parent
765772e5a8
commit
6f17c2253c
|
|
@ -1282,7 +1282,7 @@ def generate_message(message: dict,
|
||||||
|
|
||||||
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""):
|
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""):
|
||||||
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
|
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
|
||||||
if gt_value is not None and len(str(gt_value)) > 0:
|
if gt_value is not None and len(str(gt_value).strip()) > 0:
|
||||||
gt_list.append(1)
|
gt_list.append(1)
|
||||||
gt_equal_pred = is_equal(gt_value, pred_value, data_point)
|
gt_equal_pred = is_equal(gt_value, pred_value, data_point)
|
||||||
if gt_equal_pred:
|
if gt_equal_pred:
|
||||||
|
|
@ -1290,11 +1290,11 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
||||||
else:
|
else:
|
||||||
pred_list.append(0)
|
pred_list.append(0)
|
||||||
message["error"] = "pred_value is not equal to gt_value"
|
message["error"] = "pred_value is not equal to gt_value"
|
||||||
if pred_value is not None and len(str(pred_value)) > 0:
|
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||||
pred_list.append(1)
|
pred_list.append(1)
|
||||||
gt_list.append(0)
|
gt_list.append(0)
|
||||||
else:
|
else:
|
||||||
if pred_value is not None and len(str(pred_value)) > 0:
|
if pred_value is not None and len(str(pred_value).strip()) > 0:
|
||||||
gt_list.append(0)
|
gt_list.append(0)
|
||||||
pred_list.append(1)
|
pred_list.append(1)
|
||||||
message["error"] = "gt_value is empty, but pred_value is not empty"
|
message["error"] = "gt_value is empty, but pred_value is not empty"
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,7 @@
|
||||||
"regex_all_list":
|
"regex_all_list":
|
||||||
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
|
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
|
||||||
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
|
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
|
||||||
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
|
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \n",
|
||||||
"comments": ["item 0: document 401212184, page 17",
|
"comments": ["item 0: document 401212184, page 17",
|
||||||
"item 1: document 401212184, page 18 - 20"]
|
"item 1: document 401212184, page 18 - 20"]
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -262,6 +262,7 @@ class DataExtraction:
|
||||||
data_dict["completion_token"] = result.get("completion_token", 0)
|
data_dict["completion_token"] = result.get("completion_token", 0)
|
||||||
data_dict["total_token"] = result.get("total_token", 0)
|
data_dict["total_token"] = result.get("total_token", 0)
|
||||||
"""
|
"""
|
||||||
|
data_list = self.supplement_ttr_pension(data_list)
|
||||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||||
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
||||||
data_list = self.remove_duplicate_data(data_list)
|
data_list = self.remove_duplicate_data(data_list)
|
||||||
|
|
@ -271,6 +272,79 @@ class DataExtraction:
|
||||||
data_list = self.check_administration_fees(data_list)
|
data_list = self.check_administration_fees(data_list)
|
||||||
return data_list
|
return data_list
|
||||||
|
|
||||||
|
def supplement_ttr_pension(self, data_list: list):
    """
    Copy the data of a plain fund onto its "TTR" and "Pension" variants.

    A fund name whose last whitespace-separated word is "TTR" or "Pension"
    is a variant; the text before that word is its base name.
    Nothing is changed unless the data contains at least one "TTR" variant
    and at least one "Pension" variant.

    When that holds, every item whose fund name is not itself a variant and
    equals the base name of a known variant is replaced by one new item per
    matching variant ("<fund name> TTR" first, then "<fund name> Pension").
    Each new item carries all other keys of the source item. Its share name
    becomes the new fund name when the source share name equals the source
    fund name, otherwise the source share name is kept.

    Items with a missing, None, or blank fund name are left untouched.

    Args:
        data_list: list of dicts; each may hold {"extract_data": {"data": [...]}}
            where every entry of "data" is a dict of data points.

    Returns:
        The same data_list object; the inner "data" lists are updated in
        place (kept items in original order, followed by the new items).
    """
    variant_suffixes = ("TTR", "Pension")
    base_names = {suffix: set() for suffix in variant_suffixes}

    # Pass 1: collect the base names of every TTR / Pension variant.
    for data_dict in data_list:
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            fund_name = data_item.get("fund_name") or ""
            words = fund_name.split()
            # A lone "TTR" / "Pension" has no base name to match against.
            if len(words) < 2 or words[-1] not in base_names:
                continue
            suffix = words[-1]
            # Strip only the trailing suffix word; an earlier occurrence of
            # the same word inside the name must stay part of the base name.
            base_names[suffix].add(fund_name.rstrip()[: -len(suffix)].rstrip())

    # Supplement only when both kinds of variant exist.
    if not all(base_names.values()):
        return data_list

    # Pass 2: expand matching plain funds into their variants.
    for data_dict in data_list:
        data = data_dict.get("extract_data", {}).get("data", [])
        kept_items = []
        new_items = []
        for data_item in data:
            fund_name = data_item.get("fund_name") or ""
            words = fund_name.split()
            is_plain_fund = bool(words) and words[-1] not in base_names
            matched_suffixes = [
                suffix
                for suffix in variant_suffixes
                if is_plain_fund and fund_name in base_names[suffix]
            ]
            if not matched_suffixes:
                kept_items.append(data_item)
                continue

            share_name = data_item.get("share_name", "")
            for suffix in matched_suffixes:
                new_fund_name = f"{fund_name} {suffix}"
                # Decide the share name per variant from the ORIGINAL share
                # name, so the TTR rename cannot leak into the Pension item.
                new_item = {
                    "fund_name": new_fund_name,
                    "share_name": (
                        new_fund_name if share_name == fund_name else share_name
                    ),
                }
                for key, value in data_item.items():
                    if key not in ("fund_name", "share_name"):
                        new_item[key] = value
                new_items.append(new_item)

        if new_items:
            # Slice assignment keeps the list object that extract_data holds.
            data[:] = kept_items + new_items
    return data_list
|
||||||
|
|
||||||
def check_administration_fees(self, data_list: list):
|
def check_administration_fees(self, data_list: list):
|
||||||
"""
|
"""
|
||||||
If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list
|
If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list
|
||||||
|
|
@ -614,7 +688,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num not in [20]:
|
# if page_num not in [40, 44]:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -360,8 +360,15 @@
|
||||||
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||||
],
|
],
|
||||||
"buy_spread": [
|
"buy_spread": [
|
||||||
|
"A. Exclude reported name",
|
||||||
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||||
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
|
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)",
|
||||||
|
"B. Simple case with simple table structure:",
|
||||||
|
"---Example 1 Start---",
|
||||||
|
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
|
||||||
|
"---Example 1 End---",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
|
||||||
],
|
],
|
||||||
"performance_fee_costs": [
|
"performance_fee_costs": [
|
||||||
"Performance fees is share class level data.",
|
"Performance fees is share class level data.",
|
||||||
|
|
@ -468,11 +475,11 @@
|
||||||
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
||||||
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
|
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
|
"\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
"The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.",
|
"The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.",
|
||||||
"Please pay attention below information",
|
"Please pay attention below information",
|
||||||
"Assume the column sequence number is from 1.",
|
"Assume the numeric column sequence number is from 1.",
|
||||||
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.",
|
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.",
|
||||||
"For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ",
|
"For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ",
|
||||||
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
|
||||||
|
|
@ -495,10 +502,11 @@
|
||||||
"it means each investment name is with only one fund name, it is for TTR.",
|
"it means each investment name is with only one fund name, it is for TTR.",
|
||||||
"For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".",
|
"For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
|
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares, 0.07% p.a. for Cash \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
|
||||||
"---Example 1 End---",
|
"---Example 1 End---",
|
||||||
|
"Please read the context carefully, especially for \"Retirement and TTR income streams\" part, output all of fund names and relevant values",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
|
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Cash Pension\", \"share name\": \"Cash Pension\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Cash TTR\", \"share name\": \"Cash TTR\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
||||||
4
main.py
4
main.py
|
|
@ -1560,8 +1560,8 @@ if __name__ == "__main__":
|
||||||
# "544886057",
|
# "544886057",
|
||||||
# "550769189",
|
# "550769189",
|
||||||
# "553449663"]
|
# "553449663"]
|
||||||
special_doc_id_list = ["391080133"]
|
special_doc_id_list = ["412778803"]
|
||||||
# special_doc_id_list = ["391080133", ""]
|
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
|
|
@ -1096,6 +1096,10 @@ def replace_special_table_header(replace_table_header_config: list, page_text: s
|
||||||
break
|
break
|
||||||
if updated_text:
|
if updated_text:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# split numbers like 1.320.00 to be 1.32 0.00 by regex
|
||||||
|
if re.search(r'(\d)\.(\d{2})(\d)\.(\d{2})', page_text):
|
||||||
|
page_text = re.sub(r'(\d)\.(\d{2})(\d)\.(\d{2})', r'\1.\2 \3.\4', page_text)
|
||||||
return page_text
|
return page_text
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue