optimize instructions for document 412778803

This commit is contained in:
Blade He 2025-03-12 17:24:39 -05:00
parent 765772e5a8
commit 6f17c2253c
6 changed files with 102 additions and 16 deletions

View File

@ -1282,7 +1282,7 @@ def generate_message(message: dict,
def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""):
message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""}
if gt_value is not None and len(str(gt_value)) > 0:
if gt_value is not None and len(str(gt_value).strip()) > 0:
gt_list.append(1)
gt_equal_pred = is_equal(gt_value, pred_value, data_point)
if gt_equal_pred:
@ -1290,11 +1290,11 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
else:
pred_list.append(0)
message["error"] = "pred_value is not equal to gt_value"
if pred_value is not None and len(str(pred_value)) > 0:
if pred_value is not None and len(str(pred_value).strip()) > 0:
pred_list.append(1)
gt_list.append(0)
else:
if pred_value is not None and len(str(pred_value)) > 0:
if pred_value is not None and len(str(pred_value).strip()) > 0:
gt_list.append(0)
pred_list.append(1)
message["error"] = "gt_value is empty, but pred_value is not empty"

View File

@ -21,7 +21,7 @@
"regex_all_list":
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \n",
"comments": ["item 0: document 401212184, page 17",
"item 1: document 401212184, page 18 - 20"]
},

View File

@ -262,6 +262,7 @@ class DataExtraction:
data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0)
"""
data_list = self.supplement_ttr_pension(data_list)
data_list = self.supplement_minimum_initial_investment(data_list)
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
data_list = self.remove_duplicate_data(data_list)
@ -271,6 +272,79 @@ class DataExtraction:
data_list = self.check_administration_fees(data_list)
return data_list
def supplement_ttr_pension(self, data_list: list):
    """
    Replace plain fund records with their "TTR" and "Pension" copies.

    The method only acts when data_list contains at least one fund name
    whose last word is "TTR" and at least one whose last word is "Pension".
    In that case every record whose fund name equals the base name of a
    known "<name> TTR" fund is copied as "<name> TTR", and likewise for
    "Pension". The plain record is removed and the copies are appended to
    the end of the same "data" list.

    Args:
        data_list: list of dicts. Records are read from
            data_dict["extract_data"]["data"]; each record is a dict that
            may hold "fund_name" and "share_name" next to its values.

    Returns:
        The same data_list object. The inner "data" lists are updated in place.
    """
    suffixes = ("TTR", "Pension")

    def read_fund_name(data_item):
        # Blank or non-string names cannot be matched, so report them as "".
        fund_name = data_item.get("fund_name", "")
        if isinstance(fund_name, str) and fund_name.strip():
            return fund_name
        return ""

    def read_records(data_dict):
        # Only a real list can be scanned and updated in place.
        data = (data_dict.get("extract_data") or {}).get("data")
        return data if isinstance(data, list) else []

    # First pass: collect the base names of all funds that end with a suffix.
    seen_suffix = dict.fromkeys(suffixes, False)
    base_names = {suffix: set() for suffix in suffixes}
    for data_dict in data_list:
        for data_item in read_records(data_dict):
            fund_name = read_fund_name(data_item)
            if not fund_name:
                continue
            trimmed = fund_name.rstrip()
            last_word = trimmed.split()[-1]
            if last_word in base_names:
                seen_suffix[last_word] = True
                # Cut the suffix from the end only, so a name that also
                # contains the suffix in the middle keeps that part.
                base_names[last_word].add(trimmed[: -len(last_word)].rstrip())

    if not all(seen_suffix.values()):
        return data_list

    # Second pass: swap each matching plain record for its suffixed copies.
    for data_dict in data_list:
        data = read_records(data_dict)
        kept_items = []
        new_items = []
        for data_item in data:
            fund_name = read_fund_name(data_item)
            if not fund_name or fund_name.split()[-1] in suffixes:
                kept_items.append(data_item)
                continue
            matched = [suffix for suffix in suffixes
                       if fund_name in base_names[suffix]]
            if not matched:
                kept_items.append(data_item)
                continue
            share_name = data_item.get("share_name", "")
            other_values = {key: value for key, value in data_item.items()
                            if key not in ("fund_name", "share_name")}
            for suffix in matched:
                new_fund_name = f"{fund_name} {suffix}"
                # Decide the share name per copy from the original value, so
                # the "TTR" rename never leaks into the "Pension" copy.
                new_share_name = (new_fund_name if share_name == fund_name
                                  else share_name)
                new_items.append({"fund_name": new_fund_name,
                                  "share_name": new_share_name,
                                  **other_values})
        # Slice assignment keeps the list object that extract_data refers to.
        data[:] = kept_items + new_items
    return data_list
def check_administration_fees(self, data_list: list):
"""
If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list
@ -614,7 +688,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num not in [20]:
# if page_num not in [40, 44]:
# continue
if page_num in handled_page_num_list:
continue

View File

@ -360,8 +360,15 @@
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
],
"buy_spread": [
"A. Exclude reported name",
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)",
"B. Simple case with simple table structure:",
"---Example 1 Start---",
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
],
"performance_fee_costs": [
"Performance fees is share class level data.",
@ -468,15 +475,15 @@
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
"---Example 1 Start---",
"Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
"\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
"---Example 1 End---",
"The data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
"The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.",
"Please pay attention below information",
"Assume the column sequence number is from 1.",
"Assume the numeric column sequence number is from 1.",
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.",
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ",
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)",
"Therefore, the output should be:",
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}]}"
]
@ -495,10 +502,11 @@
"it means each investment name is with only one fund name, it is for TTR.",
"For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".",
"---Example 1 Start---",
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares, 0.07% p.a. for Cash \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
"---Example 1 End---",
"Please read the context carefully, especially for \"Retirement and TTR income streams\" part, output all of fund names and relevant values",
"The output should be:",
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Cash Pension\", \"share name\": \"Cash Pension\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Cash TTR\", \"share name\": \"Cash TTR\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
]
},
{

View File

@ -1560,8 +1560,8 @@ if __name__ == "__main__":
# "544886057",
# "550769189",
# "553449663"]
special_doc_id_list = ["391080133"]
# special_doc_id_list = ["391080133", ""]
special_doc_id_list = ["412778803"]
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

View File

@ -1096,6 +1096,10 @@ def replace_special_table_header(replace_table_header_config: list, page_text: s
break
if updated_text:
break
# split numbers like 1.320.00 to be 1.32 0.00 by regex
if re.search(r'(\d)\.(\d{2})(\d)\.(\d{2})', page_text):
page_text = re.sub(r'(\d)\.(\d{2})(\d)\.(\d{2})', r'\1.\2 \3.\4', page_text)
return page_text