From 6f17c2253c0619b52356a8f89af2d1fe1f4a1ac7 Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 12 Mar 2025 17:24:39 -0500 Subject: [PATCH] optimize instructions for document 412778803 --- calc_metrics.py | 6 +- .../aus_prospectus/replace_table_header.json | 2 +- core/data_extraction.py | 76 ++++++++++++++++++- .../data_extraction_prompts_config.json | 26 ++++--- main.py | 4 +- utils/biz_utils.py | 4 + 6 files changed, 102 insertions(+), 16 deletions(-) diff --git a/calc_metrics.py b/calc_metrics.py index f9dfe47..2f4e407 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1282,7 +1282,7 @@ def generate_message(message: dict, def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data_point: str = ""): message = {"gt_value": gt_value, "pred_value": pred_value, "error": ""} - if gt_value is not None and len(str(gt_value)) > 0: + if gt_value is not None and len(str(gt_value).strip()) > 0: gt_list.append(1) gt_equal_pred = is_equal(gt_value, pred_value, data_point) if gt_equal_pred: @@ -1290,11 +1290,11 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data else: pred_list.append(0) message["error"] = "pred_value is not equal to gt_value" - if pred_value is not None and len(str(pred_value)) > 0: + if pred_value is not None and len(str(pred_value).strip()) > 0: pred_list.append(1) gt_list.append(0) else: - if pred_value is not None and len(str(pred_value)) > 0: + if pred_value is not None and len(str(pred_value).strip()) > 0: gt_list.append(0) pred_list.append(1) message["error"] = "gt_value is empty, but pred_value is not empty" diff --git a/configuration/aus_prospectus/replace_table_header.json b/configuration/aus_prospectus/replace_table_header.json index 587d193..59724fe 100644 --- a/configuration/aus_prospectus/replace_table_header.json +++ b/configuration/aus_prospectus/replace_table_header.json @@ -21,7 +21,7 @@ "regex_all_list": 
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n", "Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"], - "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n", + "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \n", "comments": ["item 0: document 401212184, page 17", "item 1: document 401212184, page 18 - 20"] }, diff --git a/core/data_extraction.py b/core/data_extraction.py index 12faf86..393876c 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -262,6 +262,7 @@ class DataExtraction: data_dict["completion_token"] = result.get("completion_token", 0) data_dict["total_token"] = result.get("total_token", 0) """ + data_list = self.supplement_ttr_pension(data_list) data_list = self.supplement_minimum_initial_investment(data_list) data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list) data_list = self.remove_duplicate_data(data_list) @@ -270,6 +271,79 @@ class DataExtraction: data_list = self.check_administration_fees(data_list) return data_list + + def supplement_ttr_pension(self, data_list: list): + """ + If with fund name ends with "TTR" and 
"Pension", and exist same fund name without "TTR" or "Pension", + Supplement the data of "TTR" and "Pension" to the same fund name without "TTR" or "Pension" + """ + ttr_fund_name_list = [] + pension_fund_name_list = [] + exist_ttr = False + exist_pension = False + for data_dict in data_list: + extract_data = data_dict.get("extract_data", {}) + data = extract_data.get("data", []) + + for data_item in data: + keys = list(data_item.keys()) + fund_name = data_item.get("fund_name", "") + if len(fund_name) == 0: + continue + fund_name_splits = fund_name.split() + if fund_name_splits[-1] == "TTR": + ttr_fund_name_list.append(fund_name) + exist_ttr = True + if fund_name_splits[-1] == "Pension": + pension_fund_name_list.append(fund_name) + exist_pension = True + if exist_ttr and exist_pension: + for data_dict in data_list: + extract_data = data_dict.get("extract_data", {}) + data = extract_data.get("data", []) + new_item_list = [] + remove_item_list = [] + for data_item in data: + fund_name = data_item.get("fund_name", "") + share_name = data_item.get("share_name", "") + if len(fund_name) == 0: + continue + fund_name_splits = fund_name.split() + if fund_name_splits[-1] == "TTR" or fund_name_splits[-1] == "Pension": + continue + keys = [key for key in list(data_item.keys()) + if key not in ["fund_name", "share_name"]] + for ttr_fund_name in ttr_fund_name_list: + ttr_pure_fund_name = ttr_fund_name.replace(" TTR", "") + if fund_name == ttr_pure_fund_name: + new_fund_name = f"{fund_name} TTR" + if share_name == fund_name: + share_name = new_fund_name + new_item = {"fund_name": new_fund_name, "share_name": share_name} + for key in keys: + new_item[key] = data_item.get(key, "") + new_item_list.append(new_item) + if data_item not in remove_item_list: + remove_item_list.append(data_item) + break + for pension_fund_name in pension_fund_name_list: + pernsion_pure_fund_name = pension_fund_name.replace(" Pension", "") + if fund_name == pernsion_pure_fund_name: + new_fund_name = 
f"{fund_name} Pension" + if share_name == fund_name: + share_name = new_fund_name + new_item = {"fund_name": new_fund_name, "share_name": share_name} + for key in keys: + new_item[key] = data_item.get(key, "") + new_item_list.append(new_item) + if data_item not in remove_item_list: + remove_item_list.append(data_item) + break + for remove_item in remove_item_list: + if remove_item in data: + data.remove(remove_item) + data.extend(new_item_list) + return data_list def check_administration_fees(self, data_list: list): """ @@ -614,7 +688,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [20]: + # if page_num not in [40, 44]: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 66e3f3f..9c1fa33 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -360,8 +360,15 @@ "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" ], "buy_spread": [ + "A. Exclude reported name", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", - "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" + "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)", + "B. 
Simple case with simple table structure:", + "---Example 1 Start---", + "Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n", + "---Example 1 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" ], "performance_fee_costs": [ "Performance fees is share class level data.", @@ -468,15 +475,15 @@ "For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".", "For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".", "---Example 1 Start---", - "Management Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n", + "\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n", "---Example 1 End---", - "The data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.", + "The data points numbers 
order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.", "Please pay attention below information", - "Assume the column sequence number is from 1.", + "Assume the numeric column sequence number is from 1.", "\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.", - "For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ", - "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", - "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", + "For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ", + "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", + "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", "Therefore, the output should be:", "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", 
\"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}" ] @@ -495,10 +502,11 @@ "it means each investment name is with only one fund name, it is for TTR.", "For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".", "---Example 1 Start---", - "Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth", + "Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares, 0.07% p.a. for Cash \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. 
for Growth", "---Example 1 End---", + "Please read the context carefully, especially for \"Retirement and TTR income streams\" part, output all of fund names and relevant values", "The output should be:", - "{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}" + "{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Cash Pension\", \"share name\": \"Cash Pension\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Cash TTR\", \"share name\": \"Cash TTR\", \"management_fee_and_costs\": 0.07, 
\"management_fee\": 0.07}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}" ] }, { diff --git a/main.py b/main.py index 7993804..c3da546 100644 --- a/main.py +++ b/main.py @@ -1560,8 +1560,8 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - special_doc_id_list = ["391080133"] - # special_doc_id_list = ["391080133", ""] + special_doc_id_list = ["412778803"] + # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 307be09..12866ca 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1096,6 +1096,10 @@ def replace_special_table_header(replace_table_header_config: list, page_text: s break if updated_text: break + + # split numbers like 1.320.00 to be 1.32 0.00 by regex + if re.search(r'(\d)\.(\d{2})(\d)\.(\d{2})', page_text): + page_text = re.sub(r'(\d)\.(\d{2})(\d)\.(\d{2})', r'\1.\2 \3.\4', page_text) return page_text