1. simplify management_fee_and_costs instructions

2. optimize management_fee_and_costs instructions
3. resolve issues in complex scenarios: management_fee, recoverable_expenses, and indirect_costs need to be summed as management_fee_and_costs
This commit is contained in:
Blade He 2025-03-06 17:27:18 -06:00
parent c4ed65770d
commit 52515fc152
5 changed files with 145 additions and 152 deletions

View File

@ -1168,14 +1168,16 @@ if __name__ == "__main__":
audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
audit_data_sheet: str = "Sheet1" audit_data_sheet: str = "Sheet1"
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250305170202.xlsx" verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250306171226.xlsx"
verify_data_sheet: str = "total_data" verify_data_sheet: str = "total_data"
verify_document_list_file: str = "./sample_documents/aus_prospectus_17_documents_sample.txt" # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt"
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"]
audit_data_sheet=audit_data_sheet, for verify_document_list_file in verify_document_list_file_list:
verify_file_path=verify_file_path, calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
verify_data_sheet=verify_data_sheet, audit_data_sheet=audit_data_sheet,
verify_document_list_file = verify_document_list_file) verify_file_path=verify_file_path,
verify_data_sheet=verify_data_sheet,
verify_document_list_file = verify_document_list_file)
# set_mapping_to_17_documents_data() # set_mapping_to_17_documents_data()
# set_mapping_to_ravi_data() # set_mapping_to_ravi_data()

View File

@ -9,7 +9,8 @@ from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
from utils.logger import logger from utils.logger import logger
from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data from utils.biz_utils import add_slash_to_text_as_regex, clean_text, \
get_most_similar_name, remove_abundant_data, replace_special_table_header
class DataExtraction: class DataExtraction:
@ -575,7 +576,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num != 21: # if page_num != 24:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue
@ -593,6 +594,7 @@ class DataExtraction:
else: else:
previous_page_fund_name = None previous_page_fund_name = None
page_text = replace_special_table_header(page_text)
extract_data = self.extract_data_by_page( extract_data = self.extract_data_by_page(
page_num, page_num,
page_text, page_text,
@ -657,6 +659,7 @@ class DataExtraction:
) )
if not with_same_structure_table: if not with_same_structure_table:
break break
next_page_text = replace_special_table_header(next_page_text)
target_text = current_text + next_page_text target_text = current_text + next_page_text
else: else:
target_text = "" target_text = ""
@ -1507,6 +1510,32 @@ class DataExtraction:
complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "")
with_special_rule_title = False with_special_rule_title = False
for datapoint in datapoints: for datapoint in datapoints:
find_complex_special_rule = False
if page_text is not None and len(page_text) > 0:
complex_special_rule_list = complex_special_rule.get(datapoint, [])
for complex_special_rule in complex_special_rule_list:
complex_keywords = complex_special_rule.get("keywords", [])
if len(complex_keywords) == 0:
continue
exist_keywords = False
for special_keywords in complex_keywords:
special_keywrods_regex = add_slash_to_text_as_regex(special_keywords)
if special_keywords in page_text or \
re.search(special_keywrods_regex, page_text) is not None:
exist_keywords = True
break
if exist_keywords:
complex_prompts_list = complex_special_rule.get("prompts", [])
if len(complex_prompts_list) > 0:
if not with_special_rule_title:
instructions.append("Special rule:\n")
with_special_rule_title = True
complex_prompts = "\n".join(complex_prompts_list)
instructions.append(complex_prompts)
instructions.append("\n\n")
find_complex_special_rule = True
if find_complex_special_rule:
continue
special_rule_list = special_rule_info.get(datapoint, []) special_rule_list = special_rule_info.get(datapoint, [])
if len(special_rule_list) > 0: if len(special_rule_list) > 0:
if not with_special_rule_title: if not with_special_rule_title:
@ -1515,26 +1544,7 @@ class DataExtraction:
special_rule = "\n".join(special_rule_list) special_rule = "\n".join(special_rule_list)
instructions.append(special_rule) instructions.append(special_rule)
instructions.append("\n\n") instructions.append("\n\n")
if page_text is None or len(page_text) == 0:
continue
complex_special_rule_list = complex_special_rule.get(datapoint, [])
for complex_special_rule in complex_special_rule_list:
complex_keywords = complex_special_rule.get("keywords", [])
if len(complex_keywords) == 0:
continue
exist_keywords = False
for special_keywords in complex_keywords:
special_keywrods_regex = add_slash_to_text_as_regex(special_keywords)
if special_keywords in page_text or \
re.search(special_keywrods_regex, page_text) is not None:
exist_keywords = True
break
if exist_keywords:
complex_prompts_list = complex_special_rule.get("prompts", [])
if len(complex_prompts_list) > 0:
complex_prompts = "\n".join(complex_prompts_list)
instructions.append(complex_prompts)
instructions.append("\n\n")
instructions.append("\n") instructions.append("\n")
instructions.append("Special cases:\n") instructions.append("Special cases:\n")
@ -1563,27 +1573,9 @@ class DataExtraction:
contents_list = special_case.get("contents", []) contents_list = special_case.get("contents", [])
contents = "\n".join(contents_list) contents = "\n".join(contents_list)
instructions.append(contents) instructions.append(contents)
instructions.append("\n\n") instructions.append("\n")
instructions.append("\n") instructions.append("\n")
# extreme_complex_config_list = special_cases.get("extreme_complex", [])
# if len(extreme_complex_config_list) > 0:
# for extreme_complex_config in extreme_complex_config_list:
# regex = extreme_complex_config.get("regex", "")
# if len(regex) == 0:
# continue
# search = re.search(regex, page_text)
# if search is not None:
# title = extreme_complex_config.get("title", "")
# title = f"{special_cases_number}. {title} "
# special_cases_number += 1
# instructions.append(title)
# instructions.append("\n")
# contents_list = extreme_complex_config.get("contents", [])
# contents = "\n".join(contents_list)
# instructions.append(contents)
# instructions.append("\n\n")
instructions.append("Output requirement:\n") instructions.append("Output requirement:\n")
output_requirement = self.instructions_config.get("output_requirement", {}) output_requirement = self.instructions_config.get("output_requirement", {})
output_requirement_common_list = output_requirement.get("common", []) output_requirement_common_list = output_requirement.get("common", [])

View File

@ -162,26 +162,6 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
"\n", "\n",
"B.3 With \"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", sum the values from these 3 columns.",
"---Example Start---",
"Fund \nManagement \nfee 1 \n(% pa) \nIndirect costs1\n(% pa)\nEstimated performance fees2\n(% pa)\nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses 3 \nEstimated \nother indirect \ncosts \nPerformance \nfees charged \nto the Fund \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nipac Life \nChoices \nActive 50 \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \nipac Life \nChoices \nActive 70 \n0.79 \n0.01 \n0.08 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n",
"---Example End---",
"For this case: ",
"a. The table header is with secondary-level header.",
"b. The fund name is before the data row, e.g. ipac Life Choices Active 50",
"c. The data points numbers order in data row, for example: \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \n is correct as initial table structure.",
"The 1st number: 0.70 is the management_fee,",
"the 2nd number: 0.02 is the recoverable_expenses,",
"the 3rd number: 0.09 is the indirect_costs",
"the 4th number: 0.00 is the performance_fee,",
"the 5th number: 0.05 is the interposed_vehicle_performance_fee_cost, ",
"the 6th number: 0.14 is the transaction costs, please ignore this number.",
"the 7th number: 0.10 is the buy_spread, ",
"the 8th number: 0.10 is the sell_spread.",
"The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.70 + 0.02 + 0.09= 0.81",
"The output should be:",
"{\"data\": [{\"fund name\": \"ipac Life Choices Active 50\", \"share name\": \"ipac Life Choices Active 50\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.7, \"recoverable_expenses\": 0.02, \"indirect_costs\": 0.09, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"ipac Life Choices Active 70\", \"share name\": \"ipac Life Choices Active 70\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.79, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0.08, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
"\n",
"C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---", "---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
@ -394,46 +374,6 @@
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee\": 0}" "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, 
\"management_fee\": 1.44, \"performance_fee\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee\": 0}"
] ]
}, },
{
"keywords": ["Indirect costs \ni \nEstimated performance fees"],
"prompts": ["Complex management fee and costs rule:",
"If the table with columns:",
"\"Management fee (% pa)\", \"Indirect costs\", \"Estimated performance fees\", \"Buy/sell spreads\"",
"The management_fee is \"Management fee (% pa)\".",
"The management_fee_costs is \"Management fee (% pa)\" + \"Indirect costs\".",
"The performance_fee is \"Estimated performance fees\"",
"The buy_spread and sell_spread are \"Buy/sell spreads\".",
"---Example 1 Start---",
"Indirect costs \ni\nEstimated performance fees \nii\nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nRecoverable \nexpenses \niii \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe Investment \nOption by \nunderlying \nmanagers \nPerformance \nfees charged by \ninterposed \nvehicles \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n",
"---Example 1 End---",
"For this case: ",
"a. The table header is with disorder issue during PDF contents extraction issue.",
"b. The fund name is after the data row, e.g. MyNorth Australian Fixed Interest Index",
"c. The data points numbers order in data row, for example: \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 is correct as initial table structure.",
"The 1st number: 0.20 is the management_fee, the 2nd number and the 3th number: 0.01 0.00 are the indirect costs, ",
"the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ",
"the 6th number: 0.00 is the transaction costs, ",
"the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.",
"The management_fee_and_costs is management_fee + indirect costs = 0.20 + 0.01 + 0.00= 0.21",
"The output should be: ",
"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
"\n",
"---Example 2 Start---",
"Indirect costs \ni \nEstimated performance fees \nii \nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nMyNorth Index \nModerately \nDefensive \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 \nMyNorth Index \nBalanced \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.09/0.09 \n",
"---Example 2 End---",
"For this case: ",
"a. The table header is with disorder issue during PDF contents extraction issue.",
"b. The fund name is before the data row, e.g. MyNorth Index Moderately \nDefensive",
"c. The data points numbers order in data row, for example: \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 is correct as initial table structure.",
"The 1st number: 0.55 is the management_fee, the 2nd number and the 3th number: 0.00 0.00 are the indirect costs, ",
"the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ",
"the 6th number: 0.01 is the transaction costs, please ignore this number.",
"the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.",
"The management_fee_and_costs is management_fee + indirect costs = 0.55 + 0.00 + 0.00= 0.55",
"The output should be: ",
"{\"data\": [{\"fund name\": \"MyNorth Index Moderately Defensive\", \"share name\": \"MyNorth Index Moderately Defensive\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth Index Balanced\", \"share name\": \"MyNorth Index Balanced\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.09, \"sell_spread\": 0.09}]}"
]
},
{ {
"keywords": ["Retirement and TTR income streams"], "keywords": ["Retirement and TTR income streams"],
"prompts": ["Complex management fee and costs rule:", "prompts": ["Complex management fee and costs rule:",
@ -455,67 +395,45 @@
] ]
}, },
{ {
"keywords": ["Option name \nIndirect costs"], "keywords": "Recoverable expenses \nEstimated other indirect costs",
"prompts": ["Complex management fee and costs rule:",
"If the table with columns:",
"\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"",
"The management_fee is \"Management fee (% pa)\".",
"The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".",
"The indirect_costs is \"Estimated other indirect costs\"",
"The recoverable_expenses is \"Recoverable expenses\"",
"The performance_fee is \"Peformance fees charged to the option by underlying managers\".",
"The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"",
"The buy_spread and sell_spread are \"Buy/sell spreads\".",
"---Example 1 Start---",
"Option name \nIndirect costs \n(i)\nEstimated performance fees \n(ii)\nManagement \nfee \n(% pa) \n(i) \n(% pa) \n(% pa) \nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads \n(%) \n(iv) \nRecoverable \nexpenses \n(iii) \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe option \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nGenerations Defensive \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \nGenerations Moderately \nDefensive \n1.00 \n0.08 \n0.10 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n",
"---Example 1 End---",
"For this case: ",
"a. The table header is with disorder issue during PDF contents extraction issue.",
"b. The fund name is before the data row, e.g. Generations Defensive",
"c. The data points numbers order in data row, for example: \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \n is correct as initial table structure.",
"The 1st number: 0.90 is the management_fee,",
"the 2nd number: 0.26 is the recoverable_expenses,",
"the 3rd number: 0.12 is the indirect_costs",
"the 4th number: 0.00 is the performance_fee,",
"the 5th number: 0.06 is the interposed_vehicle_performance_fee_cost, ",
"the 6th number: 0.17 is the transaction costs, please ignore this number.",
"the 7th number: 0.09 is the buy_spread, ",
"the 8th number: 0.08 is the sell_spread.",
"The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.90 + 0.26 + 0.12= 1.28",
"The output should be: ",
"{\"data\": [{\"fund name\": \"Generations Defensive\", \"share name\": \"Generations Defensive\", \"management_fee_and_costs\": 1.28, \"management_fee\": 0.9, \"recoverable_expenses\": 0.26, \"indirect_costs\": 0.12, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.06, \"buy_spread\": 0.09, \"sell_spread\": 0.08}, {\"fund name\": \"Generations Moderately Defensive\", \"share name\": \"Generations Moderately Defensive\", \"management_fee_and_costs\": 1.18, \"management_fee\": 1, \"recoverable_expenses\": 0.08, \"indirect_costs\": 0.1,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
]
},
{
"keywords": "Management \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa)",
"prompts": ["Complex management fee and costs rule:", "prompts": ["Complex management fee and costs rule:",
"If the table with columns:", "If the table with columns:",
"\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"",
"The management_fee is \"Management fee (% pa)\".", "The management_fee is \"Management fee (% pa)\".",
"The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", "The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".",
"The indirect_costs is \"Estimated other indirect costs\"",
"The recoverable_expenses is \"Recoverable expenses\"", "The recoverable_expenses is \"Recoverable expenses\"",
"The indirect_costs is \"Estimated other indirect costs\"",
"The performance_fee is \"Peformance fees charged to the Investment Option by underlying managers\".", "The performance_fee is \"Peformance fees charged to the Investment Option by underlying managers\".",
"The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"",
"The buy_spread and sell_spread are \"Buy/sell spreads\".", "The buy_spread and sell_spread are \"Buy/sell spreads\".",
"---Example 1 Start---", "---Example 1 Start---",
"Investment Option \nManagement \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa) \nEstimated performance fees (ii) \n(% pa) \nTransaction \ncosts (% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses (iii) \nEstimated \nother \nindirect costs \nPerformance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \nPerformance fees \ncharged by \ninterposed \nvehicles \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n", "Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n",
"---Example 1 End---", "---Example 1 End---",
"For this case: ", "For this case: ",
"a. The table header is with secondary-level header.", "a. The fund name is before the data row, e.g. North Active Defensive",
"b. The fund name is before the data row, e.g. North Active Defensive", "c. The data points numbers in data row. ",
"c. The data points numbers order in data row, for example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is correct as initial table structure.", "For example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is with correct order as initial table structure.",
"The 1st number: 0.62 is the management_fee,", "The 1st number: 0.62 is the management_fee,",
"the 2nd number: 0.18 is the recoverable_expenses,", "the 2nd number: 0.18 is the recoverable_expenses,",
"the 3rd number: 0.05 is the indirect_costs", "the 3rd number: 0.05 is the indirect_costs",
"the 4th number: 0.00 is the performance_fee,", "the 4th number: 0.00 is the performance_fee,",
"the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", "the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ",
"the 6th number: 0.14 is the transaction costs, please ignore this number.", "the 6th number: 0.14 is the Transaction costs (% pa).",
"the 7th number: 0.08 is the buy_spread, ", "the 7th number: 0.08 is the buy_spread, ",
"the 8th number: 0.08 is the sell_spread.", "the 8th number: 0.08 is the sell_spread.",
"The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.62 + 0.18 + 0.05= 0.85", "The management_fee_and_costs is Management fee (i) + Recoverable expenses + Estimated other indirect costs = 0.62 + 0.18 + 0.05= 0.85",
"**Attention: Ignore Transaction costs (% pa), the 6th number, DO NOT APPLY ITS VALUE TO CALCULATE management_fee_and_costs!!!**",
"The output should be: ", "The output should be: ",
"{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}" "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 
0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}",
"---Example 2 Start---",
"Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n",
"---Example 2 End---",
"For this case: ",
"a. This table header is same as Example 1.",
"b. The algorithm to calculate management_fee_and_costs is same as Example 1.",
"c. The difference is **the fund name is after the data row, e.g. the fund name of the first data row is: MyNorth Australian Fixed Interest Index**",
"The output should be: ",
"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
] ]
} }
] ]

View File

@ -1506,7 +1506,7 @@ if __name__ == "__main__":
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = True
doc_source = "aus_prospectus" doc_source = "aus_prospectus"
# doc_source = "emea_ar" # doc_source = "emea_ar"
if doc_source == "aus_prospectus": if doc_source == "aus_prospectus":
@ -1525,7 +1525,8 @@ if __name__ == "__main__":
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# special_doc_id_list: list = ["539261734"] # special_doc_id_list: list = ["410899007", "539266880", "539266817",
# "539261734", "539266893"]
# special_doc_id_list: list = ["401212184"] # special_doc_id_list: list = ["401212184"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"

View File

@ -1034,3 +1034,83 @@ def remove_abundant_data_detail(data_detail_list: list,
if remove_data in data_detail_list: if remove_data in data_detail_list:
data_detail_list.remove(remove_data) data_detail_list.remove(remove_data)
return data_detail_list return data_detail_list
# Canonical single-layer fee-table header that every recognised special
# (two-layer) header is rewritten to.
_STANDARD_FEE_TABLE_HEADER = (
    "\nInvestment Option \n"
    "Management fee (% pa) \n"
    "Recoverable expenses \n"
    "Estimated other indirect costs \n"
    "Performance fees charged to the Fund by underlying managers \n"
    "Performance fees charged by interposed vehicles \n"
    "Transaction costs (% pa) \n"
    "Buy/sell spreads (%) \n"
)

# Special header layouts, tried strictly in this order; the first one that
# matches wins.  Compiled once at import time instead of on every call.
_SPECIAL_FEE_TABLE_HEADER_PATTERNS = (
    # document 410899007
    re.compile(r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"),
    # documents 539266880, 539266817, 539261734
    re.compile(r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?interposed\s*vehicles\s*\n"),
    # document 539266893
    re.compile(r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"),
    # document 410899007 (header variant without the second-layer sub header)
    re.compile(r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"),
)


def replace_special_table_header(page_text: str) -> str:
    r"""Rewrite a special fee-table header in ``page_text`` to the standard one.

    Some documents print the fee table with two header layers: a main header
    and, below it, a sub header that breaks two of the main columns down.

    Raw header example 1::

        Investment Option \n
        Management \nfee (i) \n(% pa) \n
        Indirect costs (i) \n(% pa) \n
        Estimated performance fees (ii) \n(% pa) \n
        Transaction \ncosts (% pa) \n
        Buy/sell \nspreads (%) \n
        Recoverable \nexpenses (iii) \n
        Estimated \nother \nindirect costs \n
        Performance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \n
        Performance fees \ncharged by \ninterposed \nvehicles \n

    Raw header example 2::

        Fund \n
        Management \nfee 1 \n(% pa) \n
        Indirect costs1\n(% pa)\n
        Estimated performance fees2\n(% pa)\n
        Transaction \ncosts \n(% pa) \n
        Buy/sell \nspreads (%) \n
        Recoverable \nexpenses 3 \n
        Estimated \nother indirect \ncosts \n
        Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n
        Performance \nfees charged \nby interposed \nvehicles \n

    The sub header is merged into the main header:

    * ``Indirect costs`` becomes ``Recoverable expenses`` followed by
      ``Estimated other indirect costs``;
    * ``Estimated performance fees`` becomes ``Performance fees charged to
      the Fund by underlying managers`` followed by ``Performance fees
      charged by interposed vehicles``;
    * the second-layer header lines themselves are removed.

    The known layouts are tried in a fixed order.  Only the first layout that
    matches is applied, and every occurrence of it in the text is replaced
    with the standard header; later layouts are not tried.

    Args:
        page_text: Text of one page.

    Returns:
        The text with the matched special header replaced by the standard
        header, or ``page_text`` unchanged when it is empty or no known
        layout matches.
    """
    if not page_text:
        # Nothing to scan; hand the falsy input back untouched.
        return page_text
    for header_pattern in _SPECIAL_FEE_TABLE_HEADER_PATTERNS:
        # subn reports the number of substitutions, so a single pass both
        # detects the layout and rewrites it.
        rewritten_text, replaced_count = header_pattern.subn(
            _STANDARD_FEE_TABLE_HEADER, page_text
        )
        if replaced_count:
            return rewritten_text
    return page_text