diff --git a/calc_metrics.py b/calc_metrics.py index 4936327..41de9e1 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1168,14 +1168,16 @@ if __name__ == "__main__": audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250305170202.xlsx" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged/merged_mapping_data_info_46_documents_by_text_20250306171226.xlsx" verify_data_sheet: str = "total_data" - verify_document_list_file: str = "./sample_documents/aus_prospectus_17_documents_sample.txt" - calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, - audit_data_sheet=audit_data_sheet, - verify_file_path=verify_file_path, - verify_data_sheet=verify_data_sheet, - verify_document_list_file = verify_document_list_file) + # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" + verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"] + for verify_document_list_file in verify_document_list_file_list: + calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, + audit_data_sheet=audit_data_sheet, + verify_file_path=verify_file_path, + verify_data_sheet=verify_data_sheet, + verify_document_list_file = verify_document_list_file) # set_mapping_to_17_documents_data() # set_mapping_to_ravi_data() diff --git a/core/data_extraction.py b/core/data_extraction.py index ab880f2..715546b 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -9,7 +9,8 @@ from utils.gpt_utils import chat from utils.pdf_util import PDFUtil from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider from utils.logger import 
logger -from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data +from utils.biz_utils import add_slash_to_text_as_regex, clean_text, \ + get_most_similar_name, remove_abundant_data, replace_special_table_header class DataExtraction: @@ -575,7 +576,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 21: + # if page_num != 24: # continue if page_num in handled_page_num_list: continue @@ -593,6 +594,7 @@ class DataExtraction: else: previous_page_fund_name = None + page_text = replace_special_table_header(page_text) extract_data = self.extract_data_by_page( page_num, page_text, @@ -657,6 +659,7 @@ class DataExtraction: ) if not with_same_structure_table: break + next_page_text = replace_special_table_header(next_page_text) target_text = current_text + next_page_text else: target_text = "" @@ -1507,6 +1510,32 @@ class DataExtraction: complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") with_special_rule_title = False for datapoint in datapoints: + find_complex_special_rule = False + if page_text is not None and len(page_text) > 0: + complex_special_rule_list = complex_special_rule.get(datapoint, []) + for complex_special_rule in complex_special_rule_list: + complex_keywords = complex_special_rule.get("keywords", []) + if len(complex_keywords) == 0: + continue + exist_keywords = False + for special_keywords in complex_keywords: + special_keywrods_regex = add_slash_to_text_as_regex(special_keywords) + if special_keywords in page_text or \ + re.search(special_keywrods_regex, page_text) is not None: + exist_keywords = True + break + if exist_keywords: + complex_prompts_list = complex_special_rule.get("prompts", []) + if len(complex_prompts_list) > 0: + if not with_special_rule_title: + instructions.append("Special rule:\n") + with_special_rule_title = True + complex_prompts = 
"\n".join(complex_prompts_list) + instructions.append(complex_prompts) + instructions.append("\n\n") + find_complex_special_rule = True + if find_complex_special_rule: + continue special_rule_list = special_rule_info.get(datapoint, []) if len(special_rule_list) > 0: if not with_special_rule_title: @@ -1515,26 +1544,7 @@ class DataExtraction: special_rule = "\n".join(special_rule_list) instructions.append(special_rule) instructions.append("\n\n") - if page_text is None or len(page_text) == 0: - continue - complex_special_rule_list = complex_special_rule.get(datapoint, []) - for complex_special_rule in complex_special_rule_list: - complex_keywords = complex_special_rule.get("keywords", []) - if len(complex_keywords) == 0: - continue - exist_keywords = False - for special_keywords in complex_keywords: - special_keywrods_regex = add_slash_to_text_as_regex(special_keywords) - if special_keywords in page_text or \ - re.search(special_keywrods_regex, page_text) is not None: - exist_keywords = True - break - if exist_keywords: - complex_prompts_list = complex_special_rule.get("prompts", []) - if len(complex_prompts_list) > 0: - complex_prompts = "\n".join(complex_prompts_list) - instructions.append(complex_prompts) - instructions.append("\n\n") + instructions.append("\n") instructions.append("Special cases:\n") @@ -1563,26 +1573,8 @@ class DataExtraction: contents_list = special_case.get("contents", []) contents = "\n".join(contents_list) instructions.append(contents) - instructions.append("\n\n") + instructions.append("\n") instructions.append("\n") - - # extreme_complex_config_list = special_cases.get("extreme_complex", []) - # if len(extreme_complex_config_list) > 0: - # for extreme_complex_config in extreme_complex_config_list: - # regex = extreme_complex_config.get("regex", "") - # if len(regex) == 0: - # continue - # search = re.search(regex, page_text) - # if search is not None: - # title = extreme_complex_config.get("title", "") - # title = 
f"{special_cases_number}. {title} " - # special_cases_number += 1 - # instructions.append(title) - # instructions.append("\n") - # contents_list = extreme_complex_config.get("contents", []) - # contents = "\n".join(contents_list) - # instructions.append(contents) - # instructions.append("\n\n") instructions.append("Output requirement:\n") output_requirement = self.instructions_config.get("output_requirement", {}) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 04b2fff..064dc36 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -162,26 +162,6 @@ "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", - "B.3 With \"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", sum the values from these 3 columns.", - "---Example Start---", - "Fund \nManagement \nfee 1 \n(% pa) \nIndirect costs1\n(% pa)\nEstimated performance fees2\n(% pa)\nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses 3 \nEstimated \nother indirect \ncosts \nPerformance \nfees charged \nto the Fund \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nipac Life \nChoices \nActive 50 \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \nipac Life \nChoices \nActive 70 \n0.79 \n0.01 \n0.08 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", - "---Example End---", - "For this case: ", - "a. The table header is with secondary-level header.", - "b. The fund name is before the data row, e.g. ipac Life Choices Active 50", - "c. 
The data points numbers order in data row, for example: \n0.70 \n0.02 \n0.09 \n0.00 \n0.05 \n0.14 \n0.10/0.10 \n is correct as initial table structure.", - "The 1st number: 0.70 is the management_fee,", - "the 2nd number: 0.02 is the recoverable_expenses,", - "the 3rd number: 0.09 is the indirect_costs", - "the 4th number: 0.00 is the performance_fee,", - "the 5th number: 0.05 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.14 is the transaction costs, please ignore this number.", - "the 7th number: 0.10 is the buy_spread, ", - "the 8th number: 0.10 is the sell_spread.", - "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.70 + 0.02 + 0.09= 0.81", - "The output should be:", - "{\"data\": [{\"fund name\": \"ipac Life Choices Active 50\", \"share name\": \"ipac Life Choices Active 50\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.7, \"recoverable_expenses\": 0.02, \"indirect_costs\": 0.09, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"ipac Life Choices Active 70\", \"share name\": \"ipac Life Choices Active 70\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.79, \"recoverable_expenses\": 0.01, \"indirect_costs\": 0.08, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", - "\n", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. 
This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", @@ -394,46 +374,6 @@ "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee\": 0}" ] }, - { - "keywords": ["Indirect costs \ni \nEstimated performance fees"], - "prompts": ["Complex management fee and costs rule:", - "If the table with columns:", - "\"Management fee (% pa)\", \"Indirect costs\", \"Estimated performance fees\", \"Buy/sell spreads\"", - "The management_fee is \"Management fee (% pa)\".", - "The management_fee_costs is \"Management fee (% pa)\" + \"Indirect costs\".", - "The performance_fee is \"Estimated performance fees\"", - "The buy_spread and sell_spread are \"Buy/sell spreads\".", - "---Example 1 Start---", - "Indirect 
costs \ni\nEstimated performance fees \nii\nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nRecoverable \nexpenses \niii \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe Investment \nOption by \nunderlying \nmanagers \nPerformance \nfees charged by \ninterposed \nvehicles \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n", - "---Example 1 End---", - "For this case: ", - "a. The table header is with disorder issue during PDF contents extraction issue.", - "b. The fund name is after the data row, e.g. MyNorth Australian Fixed Interest Index", - "c. The data points numbers order in data row, for example: \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 is correct as initial table structure.", - "The 1st number: 0.20 is the management_fee, the 2nd number and the 3th number: 0.01 0.00 are the indirect costs, ", - "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.00 is the transaction costs, ", - "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + indirect costs = 0.20 + 0.01 + 0.00= 0.21", - "The output should be: ", - "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, 
\"management_fee\": 0.25, \"performance_fee\": 0.00, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", - "\n", - "---Example 2 Start---", - "Indirect costs \ni \nEstimated performance fees \nii \nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nMyNorth Index \nModerately \nDefensive \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 \nMyNorth Index \nBalanced \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.09/0.09 \n", - "---Example 2 End---", - "For this case: ", - "a. The table header is with disorder issue during PDF contents extraction issue.", - "b. The fund name is before the data row, e.g. MyNorth Index Moderately \nDefensive", - "c. The data points numbers order in data row, for example: \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 is correct as initial table structure.", - "The 1st number: 0.55 is the management_fee, the 2nd number and the 3th number: 0.00 0.00 are the indirect costs, ", - "the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ", - "the 6th number: 0.01 is the transaction costs, please ignore this number.", - "the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + indirect costs = 0.55 + 0.00 + 0.00= 0.55", - "The output should be: ", - "{\"data\": [{\"fund name\": \"MyNorth Index Moderately Defensive\", \"share name\": \"MyNorth Index Moderately Defensive\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth Index Balanced\", \"share name\": \"MyNorth Index Balanced\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.09, \"sell_spread\": 0.09}]}" - ] - }, { "keywords": ["Retirement and TTR income 
streams"], "prompts": ["Complex management fee and costs rule:", @@ -455,67 +395,45 @@ ] }, { - "keywords": ["Option name \nIndirect costs"], - "prompts": ["Complex management fee and costs rule:", - "If the table with columns:", - "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", - "The management_fee is \"Management fee (% pa)\".", - "The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", - "The indirect_costs is \"Estimated other indirect costs\"", - "The recoverable_expenses is \"Recoverable expenses\"", - "The performance_fee is \"Peformance fees charged to the option by underlying managers\".", - "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", - "The buy_spread and sell_spread are \"Buy/sell spreads\".", - "---Example 1 Start---", - "Option name \nIndirect costs \n(i)\nEstimated performance fees \n(ii)\nManagement \nfee \n(% pa) \n(i) \n(% pa) \n(% pa) \nTransaction \ncosts \n(% pa) \nBuy/sell \nspreads \n(%) \n(iv) \nRecoverable \nexpenses \n(iii) \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe option \nby underlying \nmanagers \nPerformance \nfees charged \nby interposed \nvehicles \nGenerations Defensive \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \nGenerations Moderately \nDefensive \n1.00 \n0.08 \n0.10 \n0.00 \n0.05 \n0.17 \n0.10/0.10 \n", - "---Example 1 End---", - "For this case: ", - "a. The table header is with disorder issue during PDF contents extraction issue.", - "b. The fund name is before the data row, e.g. Generations Defensive", - "c. 
The data points numbers order in data row, for example: \n0.90 \n0.26 \n0.12 \n0.00 \n0.06 \n0.17 \n0.09/0.08 \n is correct as initial table structure.", - "The 1st number: 0.90 is the management_fee,", - "the 2nd number: 0.26 is the recoverable_expenses,", - "the 3rd number: 0.12 is the indirect_costs", - "the 4th number: 0.00 is the performance_fee,", - "the 5th number: 0.06 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.17 is the transaction costs, please ignore this number.", - "the 7th number: 0.09 is the buy_spread, ", - "the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.90 + 0.26 + 0.12= 1.28", - "The output should be: ", - "{\"data\": [{\"fund name\": \"Generations Defensive\", \"share name\": \"Generations Defensive\", \"management_fee_and_costs\": 1.28, \"management_fee\": 0.9, \"recoverable_expenses\": 0.26, \"indirect_costs\": 0.12, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.06, \"buy_spread\": 0.09, \"sell_spread\": 0.08}, {\"fund name\": \"Generations Moderately Defensive\", \"share name\": \"Generations Moderately Defensive\", \"management_fee_and_costs\": 1.18, \"management_fee\": 1, \"recoverable_expenses\": 0.08, \"indirect_costs\": 0.1,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.05, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" - ] - }, - { - "keywords": "Management \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa)", + "keywords": "Recoverable expenses \nEstimated other indirect costs", "prompts": ["Complex management fee and costs rule:", "If the table with columns:", "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", "The management_fee is \"Management fee (% pa)\".", "The management_fee_costs 
is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".", - "The indirect_costs is \"Estimated other indirect costs\"", "The recoverable_expenses is \"Recoverable expenses\"", + "The indirect_costs is \"Estimated other indirect costs\"", "The performance_fee is \"Peformance fees charged to the Investment Option by underlying managers\".", "The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"", "The buy_spread and sell_spread are \"Buy/sell spreads\".", "---Example 1 Start---", - "Investment Option \nManagement \nfee (i) \n(% pa) \nIndirect costs (i) \n(% pa) \nEstimated performance fees (ii) \n(% pa) \nTransaction \ncosts (% pa) \nBuy/sell \nspreads (%) \nRecoverable \nexpenses (iii) \nEstimated \nother \nindirect costs \nPerformance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \nPerformance fees \ncharged by \ninterposed \nvehicles \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n", + "Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n", "---Example 1 End---", "For this case: ", - "a. The table header is with secondary-level header.", - "b. 
The fund name is before the data row, e.g. North Active Defensive", - "c. The data points numbers order in data row, for example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is correct as initial table structure.", + "a. The fund name is before the data row, e.g. North Active Defensive", + "c. The data points numbers in data row. ", + "For example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is with correct order as initial table structure.", "The 1st number: 0.62 is the management_fee,", "the 2nd number: 0.18 is the recoverable_expenses,", "the 3rd number: 0.05 is the indirect_costs", "the 4th number: 0.00 is the performance_fee,", "the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", - "the 6th number: 0.14 is the transaction costs, please ignore this number.", + "the 6th number: 0.14 is the Transaction costs (% pa).", "the 7th number: 0.08 is the buy_spread, ", "the 8th number: 0.08 is the sell_spread.", - "The management_fee_and_costs is management_fee + recoverable_expenses + indirect_costs = 0.62 + 0.18 + 0.05= 0.85", + "The management_fee_and_costs is Management fee (i) + Recoverable expenses + Estimated other indirect costs = 0.62 + 0.18 + 0.05= 0.85", + "**Attention: Ignore Transaction costs (% pa), the 6th number, DO NOT APPLY ITS VALUE TO CALCULATE management_fee_and_costs!!!**", "The output should be: ", - "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, 
\"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}" + "{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee\": 0, 
\"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}", + "---Example 2 Start---", + "Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n", + "---Example 2 End---", + "For this case: ", + "a. This table header is same as Example 1.", + "b. The algorithm to calculate management_fee_and_costs is same as Example 1.", + "c. The difference is **the fund name is after the data row, e.g. the fund name of the first data row is: MyNorth Australian Fixed Interest Index**", + "The output should be: ", + "{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" ] } ] diff --git a/main.py b/main.py index e28c426..64fe6e0 100644 --- a/main.py +++ b/main.py @@ -1506,7 +1506,7 @@ if __name__ == "__main__": re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True doc_source = 
# Canonical single-level header that every recognised two-level fee-table
# header is rewritten to. The wording must stay in sync with the
# "Recoverable expenses \nEstimated other indirect costs" keyword rule in the
# extraction prompt configuration, which keys on this exact text.
_STANDARD_TABLE_HEADER = "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"

# Header layouts are tried in this order; the first one that matches wins.
# Compiled once at import time because the function runs for every page.
_SPECIAL_TABLE_HEADER_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # document 410899007 (main header text extracted before "Investment Option")
        r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n",
        # documents 539266880, 539266817, 539261734
        r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?interposed\s*vehicles\s*\n",
        # document 539266893
        r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n",
        # document 410899007, pages where the second header layer is missing
        r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n",
    )
)


def replace_special_table_header(page_text: str):
    r"""Rewrite a known two-level fee-table header into the standard header.

    Some documents print the fee table with a main header row
    ("Management fee", "Indirect costs", "Estimated performance fees", ...)
    followed by a sub-header row ("Recoverable expenses", "Estimated other
    indirect costs", "Performance fees charged ... by underlying managers",
    "Performance fees charged by interposed vehicles").  After PDF text
    extraction the two layers come out interleaved or out of order, e.g.::

        Investment Option \nManagement \nfee (i) \n(% pa) \n
        Indirect costs (i) \n(% pa) \nEstimated performance fees (ii) \n(% pa) \n
        Transaction \ncosts (% pa) \nBuy/sell \nspreads (%) \n
        Recoverable \nexpenses (iii) \nEstimated \nother \nindirect costs \n
        Performance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \n
        Performance fees \ncharged by \ninterposed \nvehicles \n

    The whole matched header is replaced by one flat header whose columns are
    listed in the same order as the numbers in each data row: management fee,
    recoverable expenses, estimated other indirect costs, performance fees by
    underlying managers, performance fees by interposed vehicles, transaction
    costs, buy/sell spreads.

    Args:
        page_text: Extracted text of one page.  ``None`` or an empty string
            is returned unchanged instead of raising.

    Returns:
        ``page_text`` with every occurrence of the first matching header
        layout replaced by the standard header, or ``page_text`` itself when
        no known layout is present.
    """
    # Pages without extractable text reach this function as None/"";
    # there is nothing to normalise and re would raise TypeError on None.
    if not page_text:
        return page_text

    for pattern in _SPECIAL_TABLE_HEADER_PATTERNS:
        # A callable replacement inserts the header verbatim, so it is never
        # interpreted as a regex template (backslash / group references).
        # subn also reports whether anything matched, avoiding a second scan.
        updated_text, replaced_count = pattern.subn(
            lambda _match: _STANDARD_TABLE_HEADER, page_text
        )
        if replaced_count:
            # Only the first matching layout is applied.
            return updated_text
    return page_text