diff --git a/calc_metrics.py b/calc_metrics.py index 53dca9e..840ee54 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -1308,6 +1308,10 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data def is_equal(gt_value, pred_value, data_point: str = ""): if gt_value is not None and len(str(gt_value)) > 0 and \ pred_value is not None and len(str(pred_value)) > 0: + if gt_value == "0.0": + gt_value = "0" + if pred_value == "0.0": + pred_value = "0" if gt_value == pred_value: return True if data_point == "benchmark_name": @@ -1351,7 +1355,7 @@ if __name__ == "__main__": verify_document_list_file_list = [None, "./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_17_documents_sample.txt"] - is_for_all = False + is_for_all = True for verify_document_list_file in verify_document_list_file_list: calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, audit_data_sheet=audit_data_sheet, diff --git a/configuration/aus_prospectus/replace_table_header.json b/configuration/aus_prospectus/replace_table_header.json new file mode 100644 index 0000000..7dfc383 --- /dev/null +++ b/configuration/aus_prospectus/replace_table_header.json @@ -0,0 +1,35 @@ +{ + "details": [ + { + "regex_all_list": + ["\\nIndirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n", + "\\n(Investment\\s*Option|Fund)[\\s\\S]*?Management\\s*fee[\\s\\S]*?Indirect\\s*costs[\\s\\S]*?performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?indirect\\s*costs[\\s\\S]*?(interposed\\s*vehicles|managers\\s*vehicles)\\s*\\n", + "\\nOption\\s*name\\s*Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n"], + "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n", + "comments": ["item 0: document 410899007", + "item 1: document 539266880, 539266817, 539261734", + "item 2: document 539266893"] + + }, + { + "regex_all_list": + ["Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Transactions\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads\\s*\\(\\%\\)\\s*\\n"], + "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n", + "comments": ["item 0: document 410899007"] + }, + { + "regex_all_list": + ["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n", + "Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"], + "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n", + "comments": ["item 0: document 401212184, page 17", + "item 1: document 401212184, page 18 - 20"] + }, + { + "regex_all_list": + ["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"], + "replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n", + "comments": ["item 0: document 411062815, page 17"] + } + ] +} \ No newline at end of file diff --git a/configuration/emea_ar/replace_table_header.json b/configuration/emea_ar/replace_table_header.json new file mode 100644 index 0000000..56b3955 --- /dev/null +++ b/configuration/emea_ar/replace_table_header.json @@ -0,0 +1,3 @@ +{ + "details": [] +} \ No newline at end of file diff --git a/core/data_extraction.py b/core/data_extraction.py index a81c8b6..ecd51b5 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -74,6 +74,7 @@ class DataExtraction: self.datapoint_level_config = self.get_datapoint_level() self.datapoint_type_config = self.get_datapoint_type() self.datapoint_name_config = self.get_datapoint_name() + self.replace_table_header_config = self.get_replace_table_header_config() self.datapoint_reported_name_config, self.non_english_reported_name_config = \ self.get_datapoint_reported_name() self.extract_way = extract_way @@ -206,6 +207,15 @@ class DataExtraction: datapoint_name = json.load(f) return datapoint_name + def get_replace_table_header_config(self) -> str: + replace_table_header_file = os.path.join(self.configuration_folder, "replace_table_header.json") + if os.path.exists(replace_table_header_file): + with open(replace_table_header_file, "r", encoding="utf-8") as f: + replace_table_header_config = json.load(f).get("details", []) + return replace_table_header_config + else: + return [] + def get_pdf_page_text_dict(self) -> dict: pdf_util = PDFUtil(self.pdf_file) success, text, page_text_dict = pdf_util.extract_text() @@ -598,7 +608,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 25: + # if page_num != 16: # continue if page_num in handled_page_num_list: continue @@ -616,7 +626,7 @@ class DataExtraction: else: previous_page_fund_name = None - page_text = replace_special_table_header(page_text) + page_text = replace_special_table_header(self.replace_table_header_config, page_text) extract_data = self.extract_data_by_page( page_num, page_text, @@ -681,7 +691,8 @@ class DataExtraction: ) if not with_same_structure_table: break - next_page_text = replace_special_table_header(next_page_text) + next_page_text = replace_special_table_header(self.replace_table_header_config, + next_page_text) target_text = current_text + next_page_text else: target_text = "" diff --git a/main.py b/main.py index d2dc800..9de2959 100644 --- a/main.py +++ b/main.py @@ -1526,8 +1526,8 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1560,7 +1560,7 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - # special_doc_id_list = ["506913190"] + # special_doc_id_list = ["411062815"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 9bab856..307be09 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -1036,7 +1036,7 @@ def remove_abundant_data_detail(data_detail_list: list, return data_detail_list -def replace_special_table_header(page_text: str): +def replace_special_table_header(replace_table_header_config: list, page_text: str): """ For some special table header, replace to the standard header e.g. @@ -1083,42 +1083,15 @@ def replace_special_table_header(page_text: str): Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n Performance \nfees charged \nby interposed \nvehicles \n """ - replace_info_list = [ - { - # item 0: document 410899007 - # item 1: document 539266880, 539266817, 539261734 - # item 2: document 539266893 - "regex_all_list": - [r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n", - r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n", - r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"], - "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" - }, - { - # item 0: document 410899007 - "regex_all_list": - [r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"], - "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n" - }, - { - # item 0: document 401212184, page 17 - 20 - "regex_all_list": - [r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Entry\s*Fee[\s\S]*?Nil\s*Entry[\s\S]*?Other\s*investment\s*costs[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Nil\s*Entry\s*Fee\s*.*\n", - r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Estimated\s*Other[\s\S]*?Entry\s*Fee\s*Nil\s*Entry[\s\S]*?Nil\s*Entry[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Fee\s*option.*\n"], - "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n" - }, - { - # item 0: document 411062815, page 17 - "regex_all_list": - [r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"], - "replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n" - } - ] + if replace_table_header_config is None or len(replace_table_header_config) == 0: + return page_text updated_text = False - for replace_info in replace_info_list: - for regex_all in replace_info["regex_all_list"]: - if re.search(regex_all, page_text) is not None: - page_text = re.sub(regex_all, replace_info["replace_text"], page_text) + for replace_info in replace_table_header_config: + for regex_all in replace_info.get("regex_all_list", []): + table_header_search = re.search(regex_all, page_text) + if table_header_search is not None: + original_text = table_header_search.group() + page_text = re.sub(regex_all, replace_info.get("replace_text", original_text), page_text) updated_text = True break if updated_text: