From d3be7118594e3c666061beb92c74b2ac1bd6e46d Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 28 Feb 2025 22:12:18 -0600 Subject: [PATCH] optimize administration fees instructions --- core/data_extraction.py | 2 +- core/page_filter.py | 40 ++++++++++--------- .../data_extraction_prompts_config.json | 7 ++++ main.py | 6 +-- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index c8b1c49..3f22633 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -575,7 +575,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num != 16: + # if page_num != 8: # continue if page_num in handled_page_num_list: continue diff --git a/core/page_filter.py b/core/page_filter.py index 93c7d07..cd82137 100644 --- a/core/page_filter.py +++ b/core/page_filter.py @@ -302,28 +302,32 @@ class FilterPages: for split in search_text_split: if split[0].islower(): lower_word_count += 1 - if lower_word_count < lower_word_count_threshold: - if re.search(self.percentage_regex, search_text) is not None: - is_valid = True + if self.doc_source == "emea_ar" and \ + lower_word_count > lower_word_count_threshold: + is_valid = False break - new_search_text_regex = add_slash_to_text_as_regex(search_text) - new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format( - new_search_text_regex - ) - new_search = re.search(new_search_regex, text, re.IGNORECASE) - if new_search is not None: - next_line = new_search.group("next_line").strip() - next_2_line = new_search.group("next_2_line").strip() - - if re.search(big_number_regex, next_line) is not None or \ - re.search(big_number_regex, next_2_line) is not None: - is_valid = False - else: - is_valid = True - break + + if re.search(self.percentage_regex, search_text) is not None: + is_valid = True + break + new_search_text_regex = add_slash_to_text_as_regex(search_text) + 
new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format( + new_search_text_regex + ) + new_search = re.search(new_search_regex, text, re.IGNORECASE) + if new_search is not None: + next_line = new_search.group("next_line").strip() + next_2_line = new_search.group("next_2_line").strip() + + if re.search(big_number_regex, next_line) is not None or \ + re.search(big_number_regex, next_2_line) is not None: + is_valid = False else: is_valid = True break + else: + is_valid = True + break return is_valid def search_keyword(self, text: str, keyword: str): diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index d72ae96..441775d 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -249,6 +249,13 @@ "----Example 1 End----", "The output should be:", "{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}", + "\n", + "----Example 2 Start----", + "At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)", + "----Example 2 End----", + "The administration fee is $1.00 per week plus 0.17% pa, so the output should be:", + "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}", + "\n", "Complex cases:", "A. 
Need to add multiple numbers together.", "----Example 1 Start----", diff --git a/main.py b/main.py index 3ff2017..fd90600 100644 --- a/main.py +++ b/main.py @@ -1043,8 +1043,8 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True calculate_metrics = False @@ -1521,7 +1521,7 @@ if __name__ == "__main__": # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" - # special_doc_id_list: list = ["411062815"] + # special_doc_id_list: list = ["412778803"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = (