From d3be7118594e3c666061beb92c74b2ac1bd6e46d Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Fri, 28 Feb 2025 22:12:18 -0600
Subject: [PATCH] Optimize administration fee instructions

---
 core/data_extraction.py                       |  2 +-
 core/page_filter.py                           | 40 ++++++++++---------
 .../data_extraction_prompts_config.json       |  7 ++++
 main.py                                       |  6 +--
 4 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/core/data_extraction.py b/core/data_extraction.py
index c8b1c49..3f22633 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -575,7 +575,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num != 16:
+            # if page_num != 8:
             #     continue
             if page_num in handled_page_num_list:
                 continue
diff --git a/core/page_filter.py b/core/page_filter.py
index 93c7d07..cd82137 100644
--- a/core/page_filter.py
+++ b/core/page_filter.py
@@ -302,28 +302,32 @@ class FilterPages:
                 for split in search_text_split:
                     if split[0].islower():
                         lower_word_count += 1
-                if lower_word_count < lower_word_count_threshold:
-                    if re.search(self.percentage_regex, search_text) is not None:
-                        is_valid = True
+                if self.doc_source == "emea_ar" and \
+                    lower_word_count > lower_word_count_threshold:
+                        is_valid = False
                         break
-                    new_search_text_regex = add_slash_to_text_as_regex(search_text)
-                    new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
-                        new_search_text_regex
-                    )
-                    new_search = re.search(new_search_regex, text, re.IGNORECASE)
-                    if new_search is not None:
-                        next_line = new_search.group("next_line").strip()
-                        next_2_line = new_search.group("next_2_line").strip()
-                        
-                        if re.search(big_number_regex, next_line) is not None or \
-                            re.search(big_number_regex, next_2_line) is not None:
-                            is_valid = False
-                        else:
-                            is_valid = True
-                            break
+
+                if re.search(self.percentage_regex, search_text) is not None:
+                    is_valid = True
+                    break
+                new_search_text_regex = add_slash_to_text_as_regex(search_text)
+                new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
+                    new_search_text_regex
+                )
+                new_search = re.search(new_search_regex, text, re.IGNORECASE)
+                if new_search is not None:
+                    next_line = new_search.group("next_line").strip()
+                    next_2_line = new_search.group("next_2_line").strip()
+                    
+                    if re.search(big_number_regex, next_line) is not None or \
+                        re.search(big_number_regex, next_2_line) is not None:
+                        is_valid = False
                     else:
                         is_valid = True
                         break
+                else:
+                    is_valid = True
+                    break
         return is_valid
 
     def search_keyword(self, text: str, keyword: str):
diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json
index d72ae96..441775d 100644
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@@ -249,6 +249,13 @@
 				"----Example 1 End----",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}",
+				"\n",
+				"----Example 2 Start----",
+				"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)",
+				"----Example 2 End----",
+				"The administration fee is $1.00 per week plus 0.17% pa, so the output should be:",
+				"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}",
+				"\n",
 				"Complex cases:",
 				"A. Need to add multiple numbers together.",
 				"----Example 1 Start----",
diff --git a/main.py b/main.py
index 3ff2017..fd90600 100644
--- a/main.py
+++ b/main.py
@@ -1043,8 +1043,8 @@ def batch_run_documents(
     page_filter_ground_truth_file = (
         r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
     )
-    re_run_extract_data = True
-    re_run_mapping_data = True
+    re_run_extract_data = False
+    re_run_mapping_data = False
     force_save_total_data = True
     calculate_metrics = False
 
@@ -1521,7 +1521,7 @@ if __name__ == "__main__":
         # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
         # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
         document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
-        # special_doc_id_list: list = ["411062815"]
+        # special_doc_id_list: list = ["412778803"]
         pdf_folder: str = r"/data/aus_prospectus/pdf/"
         output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
         output_extract_data_child_folder: str = (