From d0295995d8e4dc0b400a4f6a5b4538ee2103a6c7 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Thu, 27 Feb 2025 23:08:57 -0600
Subject: [PATCH] Support checking whether the next page has a table with the
 same structure as the current page. If so, run the next-page data extraction pipeline.

---
 core/data_extraction.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/core/data_extraction.py b/core/data_extraction.py
index a29bf92..9b1756d 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -576,7 +576,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num < 73:
+            # if page_num < 75:
             #     continue
             if page_num in handled_page_num_list:
                 continue
@@ -653,9 +653,15 @@ class DataExtraction:
                                 break
                         if extract_way == "text":
                             next_page_text = self.page_text_dict.get(next_page_num, "")
+                            with_same_structure_table = self.is_next_page_with_same_structure_table(
+                                current_text, next_page_text
+                            )
+                            if not with_same_structure_table:
+                                break
                             target_text = current_text + next_page_text
                         else:
                             target_text = ""
+                        
                         # try to get data by current page_datapoints
                         logger.info(f"Try to get data from next page {next_page_num}")
                         next_page_extract_data = self.extract_data_by_page(
@@ -705,6 +711,28 @@ class DataExtraction:
         # self.output_data_to_file(data_list)
         return data_list
     
+    def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool:
+        """Return True only if chat() answers "yes" when asked to compare both pages' tables.
+
+        Instructions come from compare_table_structure_prompts.json in self.instruction_folder;
+        empty instructions, a chat error, or an unparsable reply all yield False.
+        """
+        prompts_file = os.path.join(self.instruction_folder, "compare_table_structure_prompts.json")
+        with open(prompts_file, "r", encoding="utf-8") as f:
+            instructions = "\n".join(json.load(f).get("prompts", []))
+        if not instructions:
+            return False
+        prompts = f"Context: \ncurrent page contents:\n{current_page_text}\nnext page contents:\n{next_page_text}\nInstructions:\n{instructions}\n"
+        result, with_error = chat(prompt=prompts, response_format={"type": "json_object"}, max_tokens=100)
+        if with_error:
+            return False
+        try:
+            answer = json.loads(result.get("response", "")).get("answer", "No")
+        except (json.JSONDecodeError, TypeError, AttributeError) as e:  # malformed reply => not the same structure
+            logger.warning(f"Cannot parse table structure comparison response: {e}")
+            return False
+        return isinstance(answer, str) and answer.lower() == "yes"
+    
     def extract_data_by_image(self) -> dict:
         """
         keys are