Support judging whether the next page contains a table with the same structure as the one on the current page.

If so, run the data extraction pipeline on the next page as well.
2025-02-27 23:08:57 -06:00 · 2025-02-27 23:08:57 -06:00 · d0295995d8
parent d0128d6279
commit d0295995d8
1 changed files with 29 additions and 1 deletions
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@ -576,7 +576,7 @@ class DataExtraction:
        previous_page_datapoints = []
        previous_page_fund_name = None
        for page_num, page_text in self.page_text_dict.items():
-            # if page_num < 73:
+            # if page_num < 75:
            #     continue
            if page_num in handled_page_num_list:
                continue
@ -653,9 +653,15 @@ class DataExtraction:
                                break
                        if extract_way == "text":
                            next_page_text = self.page_text_dict.get(next_page_num, "")
                            with_same_structure_table = self.is_next_page_with_same_structure_table(
                                current_text, next_page_text
                            )
                            if not with_same_structure_table:
                                break
                            target_text = current_text + next_page_text
                        else:
                            target_text = ""
                        # try to get data by current page_datapoints
                        logger.info(f"Try to get data from next page {next_page_num}")
                        next_page_extract_data = self.extract_data_by_page(
@ -705,6 +711,28 @@ class DataExtraction:
        # self.output_data_to_file(data_list)
        return data_list
    def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool:
        """
        Ask ``chat`` whether the next page holds a table with the same
        structure as the one on the current page.

        The comparison instructions are read from
        ``compare_table_structure_prompts.json`` inside
        ``self.instruction_folder`` (its ``"prompts"`` list, joined with
        newlines) and sent together with the text of both pages. The reply is
        parsed as a JSON object and its ``"answer"`` field is inspected.

        Args:
            current_page_text: text of the page currently being processed.
            next_page_text: text of the page that follows it.

        Returns:
            True only when the parsed answer equals "yes" (case-insensitive,
            surrounding whitespace ignored). False when the prompts list is
            empty, when ``chat`` reports an error, or when the reply cannot
            be interpreted.

        Raises:
            OSError: if the prompts file cannot be opened.
            json.JSONDecodeError: if the prompts file is not valid JSON.
        """
        prompts_file = os.path.join(self.instruction_folder, "compare_table_structure_prompts.json")
        with open(prompts_file, "r", encoding="utf-8") as f:
            instructions = "\n".join(json.load(f).get("prompts", []))
        if not instructions:
            # Nothing to ask the model with, so never claim a match.
            return False

        prompts = f"Context: \ncurrent page contents:\n{current_page_text}\nnext page contents:\n{next_page_text}\nInstructions:\n{instructions}\n"
        result, with_error = chat(
            prompt=prompts, response_format={"type": "json_object"}, max_tokens=100
        )
        if with_error:
            return False

        # Only touch the result once the call is known to have succeeded.
        response = result.get("response", "")
        try:
            answer = json.loads(response).get("answer", "No")
            return answer.strip().lower() == "yes"
        except (ValueError, TypeError, AttributeError) as e:
            # ValueError covers json.JSONDecodeError; TypeError/AttributeError
            # cover a non-string response, a non-object JSON payload, or a
            # non-string answer. Stay best-effort, but leave a trace.
            logger.warning(f"Failed to parse table structure comparison response: {e}")
            return False
    def extract_data_by_image(self) -> dict:
        """
        keys are