def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool:
    """Decide whether the next page holds a table shaped like the current page's.

    The comparison instructions are read from
    ``compare_table_structure_prompts.json`` (key ``"prompts"``, a list of
    strings) inside ``self.instruction_folder``. They are sent together with
    both page texts to the project's ``chat`` helper (presumably an LLM
    completion call -- confirm against its definition), which is asked for
    a JSON object. Only an ``"answer"`` value equal to "yes" (ignoring case
    and surrounding whitespace) yields True.

    In the text-mode page look-ahead of the extraction loop, a False result
    stops the look-ahead, while a True result lets the caller concatenate
    the current and next page text before extracting.

    Args:
        current_page_text: Text of the page currently being processed.
        next_page_text: Text of the following page; may be empty when the
            page number is missing from the page-text mapping.

    Returns:
        True only when the reply is an affirmative answer. False for blank
        page text, an empty prompt list, a reported chat error, or a reply
        that cannot be interpreted.

    Raises:
        OSError: If the prompts file cannot be opened.
        json.JSONDecodeError: If the prompts file is not valid JSON.
    """
    # A blank page cannot contain a table, so skip the file read and the
    # comparatively expensive chat request altogether.
    if not (current_page_text or "").strip() or not (next_page_text or "").strip():
        return False

    prompts_file = os.path.join(
        self.instruction_folder, "compare_table_structure_prompts.json"
    )
    with open(prompts_file, "r", encoding="utf-8") as f:
        instructions = "\n".join(json.load(f).get("prompts", []))
    if not instructions:
        # Without instructions there is nothing meaningful to ask.
        return False

    prompt = (
        "Context: \n"
        f"current page contents:\n{current_page_text}\n"
        f"next page contents:\n{next_page_text}\n"
        f"Instructions:\n{instructions}\n"
    )
    result, with_error = chat(
        prompt=prompt, response_format={"type": "json_object"}, max_tokens=100
    )
    if with_error:
        # Only touch `result` after the error flag has been checked; its
        # shape on failure is not guaranteed by anything visible here.
        logger.warning("Table structure comparison failed; treating as different")
        return False

    response = result.get("response", "")
    try:
        data = json.loads(response)
    except (TypeError, ValueError) as exc:
        # json.JSONDecodeError subclasses ValueError; TypeError covers a
        # response that is not str/bytes. Stay best-effort, but say so.
        logger.warning("Unparseable table structure comparison response: %s", exc)
        return False

    # The reply may be valid JSON without being an object, and the answer
    # may not be a string; neither case counts as "yes".
    answer = data.get("answer", "No") if isinstance(data, dict) else "No"
    return isinstance(answer, str) and answer.strip().lower() == "yes"