Support judging whether the next page contains a table with the same structure as the current page.

If so, run the data extraction pipeline on the next page as well.
This commit is contained in:
Blade He 2025-02-27 23:08:57 -06:00
parent d0128d6279
commit d0295995d8
1 changed files with 29 additions and 1 deletions

View File

@ -576,7 +576,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num < 73: # if page_num < 75:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue
@ -653,9 +653,15 @@ class DataExtraction:
break break
if extract_way == "text": if extract_way == "text":
next_page_text = self.page_text_dict.get(next_page_num, "") next_page_text = self.page_text_dict.get(next_page_num, "")
with_same_structure_table = self.is_next_page_with_same_structure_table(
current_text, next_page_text
)
if not with_same_structure_table:
break
target_text = current_text + next_page_text target_text = current_text + next_page_text
else: else:
target_text = "" target_text = ""
# try to get data by current page_datapoints # try to get data by current page_datapoints
logger.info(f"Try to get data from next page {next_page_num}") logger.info(f"Try to get data from next page {next_page_num}")
next_page_extract_data = self.extract_data_by_page( next_page_extract_data = self.extract_data_by_page(
@ -705,6 +711,28 @@ class DataExtraction:
# self.output_data_to_file(data_list) # self.output_data_to_file(data_list)
return data_list return data_list
def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool:
    """Ask the chat model whether the next page's table matches the current page's.

    The comparison instructions are read from
    ``compare_table_structure_prompts.json`` (key ``"prompts"``, a list of
    strings) inside ``self.instruction_folder``. Both page texts and the
    instructions are sent to ``chat`` with a JSON-object response format, and
    the reply is expected to look like ``{"answer": "Yes"}``.

    Args:
        current_page_text: Text of the page currently being processed.
        next_page_text: Text of the page that follows it.

    Returns:
        True only when the model's ``"answer"`` is "yes" (case-insensitive,
        surrounding whitespace ignored). False when there are no
        instructions, when ``chat`` reports an error, or when the reply
        cannot be interpreted.

    Raises:
        OSError, json.JSONDecodeError: Propagated from opening or parsing the
            prompts file; these are not suppressed.
    """
    prompts_file = os.path.join(
        self.instruction_folder, "compare_table_structure_prompts.json"
    )
    with open(prompts_file, "r", encoding="utf-8") as f:
        instructions = "\n".join(json.load(f).get("prompts", []))

    # Without instructions there is nothing to ask the model.
    if not instructions:
        return False

    prompts = (
        f"Context: \ncurrent page contents:\n{current_page_text}\n"
        f"next page contents:\n{next_page_text}\n"
        f"Instructions:\n{instructions}\n"
    )
    result, with_error = chat(
        prompt=prompts, response_format={"type": "json_object"}, max_tokens=100
    )

    # Check the error flag before touching ``result``: its shape on failure
    # is not visible from here, so do not assume it is a dict.
    if with_error:
        return False
    response = result.get("response", "") if isinstance(result, dict) else ""

    # An unreadable reply is treated as "not the same structure" rather than
    # aborting the extraction. Only parse failures are caught, so unrelated
    # errors (and KeyboardInterrupt/SystemExit) are no longer swallowed.
    try:
        data = json.loads(response)
    except (TypeError, ValueError):
        return False

    if not isinstance(data, dict):
        return False
    answer = data.get("answer", "No")
    return isinstance(answer, str) and answer.strip().lower() == "yes"
def extract_data_by_image(self) -> dict: def extract_data_by_image(self) -> dict:
""" """
keys are keys are