Support judging whether the next page contains a table with the same structure as the current page.
If so, run the data extraction pipeline on the next page as well.
This commit is contained in:
parent
d0128d6279
commit
d0295995d8
|
|
@ -576,7 +576,7 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
# if page_num < 73:
|
||||
# if page_num < 75:
|
||||
# continue
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
|
|
@ -653,9 +653,15 @@ class DataExtraction:
|
|||
break
|
||||
if extract_way == "text":
|
||||
next_page_text = self.page_text_dict.get(next_page_num, "")
|
||||
with_same_structure_table = self.is_next_page_with_same_structure_table(
|
||||
current_text, next_page_text
|
||||
)
|
||||
if not with_same_structure_table:
|
||||
break
|
||||
target_text = current_text + next_page_text
|
||||
else:
|
||||
target_text = ""
|
||||
|
||||
# try to get data by current page_datapoints
|
||||
logger.info(f"Try to get data from next page {next_page_num}")
|
||||
next_page_extract_data = self.extract_data_by_page(
|
||||
|
|
@ -705,6 +711,28 @@ class DataExtraction:
|
|||
# self.output_data_to_file(data_list)
|
||||
return data_list
|
||||
|
||||
def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool:
    """Ask the chat model whether the next page holds a table shaped like the current one.

    The instruction prompts are read from ``compare_table_structure_prompts.json``
    inside ``self.instruction_folder`` and combined with both page texts into a
    single prompt. The model is asked for a JSON object; its ``"answer"`` field
    decides the result.

    Args:
        current_page_text: Text of the page currently being processed.
        next_page_text: Text of the page that follows it.

    Returns:
        True only when the chat call reports no error and the JSON response
        carries ``"answer"`` equal to "yes" (case-insensitive, surrounding
        whitespace ignored). False in every other case, including when the
        prompts file defines no prompts.

    Raises:
        OSError: If the prompts file cannot be opened.
        json.JSONDecodeError: If the prompts file is not valid JSON.
    """
    prompts_file = os.path.join(self.instruction_folder, "compare_table_structure_prompts.json")
    with open(prompts_file, "r", encoding="utf-8") as f:
        compare_table_structure_prompts = "\n".join(json.load(f).get("prompts", []))

    # Without instructions there is nothing to ask the model.
    if not compare_table_structure_prompts:
        return False

    prompts = f"Context: \ncurrent page contents:\n{current_page_text}\nnext page contents:\n{next_page_text}\nInstructions:\n{compare_table_structure_prompts}\n"
    result, with_error = chat(
        prompt=prompts, response_format={"type": "json_object"}, max_tokens=100
    )

    # Check the error flag before touching `result`: on failure it may not be
    # a dict (NOTE(review): confirm against chat()'s error contract).
    if with_error or not isinstance(result, dict):
        return False

    try:
        data = json.loads(result.get("response", ""))
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a non-string response.
        logger.warning("Could not parse table structure comparison response as JSON")
        return False

    # The model may return valid JSON that is not an object, or a non-string answer.
    if not isinstance(data, dict):
        return False
    answer = data.get("answer", "No")
    return isinstance(answer, str) and answer.strip().lower() == "yes"
|
||||
|
||||
def extract_data_by_image(self) -> dict:
|
||||
"""
|
||||
keys are
|
||||
|
|
|
|||
Loading…
Reference in New Issue