Support judging whether the next page contains a table with the same structure as the current page.
If yes, run the next page through the data extraction pipeline.
This commit is contained in:
parent
d0128d6279
commit
d0295995d8
|
|
@ -576,7 +576,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num < 73:
|
# if page_num < 75:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
@ -653,9 +653,15 @@ class DataExtraction:
|
||||||
break
|
break
|
||||||
if extract_way == "text":
|
if extract_way == "text":
|
||||||
next_page_text = self.page_text_dict.get(next_page_num, "")
|
next_page_text = self.page_text_dict.get(next_page_num, "")
|
||||||
|
with_same_structure_table = self.is_next_page_with_same_structure_table(
|
||||||
|
current_text, next_page_text
|
||||||
|
)
|
||||||
|
if not with_same_structure_table:
|
||||||
|
break
|
||||||
target_text = current_text + next_page_text
|
target_text = current_text + next_page_text
|
||||||
else:
|
else:
|
||||||
target_text = ""
|
target_text = ""
|
||||||
|
|
||||||
# try to get data by current page_datapoints
|
# try to get data by current page_datapoints
|
||||||
logger.info(f"Try to get data from next page {next_page_num}")
|
logger.info(f"Try to get data from next page {next_page_num}")
|
||||||
next_page_extract_data = self.extract_data_by_page(
|
next_page_extract_data = self.extract_data_by_page(
|
||||||
|
|
@ -705,6 +711,28 @@ class DataExtraction:
|
||||||
# self.output_data_to_file(data_list)
|
# self.output_data_to_file(data_list)
|
||||||
return data_list
|
return data_list
|
||||||
|
|
||||||
|
def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool:
    """Ask the LLM whether the next page holds a table shaped like the current one.

    The comparison instructions are read from
    ``compare_table_structure_prompts.json`` in ``self.instruction_folder``
    (its ``"prompts"`` list is joined with newlines). Both page texts plus
    those instructions are sent to ``chat`` with a JSON-object response
    format, and the reply is expected to carry an ``"answer"`` key.

    Args:
        current_page_text: Text of the page currently being processed.
        next_page_text: Text of the page that follows it.

    Returns:
        True only when the call succeeds and the reply's ``"answer"`` is
        "yes" (case-insensitive, surrounding whitespace ignored). False when
        there are no instructions, the call reports an error, or the reply
        cannot be parsed.

    Raises:
        OSError: If the prompts file cannot be opened.
        ValueError: If the prompts file is not valid JSON.
    """
    prompts_file = os.path.join(
        self.instruction_folder, "compare_table_structure_prompts.json"
    )
    with open(prompts_file, "r", encoding="utf-8") as f:
        instructions = "\n".join(json.load(f).get("prompts", []))

    # Without instructions there is nothing to ask; skip the LLM call.
    if not instructions:
        return False

    prompts = (
        f"Context: \ncurrent page contents:\n{current_page_text}\n"
        f"next page contents:\n{next_page_text}\n"
        f"Instructions:\n{instructions}\n"
    )
    result, with_error = chat(
        prompt=prompts, response_format={"type": "json_object"}, max_tokens=100
    )
    # Check the error flag before touching `result`, which may not be usable
    # when the call failed.
    if with_error:
        return False

    try:
        data = json.loads(result.get("response", ""))
        answer = data.get("answer", "No")
    except (ValueError, TypeError, AttributeError) as e:
        # Best effort: an unparsable or oddly shaped reply counts as "not the
        # same structure", but leave a trace instead of hiding it silently.
        logger.warning(f"Unable to parse table structure comparison response: {e}")
        return False

    # The model may return a non-string value for "answer"; treat it as "No".
    return isinstance(answer, str) and answer.strip().lower() == "yes"
|
||||||
|
|
||||||
def extract_data_by_image(self) -> dict:
|
def extract_data_by_image(self) -> dict:
|
||||||
"""
|
"""
|
||||||
keys are
|
keys are
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue