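Fix issue: skip table-structure comparison when the prompts file is missing, use the configured text/image models, and default "sepcial_rule_by_keywords" to an empty dict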

This commit is contained in:
blade 2025-11-12 14:07:55 +08:00
parent ea81197bcd
commit 7b0c825a39
3 changed files with 15 additions and 3 deletions

View File

@ -1112,12 +1112,14 @@ class DataExtraction:
def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool: def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool:
with_same_structure_table = False with_same_structure_table = False
compare_table_structure_prompts_file = os.path.join(self.instruction_folder, "compare_table_structure_prompts.json") compare_table_structure_prompts_file = os.path.join(self.instruction_folder, "compare_table_structure_prompts.json")
if not os.path.exists(compare_table_structure_prompts_file):
return with_same_structure_table
with open(compare_table_structure_prompts_file, "r", encoding="utf-8") as f: with open(compare_table_structure_prompts_file, "r", encoding="utf-8") as f:
compare_table_structure_prompts = "\n".join(json.load(f).get("prompts", [])) compare_table_structure_prompts = "\n".join(json.load(f).get("prompts", []))
if len(compare_table_structure_prompts) > 0: if len(compare_table_structure_prompts) > 0:
prompts = f"Context: \ncurrent page contents:\n{current_page_text}\nnext page contents:\n{next_page_text}\nInstructions:\n{compare_table_structure_prompts}\n" prompts = f"Context: \ncurrent page contents:\n{current_page_text}\nnext page contents:\n{next_page_text}\nInstructions:\n{compare_table_structure_prompts}\n"
result, with_error = chat( result, with_error = chat(
prompt=prompts, text_model="qwen-plus", image_model="qwen-vl-plus" prompt=prompts, text_model=self.text_model, image_model=self.image_model
) )
response = result.get("response", "") response = result.get("response", "")
if not with_error: if not with_error:
@ -2014,7 +2016,7 @@ class DataExtraction:
# The reason for applying special_rule_by_keywords is: # The reason for applying special_rule_by_keywords is:
# 1. The special rule is very complex, so its prompts are very long. # 1. The special rule is very complex, so its prompts are very long.
# 2. Loading it by keywords keeps the prompts short for simple cases. # 2. Loading it by keywords keeps the prompts short for simple cases.
complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", {})
with_special_rule_title = False with_special_rule_title = False
found_sub_datapoints = [] found_sub_datapoints = []
datapoint_special_rule = {} datapoint_special_rule = {}

View File

@ -0,0 +1,9 @@
{
"prompts": [
"Assume there is a data table in current page contents, is there the table with same table structure in the next page contents?",
"The meaning of \"same\" is: with totally same table columns for the table in both of current page and next page.",
"Please output JSON format, the format example is:",
"{\"answer\": \"Yes\"} or {\"answer\": \"No\"}",
"Answer:\n"
]
}

View File

@ -97,7 +97,8 @@
"The performance fees should not be the presence of the rates at which the performance fees are calculated.", "The performance fees should not be the presence of the rates at which the performance fees are calculated.",
"The reported of performance fees should not be \"% based on the NAV at the end of the accounting period\"" "The reported of performance fees should not be \"% based on the NAV at the end of the accounting period\""
] ]
} },
"sepcial_rule_by_keywords": {}
}, },
"special_cases": { "special_cases": {
"common": [ "common": [