diff --git a/core/data_extraction.py b/core/data_extraction.py index 685ff15..4dee667 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -1112,12 +1112,14 @@ class DataExtraction: def is_next_page_with_same_structure_table(self, current_page_text: str, next_page_text: str) -> bool: with_same_structure_table = False compare_table_structure_prompts_file = os.path.join(self.instruction_folder, "compare_table_structure_prompts.json") + if not os.path.exists(compare_table_structure_prompts_file): + return with_same_structure_table with open(compare_table_structure_prompts_file, "r", encoding="utf-8") as f: compare_table_structure_prompts = "\n".join(json.load(f).get("prompts", [])) if len(compare_table_structure_prompts) > 0: prompts = f"Context: \ncurrent page contents:\n{current_page_text}\nnext page contents:\n{next_page_text}\nInstructions:\n{compare_table_structure_prompts}\n" result, with_error = chat( - prompt=prompts, text_model="qwen-plus", image_model="qwen-vl-plus" + prompt=prompts, text_model=self.text_model, image_model=self.image_model ) response = result.get("response", "") if not with_error: @@ -2014,7 +2016,7 @@ class DataExtraction: # The reason why apply special_rule_by_keywords is: # 1. The special rule is very complex, prompsts are very long. # 2. To load it by keywords, is to avoid for simple case, the prompts are too long. - complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") + complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", {}) with_special_rule_title = False found_sub_datapoints = [] datapoint_special_rule = {} diff --git a/instructions/emea_ar/compare_table_structure_prompts.json b/instructions/emea_ar/compare_table_structure_prompts.json new file mode 100644 index 0000000..a5ac3c5 --- /dev/null +++ b/instructions/emea_ar/compare_table_structure_prompts.json @@ -0,0 +1,9 @@ +{ + "prompts": [ + "Assume there is a data table in current page contents, is there the table with same table structure in the next page contents?", + "The meaning of \"same\" is: with totally same table columns for the table in both of current page and next page.", + "Please output JSON format, the format example is:", + "{\"answer\": \"Yes\"} or {\"answer\": \"No\"}", + "Answer:\n" + ] +} diff --git a/instructions/emea_ar/data_extraction_prompts_config.json b/instructions/emea_ar/data_extraction_prompts_config.json index b5f718a..93e202b 100644 --- a/instructions/emea_ar/data_extraction_prompts_config.json +++ b/instructions/emea_ar/data_extraction_prompts_config.json @@ -97,7 +97,8 @@ "The performance fees should not be the presence of the rates at which the performance fees are calculated.", "The reported of performance fees should not be \"% based on the NAV at the end of the accounting period\"" ] - } + }, + "sepcial_rule_by_keywords": {} }, "special_cases": { "common": [