From 15720d8bfd07f7eccdff9c5c610de0ba90cdf4cb Mon Sep 17 00:00:00 2001
From: Blade He
Date: Mon, 26 Aug 2024 17:17:39 -0500
Subject: [PATCH] 1. Text-and-image all-in-one chat function via ChatGPT4o 2. Many experiments for extracting data in two ways: page text or page image.

---
 .../data_extraction_image_prompts.txt         | 47 +++++++++
 instructions/data_extraction_prompts.txt      | 29 ++++++
 ...able_extraction_image_optimize_prompts.txt | 35 +++++++
 .../table_extraction_image_prompts.txt        |  2 +-
 .../table_extraction_image_prompts_v2.txt     | 11 +++
 .../text_extraction_image_prompts.txt         | 11 +++
 playground.py                                 | 99 +++++++++++++++----
 7 files changed, 216 insertions(+), 18 deletions(-)
 create mode 100644 instructions/data_extraction_image_prompts.txt
 create mode 100644 instructions/data_extraction_prompts.txt
 create mode 100644 instructions/table_extraction_image_optimize_prompts.txt
 create mode 100644 instructions/table_extraction_image_prompts_v2.txt
 create mode 100644 instructions/text_extraction_image_prompts.txt

diff --git a/instructions/data_extraction_image_prompts.txt b/instructions/data_extraction_image_prompts.txt
new file mode 100644
index 0000000..444a75d
--- /dev/null
+++ b/instructions/data_extraction_image_prompts.txt
@@ -0,0 +1,47 @@
+Instructions:
+Please read the image carefully.
+1. Identify the text on the PDF page image.
+The text will be output with the key: "text".
+2. Identify and format all of the tables on the PDF page image.
+Table contents should be in markdown format,
+ensuring the table structure and contents are exactly as in the PDF page image.
+The format should be: |Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|
+Each cell in the table(s) should be in the proper position of the relevant row and column.
+The markdown table(s) will be output with the key: "table_contents".
+3. Extract data from the parsed text and table contents above.
+3.1 Use the parsed text and table contents above as the context.
+3.2 Data extraction from the parsed table contents
+There may be TER and performance fees data in the parsed table(s).
+The reported name of the TER could be: Total Expense Ratio, TER, Annualised TER including performance fees, etc.
+The reported name of the performance fees could be: performance fees, performance fees ratio, etc.
+If both "TER including performance fees" and "TER excluding performance fees" exist, the performance fees should be:
+TER including performance fees - TER excluding performance fees.
+The TER and performance fees values are percentages, so each value should be less than 100.
+In most cases, the data is in the table(s) of the context.
+
+3.3 Fund name / share class name extraction from the context above
+Please extract the fund name and share class name from the context.
+If the fund name or share class name cannot be found in the table contents,
+please try to find them in the parsed text contents.
+
+3.4 Output
+If possible, extract the fund name, share class name, and TER or performance fees value as the output.
+One fund could have multiple share classes, each with its own TER or performance fees value.
+The output should be in JSON format, like:
+{
+    "text": "text from image",
+    "table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"],
+    "data":
+    [{
+        "fund name": "fund 1",
+        "share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
+    },
+    {
+        "fund name": "fund 2",
+        "share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
+    }]
+}
+Only output JSON data.
+If the share class name cannot be found in the context, please output empty JSON data: []
+
+Answer:
\ No newline at end of file
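For reference, the JSON shape requested above can be consumed with a little plain Python. The sketch below is illustrative only (these helper names are not part of this patch); it applies the two rules stated in the prompt: percentage values must be below 100, and the performance fee can be derived as the difference between the two TER figures when both are reported.

import json


def derive_performance_fee(ter_incl, ter_excl):
    # Per the prompt: performance fees = "TER including performance fees"
    # minus "TER excluding performance fees", when both are reported.
    if ter_incl is None or ter_excl is None:
        return None
    return round(ter_incl - ter_excl, 4)


def load_extraction(payload: str):
    # Parse the model output and apply the prompt's sanity check:
    # TER and performance fees are percentages, so values must be below 100.
    doc = json.loads(payload)
    funds = []
    for fund in doc.get("data", []):
        shares = []
        for share in fund.get("share data", []):
            values = (share.get("ter"), share.get("performance fees"))
            if any(v is not None and not 0 <= v < 100 for v in values):
                continue  # drop implausible rows rather than trusting them
            shares.append(share)
        funds.append({"fund name": fund.get("fund name"), "share data": shares})
    return funds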
diff --git a/instructions/data_extraction_prompts.txt b/instructions/data_extraction_prompts.txt
new file mode 100644
index 0000000..6ba469d
--- /dev/null
+++ b/instructions/data_extraction_prompts.txt
@@ -0,0 +1,29 @@
+Context:
+{page_text}
+
+Read the context carefully.
+There may be TER and performance fees data in the context. The reported name of the TER could be:
+Total Expense Ratio, TER, Annualised TER including performance fees, etc.
+The reported name of the performance fees could be:
+performance fees, performance fees ratio, etc.
+If both "TER including performance fees" and "TER excluding performance fees" exist, the performance fees should be:
+TER including performance fees - TER excluding performance fees.
+
+The TER and performance fees values are percentages, so each value should be less than 100.
+In most cases, the data is in the table(s) of the context.
+If there are multiple TER / performance fee values in the same row, please extract the latest one.
+If possible, extract the fund name, share class name, and TER or performance fees value as the output.
+One fund could have multiple share classes, each with its own TER value.
+The output should be in JSON format, like:
+[{
+    "fund name": "fund 1",
+    "share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
+},
+{
+    "fund name": "fund 2",
+    "share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
+}]
+Only output JSON data.
+If the share class name cannot be found in the context, please output empty JSON data: []
+
+Answer:
\ No newline at end of file
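One implementation note on the {page_text} placeholder above: because the template also contains literal JSON braces in its example output, str.format() would treat those braces as replacement fields and fail, so substituting the single placeholder directly is the safer way to build the prompt. A minimal sketch (build_text_prompt is a hypothetical helper, not part of this patch):

def build_text_prompt(template_path: str, page_text: str) -> str:
    # Read the prompt template and substitute its one placeholder.
    # str.format() is avoided because the embedded JSON example uses
    # literal braces, which format() would try to interpret as fields.
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()
    return template.replace("{page_text}", page_text)

# Usage sketch:
# prompt = build_text_prompt("./instructions/data_extraction_prompts.txt", page_text)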
diff --git a/instructions/table_extraction_image_optimize_prompts.txt b/instructions/table_extraction_image_optimize_prompts.txt
new file mode 100644
index 0000000..10122a4
--- /dev/null
+++ b/instructions/table_extraction_image_optimize_prompts.txt
@@ -0,0 +1,35 @@
+Smith is a professional who processes financial reports.
+He wants to extract table(s) from a PDF and output them in markdown format.
+He decides to ask ChatGPT4o to help him with this.
+Smith's prompt is as below:
+--------------------------------------Smith's prompts start--------------------------------------
+Instructions:
+Please read the image carefully.
+Answer below questions:
+1. Please find the table or tables in the image.
+2. Output the table contents as markdown format, it's like:
+|name|age|hobby|
+|Annie|18|music|
+The contents should be exactly precise as the image contents.
+3. Please output the results as JSON format, the result member is with legal markdown table format, the example is:
+{
+"tables": ["
+|name|age|hobby|
+|Annie|18|music|
+"]
+}
+4. Only output JSON with tables
+
+Here is the answer from ChatGPT4o:
+--------------------------------------ChatGPT4o start--------------------------------------
+|Share Class|TER for the year (Note 6)|\n|---|---|\n|AI - Shares| |\n|BF - Shares| |\n|BI - Shares| |\n|BP - Shares| |\n|E - Shares|0.30%|\n|HAF - SEK Shares|0.84%|\n|HAI - SEK Shares|1.59%|\n|HB - EUR Shares| |\n|HB - SEK Shares| |\n|HBC - EUR Shares|0.65%|\n|HBF - EUR Shares| |\n|HBF - NOK Shares| |\n|HBF - SEK Shares| |\n|HBI - DKK Shares| |\n|HBI - EUR Shares| |\n|HBI - NOK Shares| |\n|HBI - SEK Shares| |\n|HY - DKK Shares| |\n|HY - EUR Shares| |\n|HY - SEK Shares| |\n|LE - Shares| |\n|LP - Shares| |\n|X - Shares| |\n|Y - Shares|0.09%|
+--------------------------------------ChatGPT4o end--------------------------------------
+
+But the answer is incorrect; the correct answer is as below:
+--------------------------------------correct answer start--------------------------------------
+|Share Class|TER for the year (Note 6)|\n|---|---|\n|AI - Shares| |\n|BF - Shares| |\n|BI - Shares|0.30%|\n|BP - Shares|0.84%|\n|E - Shares|1.59%|\n|HAF - SEK Shares| |\n|HAI - SEK Shares| |\n|HB - EUR Shares| |\n|HB - SEK Shares| |\n|HBC - EUR Shares|0.65%|\n|HBF - EUR Shares| |\n|HBF - NOK Shares| |\n|HBF - SEK Shares| |\n|HBI - DKK Shares| |\n|HBI - EUR Shares| |\n|HBI - NOK Shares| |\n|HBI - SEK Shares| |\n|HY - DKK Shares| |\n|HY - EUR Shares| |\n|HY - SEK Shares| |\n|LE - Shares| |\n|LP - Shares| |\n|X - Shares| |\n|Y - Shares|0.09%|
+--------------------------------------correct answer end--------------------------------------
+
+Please analyze the image, the incorrect answer, and the correct answer, then help Mr. Smith optimize the instructions. Output the result in JSON format: {"Instructions": "optimized instructions"}
+
+Answer:
diff --git a/instructions/table_extraction_image_prompts.txt b/instructions/table_extraction_image_prompts.txt
index c823582..e0348e5 100644
--- a/instructions/table_extraction_image_prompts.txt
+++ b/instructions/table_extraction_image_prompts.txt
@@ -1,7 +1,7 @@
 Instructions:
 Please read the image carefully.
 Answer below questions:
-1. Please find the table or tables in the image. 
+1. Please find the table or tables in the image.
 2. Output the table contents as markdown format, it's like:
 |name|age|hobby|
 |Annie|18|music|
diff --git a/instructions/table_extraction_image_prompts_v2.txt b/instructions/table_extraction_image_prompts_v2.txt
new file mode 100644
index 0000000..4be0d12
--- /dev/null
+++ b/instructions/table_extraction_image_prompts_v2.txt
@@ -0,0 +1,11 @@
+Instructions:
+Please read the image carefully.
+Answer the following questions:
+1. Identify the table or tables in the image.
+2. Output the table contents in markdown format, ensuring the table structure and contents are exactly as in the image.
+The format should be: |Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|
+3. Output the results in JSON format with the key 'tables' containing the markdown table(s).
+The format should be:
+{"tables": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"]}
+4. Only output JSON with tables.
+Answer:
\ No newline at end of file
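The table prompts above return each table as one markdown string with literal \n separators between rows. A small helper along the following lines (illustrative, not part of this patch) turns such a string back into cell rows for downstream checks:

def markdown_table_to_rows(table_md: str):
    # Split a markdown string returned under "tables" into cell rows,
    # skipping the |---|---| separator line.
    rows = []
    for line in table_md.split("\n"):
        line = line.strip()
        if not line:
            continue
        cells = [c.strip() for c in line.strip("|").split("|")]
        if cells and all(set(c) <= set("-: ") for c in cells):
            continue  # header/body separator row
        rows.append(cells)
    return rows

# Example: markdown_table_to_rows("|Share Class|TER|\n|---|---|\n|E - Shares|0.30%|")
# -> [["Share Class", "TER"], ["E - Shares", "0.30%"]]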
diff --git a/instructions/text_extraction_image_prompts.txt b/instructions/text_extraction_image_prompts.txt
new file mode 100644
index 0000000..0a02b06
--- /dev/null
+++ b/instructions/text_extraction_image_prompts.txt
@@ -0,0 +1,11 @@
+Instructions:
+Please read the image carefully.
+Answer the following questions:
+1. Identify the text contents in the image.
+2. Output the text contents, ensuring they are exactly as in the image.
+The output order should be exactly the same as the reading sequence in the image.
+3. Output the results in JSON format with the key 'text' containing the extracted text.
+The format should be:
+{"text": "image contents text"}
+4. Only output JSON with text.
+Answer:
\ No newline at end of file
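playground.py below feeds these instruction files, together with a base64-encoded page image, to utils.gpt_utils.chat, whose signature is not visible in this diff. Assuming it wraps an OpenAI-style vision call, the request it needs to build looks roughly like this sketch (the client usage and model name here are assumptions, not code from this repository):

import base64

from openai import OpenAI


def ask_gpt4o_about_page(image_path: str, instructions: str) -> str:
    # Send the rendered page image together with the instruction text
    # as a single multimodal user message.
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": instructions},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ],
            }
        ],
    )
    return response.choices[0].message.content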
r"/data/emea_ar/output/gpt_image_response/table/" + # chat_with_image( + # pdf_file, + # pdf_page_index_list, + # image_instructions_file, + # image_output_folder, + # gpt_output_folder, + # ) + + # Data extraction by image + # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf" + # pdf_page_index_list = [29] + pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" + pdf_page_index_list = [13] image_output_folder = r"/data/emea_ar/small_pdf_image/" - gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/" - chat_with_image(pdf_file, pdf_page_index_list, image_output_folder, gpt_output_folder) \ No newline at end of file + gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/" + image_instructions_file = r"./instructions/data_extraction_image_prompts.txt" + chat_with_image( + pdf_file, + pdf_page_index_list, + image_instructions_file, + image_output_folder, + gpt_output_folder, + ) + + + # Text extraction by image + # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" + # pdf_page_index_list = [13] + # image_instructions_file = r"./instructions/text_extraction_image_prompts.txt" + # image_output_folder = r"/data/emea_ar/small_pdf_image/" + # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/" + # chat_with_image( + # pdf_file, + # pdf_page_index_list, + # image_instructions_file, + # image_output_folder, + # gpt_output_folder, + # ) + + # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" + # pdf_page_index_list = [13] + # image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt" + # image_output_folder = r"/data/emea_ar/small_pdf_image/" + # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/" + # chat_with_image( + # pdf_file, + # pdf_page_index_list, + # image_instructions_file, + # image_output_folder, + # gpt_output_folder, + # ) + +