From 15720d8bfd07f7eccdff9c5c610de0ba90cdf4cb Mon Sep 17 00:00:00 2001
From: Blade He
Date: Mon, 26 Aug 2024 17:17:39 -0500
Subject: [PATCH] 1. Text-and-image all-in-one chat function via ChatGPT4o 2. Many experiments for extracting data in two ways: page text or page image.

---
 .../data_extraction_image_prompts.txt         | 47 +++++++++
 instructions/data_extraction_prompts.txt      | 29 ++++++
 ...able_extraction_image_optimize_prompts.txt | 35 +++++++
 .../table_extraction_image_prompts.txt        |  2 +-
 .../table_extraction_image_prompts_v2.txt     | 11 +++
 .../text_extraction_image_prompts.txt         | 11 +++
 playground.py                                 | 99 +++++++++++++++----
 7 files changed, 216 insertions(+), 18 deletions(-)
 create mode 100644 instructions/data_extraction_image_prompts.txt
 create mode 100644 instructions/data_extraction_prompts.txt
 create mode 100644 instructions/table_extraction_image_optimize_prompts.txt
 create mode 100644 instructions/table_extraction_image_prompts_v2.txt
 create mode 100644 instructions/text_extraction_image_prompts.txt

diff --git a/instructions/data_extraction_image_prompts.txt b/instructions/data_extraction_image_prompts.txt
new file mode 100644
index 0000000..444a75d
--- /dev/null
+++ b/instructions/data_extraction_image_prompts.txt
@@ -0,0 +1,47 @@
+Instructions:
+Please read the image carefully.
+1. Identify the text on the PDF page image.
+The text will be output with the key: "text".
+2. Identify and format all of the tables on the PDF page image.
+Table contents should be in markdown format,
+ensuring the table structure and contents are exactly as in the PDF page image.
+The format should be: |Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|
+Each cell in the table(s) should be in the proper position of the relevant row and column.
+The markdown table(s) will be output with the key: "table_contents".
+3. Extract data from the parsed text and table contents above.
+3.1 Use the parsed text and table contents above as the context.
+3.2 Data extraction from the parsed table contents
+There may be TER and performance fees data in the parsed table(s).
+The reported name of the TER could be: Total Expense Ratio, TER, Annualised TER including performance fees, etc.
+The reported name of the performance fees could be: performance fees, performance fees ratio, etc.
+If both "TER including performance fees" and "TER excluding performance fees" exist, the performance fees should be:
+TER including performance fees - TER excluding performance fees.
+The TER and performance fees values are percentages, so each value should be less than 100.
+In most cases, the data is in the table(s) of the context.
+
+3.3 Fund name / share class name extraction from the context above
+Please extract the fund name and share class name from the context.
+If the fund name or share class name cannot be found in the table contents,
+please try to find them in the parsed text contents.
+
+3.4 Output
+If possible, extract the fund name, share class name, and TER or performance fees value as the output.
+One fund could have multiple share classes, each with its own TER or performance fees value.
+The output should be in JSON format, like:
+{
+    "text": "text from image",
+    "table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"],
+    "data":
+    [{
+        "fund name": "fund 1",
+        "share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
+    },
+    {
+        "fund name": "fund 2",
+        "share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
+    }]
+}
+Only output JSON data.
+If the share class name cannot be found in the context, please output empty JSON data: []
+
+Answer:
\ No newline at end of file
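For reference, the JSON shape requested above can be consumed with a little plain Python. The sketch below is illustrative only (these helper names are not part of this patch); it applies the two rules stated in the prompt: percentage values must be below 100, and the performance fee can be derived as the difference between the two TER figures when both are reported.

import json


def derive_performance_fee(ter_incl, ter_excl):
    # Per the prompt: performance fees = "TER including performance fees"
    # minus "TER excluding performance fees", when both are reported.
    if ter_incl is None or ter_excl is None:
        return None
    return round(ter_incl - ter_excl, 4)


def load_extraction(payload: str):
    # Parse the model output and apply the prompt's sanity check:
    # TER and performance fees are percentages, so values must be below 100.
    doc = json.loads(payload)
    funds = []
    for fund in doc.get("data", []):
        shares = []
        for share in fund.get("share data", []):
            values = (share.get("ter"), share.get("performance fees"))
            if any(v is not None and not 0 <= v < 100 for v in values):
                continue  # drop implausible rows rather than trusting them
            shares.append(share)
        funds.append({"fund name": fund.get("fund name"), "share data": shares})
    return funds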
diff --git a/instructions/data_extraction_prompts.txt b/instructions/data_extraction_prompts.txt
new file mode 100644
index 0000000..6ba469d
--- /dev/null
+++ b/instructions/data_extraction_prompts.txt
@@ -0,0 +1,29 @@
+Context:
+{page_text}
+
+Read the context carefully.
+There may be TER and performance fees data in the context. The reported name of the TER could be:
+Total Expense Ratio, TER, Annualised TER including performance fees, etc.
+The reported name of the performance fees could be:
+performance fees, performance fees ratio, etc.
+If both "TER including performance fees" and "TER excluding performance fees" exist, the performance fees should be:
+TER including performance fees - TER excluding performance fees.
+
+The TER and performance fees values are percentages, so each value should be less than 100.
+In most cases, the data is in the table(s) of the context.
+If there are multiple TER / performance fee values in the same row, please extract the latest one.
+If possible, extract the fund name, share class name, and TER or performance fees value as the output.
+One fund could have multiple share classes, each with its own TER value.
+The output should be in JSON format, like:
+[{
+    "fund name": "fund 1",
+    "share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
+},
+{
+    "fund name": "fund 2",
+    "share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
+}]
+Only output JSON data.
+If the share class name cannot be found in the context, please output empty JSON data: []
+
+Answer:
\ No newline at end of file
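One implementation note on the {page_text} placeholder above: because the template also contains literal JSON braces in its example output, str.format() would treat those braces as replacement fields and fail, so substituting the single placeholder directly is the safer way to build the prompt. A minimal sketch (build_text_prompt is a hypothetical helper, not part of this patch):

def build_text_prompt(template_path: str, page_text: str) -> str:
    # Read the prompt template and substitute its one placeholder.
    # str.format() is avoided because the embedded JSON example uses
    # literal braces, which format() would try to interpret as fields.
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()
    return template.replace("{page_text}", page_text)

# Usage sketch:
# prompt = build_text_prompt("./instructions/data_extraction_prompts.txt", page_text)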
diff --git a/instructions/table_extraction_image_optimize_prompts.txt b/instructions/table_extraction_image_optimize_prompts.txt
new file mode 100644
index 0000000..10122a4
--- /dev/null
+++ b/instructions/table_extraction_image_optimize_prompts.txt
@@ -0,0 +1,35 @@
+Smith is a professional who processes financial reports.
+He wants to extract table(s) from a PDF and output them in markdown format.
+He decides to ask ChatGPT4o to help him with this.
+Smith's prompt is as below:
+--------------------------------------Smith's prompts start--------------------------------------
+Instructions:
+Please read the image carefully.
+Answer below questions:
+1. Please find the table or tables in the image.
+2. Output the table contents as markdown format, it's like:
+|name|age|hobby|
+|Annie|18|music|
+The contents should be exactly precise as the image contents.
+3. Please output the results as JSON format, the result member is with legal markdown table format, the example is:
+{
+"tables": ["
+|name|age|hobby|
+|Annie|18|music|
+"]
+}
+4. Only output JSON with tables
+
+Here is the answer from ChatGPT4o:
+--------------------------------------ChatGPT4o start--------------------------------------
+|Share Class|TER for the year (Note 6)|\n|---|---|\n|AI - Shares| |\n|BF - Shares| |\n|BI - Shares| |\n|BP - Shares| |\n|E - Shares|0.30%|\n|HAF - SEK Shares|0.84%|\n|HAI - SEK Shares|1.59%|\n|HB - EUR Shares| |\n|HB - SEK Shares| |\n|HBC - EUR Shares|0.65%|\n|HBF - EUR Shares| |\n|HBF - NOK Shares| |\n|HBF - SEK Shares| |\n|HBI - DKK Shares| |\n|HBI - EUR Shares| |\n|HBI - NOK Shares| |\n|HBI - SEK Shares| |\n|HY - DKK Shares| |\n|HY - EUR Shares| |\n|HY - SEK Shares| |\n|LE - Shares| |\n|LP - Shares| |\n|X - Shares| |\n|Y - Shares|0.09%|
+--------------------------------------ChatGPT4o end--------------------------------------
+
+But the answer is incorrect; the correct answer is as below:
+--------------------------------------correct answer start--------------------------------------
+|Share Class|TER for the year (Note 6)|\n|---|---|\n|AI - Shares| |\n|BF - Shares| |\n|BI - Shares|0.30%|\n|BP - Shares|0.84%|\n|E - Shares|1.59%|\n|HAF - SEK Shares| |\n|HAI - SEK Shares| |\n|HB - EUR Shares| |\n|HB - SEK Shares| |\n|HBC - EUR Shares|0.65%|\n|HBF - EUR Shares| |\n|HBF - NOK Shares| |\n|HBF - SEK Shares| |\n|HBI - DKK Shares| |\n|HBI - EUR Shares| |\n|HBI - NOK Shares| |\n|HBI - SEK Shares| |\n|HY - DKK Shares| |\n|HY - EUR Shares| |\n|HY - SEK Shares| |\n|LE - Shares| |\n|LP - Shares| |\n|X - Shares| |\n|Y - Shares|0.09%|
+--------------------------------------correct answer end--------------------------------------
+
+Please analyze the image, the incorrect answer, and the correct answer, then help Mr. Smith optimize the instructions. Output the result in JSON format: {"Instructions": "optimized instructions"}
+
+Answer:
diff --git a/instructions/table_extraction_image_prompts.txt b/instructions/table_extraction_image_prompts.txt
index c823582..e0348e5 100644
--- a/instructions/table_extraction_image_prompts.txt
+++ b/instructions/table_extraction_image_prompts.txt
@@ -1,7 +1,7 @@
 Instructions:
 Please read the image carefully.
 Answer below questions:
-1. Please find the table or tables in the image. 
+1. Please find the table or tables in the image.
 2. Output the table contents as markdown format, it's like:
 |name|age|hobby|
 |Annie|18|music|
diff --git a/instructions/table_extraction_image_prompts_v2.txt b/instructions/table_extraction_image_prompts_v2.txt
new file mode 100644
index 0000000..4be0d12
--- /dev/null
+++ b/instructions/table_extraction_image_prompts_v2.txt
@@ -0,0 +1,11 @@
+Instructions:
+Please read the image carefully.
+Answer the following questions:
+1. Identify the table or tables in the image.
+2. Output the table contents in markdown format, ensuring the table structure and contents are exactly as in the image.
+The format should be: |Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|
+3. Output the results in JSON format with the key 'tables' containing the markdown table(s).
+The format should be:
+{"tables": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"]}
+4. Only output JSON with tables.
+Answer:
\ No newline at end of file
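The table prompts above return each table as one markdown string with literal \n separators between rows. A small helper along the following lines (illustrative, not part of this patch) turns such a string back into cell rows for downstream checks:

def markdown_table_to_rows(table_md: str):
    # Split a markdown string returned under "tables" into cell rows,
    # skipping the |---|---| separator line.
    rows = []
    for line in table_md.split("\n"):
        line = line.strip()
        if not line:
            continue
        cells = [c.strip() for c in line.strip("|").split("|")]
        if cells and all(set(c) <= set("-: ") for c in cells):
            continue  # header/body separator row
        rows.append(cells)
    return rows

# Example: markdown_table_to_rows("|Share Class|TER|\n|---|---|\n|E - Shares|0.30%|")
# -> [["Share Class", "TER"], ["E - Shares", "0.30%"]]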
diff --git a/instructions/text_extraction_image_prompts.txt b/instructions/text_extraction_image_prompts.txt
new file mode 100644
index 0000000..0a02b06
--- /dev/null
+++ b/instructions/text_extraction_image_prompts.txt
@@ -0,0 +1,11 @@
+Instructions:
+Please read the image carefully.
+Answer the following questions:
+1. Identify the text contents in the image.
+2. Output the text contents, ensuring they are exactly as in the image.
+The output order should be exactly the same as the reading sequence in the image.
+3. Output the results in JSON format with the key 'text' containing the extracted text.
+The format should be:
+{"text": "image contents text"}
+4. Only output JSON with text.
+Answer:
\ No newline at end of file
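playground.py below feeds these instruction files, together with a base64-encoded page image, to utils.gpt_utils.chat, whose signature is not visible in this diff. Assuming it wraps an OpenAI-style vision call, the request it needs to build looks roughly like this sketch (the client usage and model name here are assumptions, not code from this repository):

import base64

from openai import OpenAI


def ask_gpt4o_about_page(image_path: str, instructions: str) -> str:
    # Send the rendered page image together with the instruction text
    # as a single multimodal user message.
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": instructions},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ],
            }
        ],
    )
    return response.choices[0].message.content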
r"/data/emea_ar/output/gpt_image_response/table/" + # chat_with_image( + # pdf_file, + # pdf_page_index_list, + # image_instructions_file, + # image_output_folder, + # gpt_output_folder, + # ) + + # Data extraction by image + # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf" + # pdf_page_index_list = [29] + pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" + pdf_page_index_list = [13] image_output_folder = r"/data/emea_ar/small_pdf_image/" - gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/" - chat_with_image(pdf_file, pdf_page_index_list, image_output_folder, gpt_output_folder) \ No newline at end of file + gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/" + image_instructions_file = r"./instructions/data_extraction_image_prompts.txt" + chat_with_image( + pdf_file, + pdf_page_index_list, + image_instructions_file, + image_output_folder, + gpt_output_folder, + ) + + + # Text extraction by image + # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" + # pdf_page_index_list = [13] + # image_instructions_file = r"./instructions/text_extraction_image_prompts.txt" + # image_output_folder = r"/data/emea_ar/small_pdf_image/" + # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/" + # chat_with_image( + # pdf_file, + # pdf_page_index_list, + # image_instructions_file, + # image_output_folder, + # gpt_output_folder, + # ) + + # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" + # pdf_page_index_list = [13] + # image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt" + # image_output_folder = r"/data/emea_ar/small_pdf_image/" + # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/" + # chat_with_image( + # pdf_file, + # pdf_page_index_list, + # image_instructions_file, + # image_output_folder, + # gpt_output_folder, + # ) + +