optimize prompts

2024-08-28 10:21:26 -05:00 · 2024-08-28 10:21:26 -05:00 · 32676728f6
parent 15720d8bfd
commit 32676728f6
6 changed files with 226 additions and 39 deletions
--- a/configuration/datapoint_keyword.json
+++ b/configuration/datapoint_keyword.json
@ -200,9 +200,9 @@
  "tor": {
    "english": [
      "TOR",
-      "Turnover*",
-      "Turnover",
-      "Turnover Ratio",
+      "Turnover* \\n",
+      "Turnover \\n",
+      "Turnover Ratio",
      "Turnover Rate",
      "Portfolio Turnover",
      "Portfolio turnover ratio",
@ -339,7 +339,9 @@
      "Performance Fee",
      "Performance Fees",
      "performance-based fee",
-      "performance-related fee"
+      "performance-related fee",
+      "with performance)",
+      "with performance fee)"
    ],
    "spanish": [
      "Comisión de Gestión sobre Resultados",
--- a/core/page_filter.py
+++ b/core/page_filter.py
@ -6,7 +6,7 @@ import pandas as pd
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping
 from utils.logger import logger
-from utils.biz_utils import add_slash_to_text_as_regex
+from utils.biz_utils import add_slash_to_text_as_regex, clean_text


 class FilterPages:
@ -96,11 +96,6 @@ class FilterPages:
            new_keyword = add_slash_to_text_as_regex(keyword)
            new_keywords.append(new_keyword)
        return new_keywords
-    
-    def clean_text(self, text: str) -> str:
-        text = text.lower()
-        text = re.sub(r"\s+", ' ', text.strip())
-        return text
        
    def start_job(self) -> dict:
        logger.info(f"Start extracting datapoints from {self.pdf_file}")
@ -118,7 +113,7 @@ class FilterPages:
        for datapoint in self.datapoint_config.keys():
            result[datapoint] = []
        for page_num, page_text in self.page_text_dict.items():
-            text = self.clean_text(page_text)
+            text = clean_text(page_text)
            for datapoint, keywords in self.datapoint_config.items():
                # idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
                for keyword in keywords:
--- a/instructions/data_extraction_image_prompts.txt
+++ b/instructions/data_extraction_image_prompts.txt
@ -11,36 +11,48 @@ The markdown table(s) will be as output with key: "table_contents".
 3. Extract data from the parsed text and table(s) contents above.
 3.1 Use the parsed text and table(s) contents above as the context.
 3.2 Data Extraction from parsed table contents
-Maybe there are TER, performance fees data in the parsed table(s) contents.
-The TER reported name could be:Total Expense Ratio, TER, Annualised TER including performance fees,etc.
-The performance fees reported name could be:performance fees, performance fees ratio, etc.
-If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
-TER including performance fees - TER excluding performance fees.
+There may be TER and performance fees data in the context. The reported name for TER could be:
+Total Expense Ratio, TER, Annualised TER including performance fees, etc.
+The reported name for performance fees could be:
+performance fees, performance fees ratio, etc.
+Special cases
+1. Performance fees are part of TER.
+If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
+the TER should be "TER including performance fees" (or "TER with performance").
+The performance fees should be:
+"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
+The performance fees value can be negative (less than 0), e.g., -0.27 or -0.18.
+
+2. Combo TER value table.
+2.1 Both a Feeder fund TER and a Master fund TER exist.
+The relevant table header looks like this:
+Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
+Please output separately as below:
+- "feeder fund share class" and "TER feeder" values
+- "Master fund" and "TER Master" values
+Here is the example:
+Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
+
+The output should be:
+[
+{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
+{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
+]
+
 The TER and performance fees values are percentages, which means each value should be less than 100.
 In most cases, the data is in the table(s) of the context.
-
-3.3 Fund name/ share class name extraction from upon context
-Please extract fund name and share class name from the context.
-If can't find fund name or share class name from table contents, 
-please try to find them from parsed text contents.
-
-3.4 Output
+If there are multiple TER/performance fee values in the same row, please extract the latest one.
 If possible, please extract fund name, share class name, TER or performance fees value as the output.
-One fund could be with multiple share classes and relevant TER or performance fees values.
+One fund can have multiple share classes with their relevant TER values.
 The output should be JSON format, the format is like:
-{	
-	"text": "text from image",
-	"table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"],
-	"data":
-	[{
-		"fund name": "fund 1",
-		"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
-	}, 
-	{
-		"fund name": "fund 2",
-		"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
-	}]
-}
+[{
+	"fund name": "fund 1",
+	"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
+}, 
+{
+	"fund name": "fund 2",
+	"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
+}]
 Only output JSON data.
 If you can't find a share class name in the context, please output empty JSON data: []

--- a/instructions/data_extraction_prompts.txt
+++ b/instructions/data_extraction_prompts.txt
@ -1,15 +1,38 @@
 Context:
 {page_text}

+Instructions:
 Read the context carefully.
 There may be TER and performance fees data in the context. The reported name for TER could be:
 Total Expense Ratio, TER, Annualised TER including performance fees, etc.
 The reported name for performance fees could be:
 performance fees, performance fees ratio, etc.
-If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
-TER including performance fees - TER excluding performance fees.
+Special cases
+1. Performance fees are part of TER.
+If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
+the TER should be "TER including performance fees" (or "TER with performance").
+The performance fees should be:
+"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
+The performance fees value can be negative (less than 0), e.g., -0.27 or -0.18.
+
+2. Combo TER value table.
+2.1 Both a Feeder fund TER and a Master fund TER exist.
+The relevant table header looks like this:
+Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
+Please output separately as below:
+- "feeder fund share class" and "TER feeder" values
+- "Master fund" and "TER Master" values
+Here is the example:
+Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
+
+The output should be:
+[
+{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
+{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
+]

 The TER and performance fees values are percentages, which means each value should be less than 100.
+The performance fees value can be negative, e.g. -0.2 or -0.67.
 In most cases, the data is in the table(s) of the context.
 If there are multiple TER/performance fee values in the same row, please extract the latest one.
 If possible, please extract fund name, share class name, TER or performance fees value as the output.
--- a/playground.ipynb
+++ b/playground.ipynb
@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils.biz_utils import add_slash_to_text_as_regex\n",
+    "import json\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "regex = r\"Turnover \\n\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Turnover\\\\s+\\\\n'"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "add_slash_to_text_as_regex(regex)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<re.Match object; span=(141, 151), match='Turnover \\n'>"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "re.search(regex, text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TOR no match\n",
+      "Turnover\\*\\s+ no match\n",
+      "Turnover\\s+ match Turnover \n",
+      "Turnover\\s+Ratio no match\n",
+      "Turnover\\s+Rate no match\n",
+      "Portfolio\\s+Turnover no match\n",
+      "Portfolio\\s+turnover\\s+ratio no match\n",
+      "Portfolio\\s+turnover\\s+rate no match\n",
+      "PTR no match\n",
+      "Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
+    "    datapoint_keywords_config = json.load(file)\n",
+    "\n",
+    "tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
+    "\n",
+    "for tor_regex in tor_regex_list:\n",
+    "    regex = add_slash_to_text_as_regex(tor_regex)\n",
+    "    search = re.search(regex, text)\n",
+    "    if search:\n",
+    "        print(f\"{regex} match {search.group()}\")\n",
+    "    else:\n",
+    "        print(f\"{regex} no match\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "torch2_real",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -11,4 +11,12 @@ def add_slash_to_text_as_regex(text: str):
        if replace not in text:
            text = re.sub(replace, replace, text)
    text = re.sub(r"\s+", r"\\s+", text)
+    return text
+
+
+def clean_text(text: str) -> str:
+    text = text.lower()
+    # replace special-character escapes that begin with \u (e.g. \u2004 or \u00a0) with a space
+    text = re.sub(r"\\u[0-9a-z]{4}", ' ', text)
+    text = re.sub(r"( ){2,}", ' ', text.strip())
    return text