optimize prompts

2024-08-28 10:21:26 -05:00 · 2024-08-28 10:21:26 -05:00 · 32676728f6
parent 15720d8bfd
commit 32676728f6
6 changed files with 226 additions and 39 deletions
--- a/configuration/datapoint_keyword.json
+++ b/configuration/datapoint_keyword.json
@ -200,9 +200,9 @@
  "tor": {
    "english": [
      "TOR",
-      "Turnover*",
+      "Turnover* \\n",
-      "Turnover",
+      "Turnover \\n",
-      "Turnover Ratio",
+      "Turnover Ratio",
      "Turnover Rate",
      "Portfolio Turnover",
      "Portfolio turnover ratio",
@ -339,7 +339,9 @@
      "Performance Fee",
      "Performance Fees",
      "performance-based fee",
-      "performance-related fee"
+      "performance-related fee",
      "with performance)",
      "with performance fee)"
    ],
    "spanish": [
      "Comisión de Gestión sobre Resultados",
--- a/core/page_filter.py
+++ b/core/page_filter.py
@ -6,7 +6,7 @@ import pandas as pd
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping
 from utils.logger import logger
-from utils.biz_utils import add_slash_to_text_as_regex
+from utils.biz_utils import add_slash_to_text_as_regex, clean_text
 class FilterPages:
@ -97,11 +97,6 @@ class FilterPages:
            new_keywords.append(new_keyword)
        return new_keywords
    def clean_text(self, text: str) -> str:
        """Return ``text`` lower-cased, trimmed, and with whitespace runs collapsed.

        Every run of whitespace (spaces, tabs, newlines, ...) inside the text
        is replaced by a single space; leading and trailing whitespace is
        removed.
        """
        trimmed = text.lower().strip()
        return re.sub(r"\s+", ' ', trimmed)
    def start_job(self) -> dict:
        logger.info(f"Start extracting datapoints from {self.pdf_file}")
        """
@ -118,7 +113,7 @@ class FilterPages:
        for datapoint in self.datapoint_config.keys():
            result[datapoint] = []
        for page_num, page_text in self.page_text_dict.items():
-            text = self.clean_text(page_text)
+            text = clean_text(page_text)
            for datapoint, keywords in self.datapoint_config.items():
                # idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
                for keyword in keywords:
--- a/instructions/data_extraction_image_prompts.txt
+++ b/instructions/data_extraction_image_prompts.txt
@ -11,36 +11,48 @@ The markdown table(s) will be as output with key: "table_contents".
 3. Extract data from the parsed text and table(s) contents above.
 3.1 Use the parsed text and table(s) contents above as context.
 3.2 Data Extraction from parsed table contents
-Maybe there are TER, performance fees data in the parsed table(s) contents.
+Maybe there are TER, performance fees data in the context, the TER reported name could be:
-The TER reported name could be:Total Expense Ratio, TER, Annualised TER including performance fees,etc.
+Total Expense Ratio, TER, Annualised TER including performance fees,etc.
-The performance fees reported name could be:performance fees, performance fees ratio, etc.
+The performance fees reported name could be:
-If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
+performance fees, performance fees ratio, etc.
-TER including performance fees - TER excluding performance fees.
+Special cases
 1. Performance fees is part of TER.
 If exist both of "TER including performance fees" or "TER with performance" and "TER excluding performance fees" or "TER without performance",
 The TER should be "TER including performance fees" or "TER with performance".
 The performance fees should be:
 "TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
 The performance fees value can be negative or less than 0, e.g., -0.27 or -0.18.
 2. Combo TER value table.
 2.1 Exist Feeder fund TER and Master fund TER.
 The relevant table header is like this: 
 Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
 Please output separately as below:
 - "feeder fund share class" and "TER feeder" values
 - "Master fund" and "TER Master" values
 Here is the example:
 Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
 The output should be:
 [
 {"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
 {"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
 ]
 The TER and performance fees value is percentage number, it means the value should be less than 100.
 Most of cases, the data is in the table(s) of context.
-
+If a row contains multiple TER/performance fee values, please extract the latest one.
 3.3 Fund name/ share class name extraction from upon context
 Please extract fund name and share class name from the context.
 If can't find fund name or share class name from table contents, 
 please try to find them from parsed text contents.
 3.4 Output
 If possible, please extract fund name, share class name, TER or performance fees value as the output.
-One fund could be with multiple share classes and relevant TER or performance fees values.
+One fund could be with multiple share classes and relevant TER values.
 The output should be JSON format, the format is like:
 [{
 	"fund name": "fund 1",
 	"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
 }, 
 {
-	"text": "text from image",
+	"fund name": "fund 2",
-	"table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"],
+	"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
-	"data":
+}]
 	[{
 		"fund name": "fund 1",
 		"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
 	}, 
 	{
 		"fund name": "fund 2",
 		"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
 	}]
 }
 Only output JSON data.
 If can't find share class name in context, please output empty JSON data: []
--- a/instructions/data_extraction_prompts.txt
+++ b/instructions/data_extraction_prompts.txt
@ -1,15 +1,38 @@
 Context:
 {page_text}
 Instructions:
 Read the context carefully.
 Maybe there are TER, performance fees data in the context, the TER reported name could be:
 Total Expense Ratio, TER, Annualised TER including performance fees,etc.
 The performance fees reported name could be:
 performance fees, performance fees ratio, etc.
-If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
+Special cases
-TER including performance fees - TER excluding performance fees.
+1. Performance fees is part of TER.
 If exist both of "TER including performance fees" or "TER with performance" and "TER excluding performance fees" or "TER without performance",
 The TER should be "TER including performance fees" or "TER with performance".
 The performance fees should be:
 "TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
 The performance fees value can be negative or less than 0, e.g., -0.27 or -0.18.
 2. Combo TER value table.
 2.1 Exist Feeder fund TER and Master fund TER.
 The relevant table header is like this: 
 Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
 Please output separately as below:
 - "feeder fund share class" and "TER feeder" values
 - "Master fund" and "TER Master" values
 Here is the example:
 Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
 The output should be:
 [
 {"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
 {"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
 ]
 The TER and performance fees value is percentage number, it means the value should be less than 100.
 The performance fees value can be negative, e.g. -0.2 or -0.67.
 Most of cases, the data is in the table(s) of context.
 If a row contains multiple TER/performance fee values, please extract the latest one.
 If possible, please extract fund name, share class name, TER or performance fees value as the output.
--- a/playground.ipynb
+++ b/playground.ipynb
@ -0,0 +1,147 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils.biz_utils import add_slash_to_text_as_regex\n",
    "import json\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex = r\"Turnover \\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Turnover\\\\s+\\\\n'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "add_slash_to_text_as_regex(regex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<re.Match object; span=(141, 151), match='Turnover \\n'>"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.search(regex, text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TOR no match\n",
      "Turnover\\*\\s+ no match\n",
      "Turnover\\s+ match Turnover \n",
      "Turnover\\s+Ratio no match\n",
      "Turnover\\s+Rate no match\n",
      "Portfolio\\s+Turnover no match\n",
      "Portfolio\\s+turnover\\s+ratio no match\n",
      "Portfolio\\s+turnover\\s+rate no match\n",
      "PTR no match\n",
      "Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
     ]
    }
   ],
   "source": [
    "with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
    "    datapoint_keywords_config = json.load(file)\n",
    "\n",
    "tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
    "\n",
    "for tor_regex in tor_regex_list:\n",
    "    regex = add_slash_to_text_as_regex(tor_regex)\n",
    "    search = re.search(regex, text)\n",
    "    if search:\n",
    "        print(f\"{regex} match {search.group()}\")\n",
    "    else:\n",
    "        print(f\"{regex} no match\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch2_real",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -12,3 +12,11 @@ def add_slash_to_text_as_regex(text: str):
            text = re.sub(replace, replace, text)
    text = re.sub(r"\s+", r"\\s+", text)
    return text
 def clean_text(text: str) -> str:
    """Lower-case ``text`` and normalize its spacing.

    Steps, in order:
      1. Lower-case the whole string.
      2. Replace every literal unicode escape sequence (a backslash, the
         letter "u" and four hexadecimal digits) with one space.
      3. Strip leading/trailing whitespace and collapse runs of two or more
         spaces into a single space.

    Only space characters are collapsed; newlines and tabs inside the text
    are left untouched.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized, lower-cased text.
    """
    text = text.lower()
    # Replace literal escape sequences such as \u2004 or \u00a0 with a space.
    # The text is already lower-cased, so hex digits are 0-9 / a-f only; a
    # wider class such as [0-9a-z] would also swallow ordinary text that
    # merely follows a backslash-u (e.g. the "sers" in "c:\users").
    text = re.sub(r"\\u[0-9a-f]{4}", ' ', text)
    # Collapse runs of spaces only (not all whitespace) so line breaks survive.
    text = re.sub(r" {2,}", ' ', text.strip())
    return text