optimize prompts

This commit is contained in:
Blade He 2024-08-28 10:21:26 -05:00
parent 15720d8bfd
commit 32676728f6
6 changed files with 226 additions and 39 deletions

View File

@ -200,9 +200,9 @@
"tor": { "tor": {
"english": [ "english": [
"TOR", "TOR",
"Turnover*", "Turnover* \\n",
"Turnover", "Turnover \\n",
"Turnover Ratio", "Turnover Ratio",
"Turnover Rate", "Turnover Rate",
"Portfolio Turnover", "Portfolio Turnover",
"Portfolio turnover ratio", "Portfolio turnover ratio",
@ -339,7 +339,9 @@
"Performance Fee", "Performance Fee",
"Performance Fees", "Performance Fees",
"performance-based fee", "performance-based fee",
"performance-related fee" "performance-related fee",
"with performance)",
"with performance fee)"
], ],
"spanish": [ "spanish": [
"Comisión de Gestión sobre Resultados", "Comisión de Gestión sobre Resultados",

View File

@ -6,7 +6,7 @@ import pandas as pd
from utils.pdf_util import PDFUtil from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping from utils.sql_query_util import query_document_fund_mapping
from utils.logger import logger from utils.logger import logger
from utils.biz_utils import add_slash_to_text_as_regex from utils.biz_utils import add_slash_to_text_as_regex, clean_text
class FilterPages: class FilterPages:
@ -97,11 +97,6 @@ class FilterPages:
new_keywords.append(new_keyword) new_keywords.append(new_keyword)
return new_keywords return new_keywords
def clean_text(self, text: str) -> str:
    """Return ``text`` lower-cased and trimmed, with each whitespace run collapsed to one space."""
    lowered = text.lower().strip()
    return re.sub(r"\s+", ' ', lowered)
def start_job(self) -> dict: def start_job(self) -> dict:
logger.info(f"Start extracting datapoints from {self.pdf_file}") logger.info(f"Start extracting datapoints from {self.pdf_file}")
""" """
@ -118,7 +113,7 @@ class FilterPages:
for datapoint in self.datapoint_config.keys(): for datapoint in self.datapoint_config.keys():
result[datapoint] = [] result[datapoint] = []
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
text = self.clean_text(page_text) text = clean_text(page_text)
for datapoint, keywords in self.datapoint_config.items(): for datapoint, keywords in self.datapoint_config.items():
# idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean]) # idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
for keyword in keywords: for keyword in keywords:

View File

@ -11,36 +11,48 @@ The markdown table(s) will be as output with key: "table_contents".
3. Extract data from upon parsed text and table(s) contents. 3. Extract data from upon parsed text and table(s) contents.
3.1 The upon parsed text and table(s) contents as context. 3.1 The upon parsed text and table(s) contents as context.
3.2 Data Extraction from parsed table contents 3.2 Data Extraction from parsed table contents
Maybe there are TER, performance fees data in the parsed table(s) contents. Maybe there are TER, performance fees data in the context, the TER reported name could be:
The TER reported name could be:Total Expense Ratio, TER, Annualised TER including performance fees,etc. Total Expense Ratio, TER, Annualised TER including performance fees,etc.
The performance fees reported name could be:performance fees, performance fees ratio, etc. The performance fees reported name could be:
If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be: performance fees, performance fees ratio, etc.
TER including performance fees - TER excluding performance fees. Special cases
1. Performance fees is part of TER.
If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
the TER should be "TER including performance fees" (or "TER with performance").
The performance fees should be:
"TER including performance fees - TER excluding performance fees" (or "TER with performance fees - TER without performance fees").
The performance fees value can be negative (less than 0), e.g., -0.27 or -0.18.
2. Combo TER value table.
2.1 Both a Feeder fund TER and a Master fund TER exist.
The relevant table header is like this:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
Please output separately as below:
- "feeder fund share class" and "TER feeder" values
- "Master fund" and "TER Master" values
Here is the example:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
The output should be:
[
{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
]
The TER and performance fees value is percentage number, it means the value should be less than 100. The TER and performance fees value is percentage number, it means the value should be less than 100.
Most of cases, the data is in the table(s) of context. Most of cases, the data is in the table(s) of context.
If there are multiple TER/performance fee values in the same row, please extract the latest one.
3.3 Fund name/ share class name extraction from upon context
Please extract fund name and share class name from the context.
If the fund name or share class name cannot be found in the table contents,
please try to find them in the parsed text contents.
3.4 Output
If possible, please extract fund name, share class name, TER or performance fees value as the output. If possible, please extract fund name, share class name, TER or performance fees value as the output.
One fund could be with multiple share classes and relevant TER or performance fees values. One fund could be with multiple share classes and relevant TER values.
The output should be JSON format, the format is like: The output should be JSON format, the format is like:
{ [{
"text": "text from image",
"table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"],
"data":
[{
"fund name": "fund 1", "fund name": "fund 1",
"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}] "share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
}, },
{ {
"fund name": "fund 2", "fund name": "fund 2",
"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}] "share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
}] }]
}
Only output JSON data. Only output JSON data.
If can't find share class name in context, please output empty JSON data: [] If can't find share class name in context, please output empty JSON data: []

View File

@ -1,15 +1,38 @@
Context: Context:
{page_text} {page_text}
Instructions:
Read the context carefully. Read the context carefully.
Maybe there are TER, performance fees data in the context, the TER reported name could be: Maybe there are TER, performance fees data in the context, the TER reported name could be:
Total Expense Ratio, TER, Annualised TER including performance fees,etc. Total Expense Ratio, TER, Annualised TER including performance fees,etc.
The performance fees reported name could be: The performance fees reported name could be:
performance fees, performance fees ratio, etc. performance fees, performance fees ratio, etc.
If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be: Special cases
TER including performance fees - TER excluding performance fees. 1. Performance fees is part of TER.
If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
the TER should be "TER including performance fees" (or "TER with performance").
The performance fees should be:
"TER including performance fees - TER excluding performance fees" (or "TER with performance fees - TER without performance fees").
The performance fees value can be negative (less than 0), e.g., -0.27 or -0.18.
2. Combo TER value table.
2.1 Both a Feeder fund TER and a Master fund TER exist.
The relevant table header is like this:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
Please output separately as below:
- "feeder fund share class" and "TER feeder" values
- "Master fund" and "TER Master" values
Here is the example:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
The output should be:
[
{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
]
The TER and performance fees value is percentage number, it means the value should be less than 100. The TER and performance fees value is percentage number, it means the value should be less than 100.
The performance fees value can be negative, e.g. -0.2 or -0.67.
Most of cases, the data is in the table(s) of context. Most of cases, the data is in the table(s) of context.
If with multiple TER/ performance fee values in same row, please extract the latest. If with multiple TER/ performance fee values in same row, please extract the latest.
If possible, please extract fund name, share class name, TER or performance fees value as the output. If possible, please extract fund name, share class name, TER or performance fees value as the output.

147
playground.ipynb Normal file
View File

@ -0,0 +1,147 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils.biz_utils import add_slash_to_text_as_regex\n",
"import json\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"regex = r\"Turnover \\n\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Turnover\\\\s+\\\\n'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_slash_to_text_as_regex(regex)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<re.Match object; span=(141, 151), match='Turnover \\n'>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re.search(regex, text)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TOR no match\n",
"Turnover\\*\\s+ no match\n",
"Turnover\\s+ match Turnover \n",
"Turnover\\s+Ratio no match\n",
"Turnover\\s+Rate no match\n",
"Portfolio\\s+Turnover no match\n",
"Portfolio\\s+turnover\\s+ratio no match\n",
"Portfolio\\s+turnover\\s+rate no match\n",
"PTR no match\n",
"Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
]
}
],
"source": [
"with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
" datapoint_keywords_config = json.load(file)\n",
"\n",
"tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
"\n",
"for tor_regex in tor_regex_list:\n",
" regex = add_slash_to_text_as_regex(tor_regex)\n",
" search = re.search(regex, text)\n",
" if search:\n",
" print(f\"{regex} match {search.group()}\")\n",
" else:\n",
" print(f\"{regex} no match\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "torch2_real",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -12,3 +12,11 @@ def add_slash_to_text_as_regex(text: str):
text = re.sub(replace, replace, text) text = re.sub(replace, replace, text)
text = re.sub(r"\s+", r"\\s+", text) text = re.sub(r"\s+", r"\\s+", text)
return text return text
def clean_text(text: str) -> str:
    r"""Normalize text before keyword matching.

    The text is lower-cased, every literal ``\uXXXX`` escape sequence (a
    backslash, ``u`` and four hex digits, e.g. ``\u2004`` or ``\u00a0``) is
    replaced by a space, leading/trailing whitespace is stripped, and runs of
    two or more spaces are collapsed into a single space.  Newlines and tabs
    inside the text are left untouched.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text.
    """
    text = text.lower()
    # Replace literal "\uXXXX" escapes with a space.  Only hex digits are
    # accepted after "\u" so ordinary text such as "\users" is not mangled;
    # the text is already lower-cased, so a-f also covers A-F.
    # NOTE(review): this matches the escaped spelling only, not the actual
    # Unicode characters -- confirm which form callers pass in.
    text = re.sub(r"\\u[0-9a-f]{4}", " ", text)
    # Collapse runs of plain spaces only; newlines are kept as they are.
    text = re.sub(r" {2,}", " ", text.strip())
    return text