optimize prompts
This commit is contained in:
parent
15720d8bfd
commit
32676728f6
|
|
@ -200,9 +200,9 @@
|
|||
"tor": {
|
||||
"english": [
|
||||
"TOR",
|
||||
"Turnover*",
|
||||
"Turnover",
|
||||
"Turnover Ratio",
|
||||
"Turnover* \\n",
|
||||
"Turnover \\n",
|
||||
"Turnover Ratio",
|
||||
"Turnover Rate",
|
||||
"Portfolio Turnover",
|
||||
"Portfolio turnover ratio",
|
||||
|
|
@ -339,7 +339,9 @@
|
|||
"Performance Fee",
|
||||
"Performance Fees",
|
||||
"performance-based fee",
|
||||
"performance-related fee"
|
||||
"performance-related fee",
|
||||
"with performance)",
|
||||
"with performance fee)"
|
||||
],
|
||||
"spanish": [
|
||||
"Comisión de Gestión sobre Resultados",
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import pandas as pd
|
|||
from utils.pdf_util import PDFUtil
|
||||
from utils.sql_query_util import query_document_fund_mapping
|
||||
from utils.logger import logger
|
||||
from utils.biz_utils import add_slash_to_text_as_regex
|
||||
from utils.biz_utils import add_slash_to_text_as_regex, clean_text
|
||||
|
||||
|
||||
class FilterPages:
|
||||
|
|
@ -96,11 +96,6 @@ class FilterPages:
|
|||
new_keyword = add_slash_to_text_as_regex(keyword)
|
||||
new_keywords.append(new_keyword)
|
||||
return new_keywords
|
||||
|
||||
def clean_text(self, text: str) -> str:
|
||||
text = text.lower()
|
||||
text = re.sub(r"\s+", ' ', text.strip())
|
||||
return text
|
||||
|
||||
def start_job(self) -> dict:
|
||||
logger.info(f"Start extracting datapoints from {self.pdf_file}")
|
||||
|
|
@ -118,7 +113,7 @@ class FilterPages:
|
|||
for datapoint in self.datapoint_config.keys():
|
||||
result[datapoint] = []
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
text = self.clean_text(page_text)
|
||||
text = clean_text(page_text)
|
||||
for datapoint, keywords in self.datapoint_config.items():
|
||||
# idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
|
||||
for keyword in keywords:
|
||||
|
|
|
|||
|
|
@ -11,36 +11,48 @@ The markdown table(s) will be as output with key: "table_contents".
|
|||
3. Extract data from the parsed text and table(s) contents above.
|
||||
3.1 Use the parsed text and table(s) contents above as context.
|
||||
3.2 Data Extraction from parsed table contents
|
||||
Maybe there are TER, performance fees data in the parsed table(s) contents.
|
||||
The TER reported name could be:Total Expense Ratio, TER, Annualised TER including performance fees,etc.
|
||||
The performance fees reported name could be:performance fees, performance fees ratio, etc.
|
||||
If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
|
||||
TER including performance fees - TER excluding performance fees.
|
||||
Maybe there are TER, performance fees data in the context, the TER reported name could be:
|
||||
Total Expense Ratio, TER, Annualised TER including performance fees,etc.
|
||||
The performance fees reported name could be:
|
||||
performance fees, performance fees ratio, etc.
|
||||
Special cases
|
||||
1. Performance fees is part of TER.
|
||||
If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
|
||||
The TER should be "TER including performance fees" or "TER with performance".
|
||||
The performance fees should be:
|
||||
"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
|
||||
The performance fees value can be negative or less than 0, e.g., -0.27 or -0.18.
|
||||
|
||||
2. Combo TER value table.
|
||||
2.1 Both Feeder fund TER and Master fund TER exist.
|
||||
The relevant table header is like this:
|
||||
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
|
||||
Please output separately as below:
|
||||
- "feeder fund share class" and "TER feeder" values
|
||||
- "Master fund" and "TER Master" values
|
||||
Here is the example:
|
||||
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
|
||||
|
||||
The output should be:
|
||||
[
|
||||
{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
|
||||
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
|
||||
]
|
||||
|
||||
The TER and performance fees values are percentages, so each value should be less than 100.
|
||||
In most cases, the data is in the table(s) of the context.
|
||||
|
||||
3.3 Fund name / share class name extraction from the context above
|
||||
Please extract fund name and share class name from the context.
|
||||
If you can't find the fund name or share class name in the table contents,
|
||||
please try to find them from parsed text contents.
|
||||
|
||||
3.4 Output
|
||||
If there are multiple TER / performance fee values in the same row, please extract the latest.
|
||||
If possible, please extract fund name, share class name, TER or performance fees value as the output.
|
||||
One fund could be with multiple share classes and relevant TER or performance fees values.
|
||||
One fund could be with multiple share classes and relevant TER values.
|
||||
The output should be JSON format, the format is like:
|
||||
{
|
||||
"text": "text from image",
|
||||
"table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"],
|
||||
"data":
|
||||
[{
|
||||
"fund name": "fund 1",
|
||||
"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
|
||||
},
|
||||
{
|
||||
"fund name": "fund 2",
|
||||
"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
|
||||
}]
|
||||
}
|
||||
[{
|
||||
"fund name": "fund 1",
|
||||
"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
|
||||
},
|
||||
{
|
||||
"fund name": "fund 2",
|
||||
"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
|
||||
}]
|
||||
Only output JSON data.
|
||||
If you can't find a share class name in the context, please output empty JSON data: []
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,38 @@
|
|||
Context:
|
||||
{page_text}
|
||||
|
||||
Instructions:
|
||||
Read the context carefully.
|
||||
Maybe there are TER, performance fees data in the context, the TER reported name could be:
|
||||
Total Expense Ratio, TER, Annualised TER including performance fees,etc.
|
||||
The performance fees reported name could be:
|
||||
performance fees, performance fees ratio, etc.
|
||||
If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
|
||||
TER including performance fees - TER excluding performance fees.
|
||||
Special cases
|
||||
1. Performance fees is part of TER.
|
||||
If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
|
||||
The TER should be "TER including performance fees" or "TER with performance".
|
||||
The performance fees should be:
|
||||
"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
|
||||
The performance fees value can be negative or less than 0, e.g., -0.27 or -0.18.
|
||||
|
||||
2. Combo TER value table.
|
||||
2.1 Both Feeder fund TER and Master fund TER exist.
|
||||
The relevant table header is like this:
|
||||
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
|
||||
Please output separately as below:
|
||||
- "feeder fund share class" and "TER feeder" values
|
||||
- "Master fund" and "TER Master" values
|
||||
Here is the example:
|
||||
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
|
||||
|
||||
The output should be:
|
||||
[
|
||||
{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
|
||||
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
|
||||
]
|
||||
|
||||
The TER and performance fees values are percentages, so each value should be less than 100.
|
||||
The performance fees value can be negative, e.g. -0.2 or -0.67.
|
||||
In most cases, the data is in the table(s) of the context.
|
||||
If there are multiple TER / performance fee values in the same row, please extract the latest.
|
||||
If possible, please extract fund name, share class name, TER or performance fees value as the output.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,147 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from utils.biz_utils import add_slash_to_text_as_regex\n",
|
||||
"import json\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"regex = r\"Turnover \\n\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Turnover\\\\s+\\\\n'"
|
||||
]
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"add_slash_to_text_as_regex(regex)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<re.Match object; span=(141, 151), match='Turnover \\n'>"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"re.search(regex, text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"TOR no match\n",
|
||||
"Turnover\\*\\s+ no match\n",
|
||||
"Turnover\\s+ match Turnover \n",
|
||||
"Turnover\\s+Ratio no match\n",
|
||||
"Turnover\\s+Rate no match\n",
|
||||
"Portfolio\\s+Turnover no match\n",
|
||||
"Portfolio\\s+turnover\\s+ratio no match\n",
|
||||
"Portfolio\\s+turnover\\s+rate no match\n",
|
||||
"PTR no match\n",
|
||||
"Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" datapoint_keywords_config = json.load(file)\n",
|
||||
"\n",
|
||||
"tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
|
||||
"\n",
|
||||
"for tor_regex in tor_regex_list:\n",
|
||||
" regex = add_slash_to_text_as_regex(tor_regex)\n",
|
||||
" search = re.search(regex, text)\n",
|
||||
" if search:\n",
|
||||
" print(f\"{regex} match {search.group()}\")\n",
|
||||
" else:\n",
|
||||
" print(f\"{regex} no match\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "torch2_real",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -11,4 +11,12 @@ def add_slash_to_text_as_regex(text: str):
|
|||
if replace not in text:
|
||||
text = re.sub(replace, replace, text)
|
||||
text = re.sub(r"\s+", r"\\s+", text)
|
||||
return text
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """Normalize text before keyword matching.

    The text is lowercased, literal ``\\uXXXX`` escape sequences (a backslash,
    ``u`` and four hexadecimal digits, e.g. ``\\u2004`` or ``\\u00a0``) are
    replaced by a space, leading/trailing whitespace is stripped, and runs of
    two or more spaces are collapsed into a single space.

    Only the space character is collapsed: newlines and tabs inside the text
    are left untouched.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized, lowercased text.
    """
    text = text.lower()
    # Replace escaped special characters written as "\uXXXX" with a space.
    # Only hex digits are accepted (the text is already lowercased), so
    # unrelated sequences such as "\users" are not mistaken for an escape.
    text = re.sub(r"\\u[0-9a-f]{4}", " ", text)
    # Collapse runs of spaces only; other whitespace is deliberately kept.
    return re.sub(r" {2,}", " ", text.strip())
|
||||
Loading…
Reference in New Issue