optimize prompts
This commit is contained in:
parent
15720d8bfd
commit
32676728f6
|
|
@ -200,9 +200,9 @@
|
||||||
"tor": {
|
"tor": {
|
||||||
"english": [
|
"english": [
|
||||||
"TOR",
|
"TOR",
|
||||||
"Turnover*",
|
"Turnover* \\n",
|
||||||
"Turnover",
|
"Turnover \\n",
|
||||||
"Turnover Ratio",
|
"Turnover Ratio",
|
||||||
"Turnover Rate",
|
"Turnover Rate",
|
||||||
"Portfolio Turnover",
|
"Portfolio Turnover",
|
||||||
"Portfolio turnover ratio",
|
"Portfolio turnover ratio",
|
||||||
|
|
@ -339,7 +339,9 @@
|
||||||
"Performance Fee",
|
"Performance Fee",
|
||||||
"Performance Fees",
|
"Performance Fees",
|
||||||
"performance-based fee",
|
"performance-based fee",
|
||||||
"performance-related fee"
|
"performance-related fee",
|
||||||
|
"with performance)",
|
||||||
|
"with performance fee)"
|
||||||
],
|
],
|
||||||
"spanish": [
|
"spanish": [
|
||||||
"Comisión de Gestión sobre Resultados",
|
"Comisión de Gestión sobre Resultados",
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@ import pandas as pd
|
||||||
from utils.pdf_util import PDFUtil
|
from utils.pdf_util import PDFUtil
|
||||||
from utils.sql_query_util import query_document_fund_mapping
|
from utils.sql_query_util import query_document_fund_mapping
|
||||||
from utils.logger import logger
|
from utils.logger import logger
|
||||||
from utils.biz_utils import add_slash_to_text_as_regex
|
from utils.biz_utils import add_slash_to_text_as_regex, clean_text
|
||||||
|
|
||||||
|
|
||||||
class FilterPages:
|
class FilterPages:
|
||||||
|
|
@ -97,11 +97,6 @@ class FilterPages:
|
||||||
new_keywords.append(new_keyword)
|
new_keywords.append(new_keyword)
|
||||||
return new_keywords
|
return new_keywords
|
||||||
|
|
||||||
def clean_text(self, text: str) -> str:
|
|
||||||
text = text.lower()
|
|
||||||
text = re.sub(r"\s+", ' ', text.strip())
|
|
||||||
return text
|
|
||||||
|
|
||||||
def start_job(self) -> dict:
|
def start_job(self) -> dict:
|
||||||
logger.info(f"Start extracting datapoints from {self.pdf_file}")
|
logger.info(f"Start extracting datapoints from {self.pdf_file}")
|
||||||
"""
|
"""
|
||||||
|
|
@ -118,7 +113,7 @@ class FilterPages:
|
||||||
for datapoint in self.datapoint_config.keys():
|
for datapoint in self.datapoint_config.keys():
|
||||||
result[datapoint] = []
|
result[datapoint] = []
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
text = self.clean_text(page_text)
|
text = clean_text(page_text)
|
||||||
for datapoint, keywords in self.datapoint_config.items():
|
for datapoint, keywords in self.datapoint_config.items():
|
||||||
# idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
|
# idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
|
|
|
||||||
|
|
@ -11,36 +11,48 @@ The markdown table(s) will be as output with key: "table_contents".
|
||||||
3. Extract data from upon parsed text and table(s) contents.
|
3. Extract data from upon parsed text and table(s) contents.
|
||||||
3.1 The upon parsed text and table(s) contents as context.
|
3.1 The upon parsed text and table(s) contents as context.
|
||||||
3.2 Data Extraction from parsed table contents
|
3.2 Data Extraction from parsed table contents
|
||||||
Maybe there are TER, performance fees data in the parsed table(s) contents.
|
Maybe there are TER, performance fees data in the context, the TER reported name could be:
|
||||||
The TER reported name could be:Total Expense Ratio, TER, Annualised TER including performance fees,etc.
|
Total Expense Ratio, TER, Annualised TER including performance fees,etc.
|
||||||
The performance fees reported name could be:performance fees, performance fees ratio, etc.
|
The performance fees reported name could be:
|
||||||
If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
|
performance fees, performance fees ratio, etc.
|
||||||
TER including performance fees - TER excluding performance fees.
|
Special cases
|
||||||
|
1. Performance fees is part of TER.
|
||||||
|
If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
|
||||||
|
The TER should be "TER including performance fees" or "TER with performance".
|
||||||
|
The performance fees should be:
|
||||||
|
"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
|
||||||
|
The performance fees value can be negative (less than 0), e.g., -0.27 or -0.18.
|
||||||
|
|
||||||
|
2. Combo TER value table.
|
||||||
|
2.1 Exist Feeder fund TER and Master fund TER.
|
||||||
|
The relevant table header is like this:
|
||||||
|
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
|
||||||
|
Please output separately as below:
|
||||||
|
- "feeder fund share class" and "TER feeder" values
|
||||||
|
- "Master fund" and "TER Master" values
|
||||||
|
Here is the example:
|
||||||
|
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
|
||||||
|
|
||||||
|
The output should be:
|
||||||
|
[
|
||||||
|
{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
|
||||||
|
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
|
||||||
|
]
|
||||||
|
|
||||||
The TER and performance fees value is percentage number, it means the value should be less than 100.
|
The TER and performance fees value is percentage number, it means the value should be less than 100.
|
||||||
Most of cases, the data is in the table(s) of context.
|
Most of cases, the data is in the table(s) of context.
|
||||||
|
If there are multiple TER/performance fee values in the same row, please extract the latest one.
|
||||||
3.3 Fund name/ share class name extraction from upon context
|
|
||||||
Please extract fund name and share class name from the context.
|
|
||||||
If can't find fund name or share class name from table contents,
|
|
||||||
please try to find them from parsed text contents.
|
|
||||||
|
|
||||||
3.4 Output
|
|
||||||
If possible, please extract fund name, share class name, TER or performance fees value as the output.
|
If possible, please extract fund name, share class name, TER or performance fees value as the output.
|
||||||
One fund could be with multiple share classes and relevant TER or performance fees values.
|
One fund could be with multiple share classes and relevant TER values.
|
||||||
The output should be JSON format, the format is like:
|
The output should be JSON format, the format is like:
|
||||||
|
[{
|
||||||
|
"fund name": "fund 1",
|
||||||
|
"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"text": "text from image",
|
"fund name": "fund 2",
|
||||||
"table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"],
|
"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
|
||||||
"data":
|
}]
|
||||||
[{
|
|
||||||
"fund name": "fund 1",
|
|
||||||
"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"fund name": "fund 2",
|
|
||||||
"share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}]
|
|
||||||
}]
|
|
||||||
}
|
|
||||||
Only output JSON data.
|
Only output JSON data.
|
||||||
If can't find share class name in context, please output empty JSON data: []
|
If can't find share class name in context, please output empty JSON data: []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,38 @@
|
||||||
Context:
|
Context:
|
||||||
{page_text}
|
{page_text}
|
||||||
|
|
||||||
|
Instructions:
|
||||||
Read the context carefully.
|
Read the context carefully.
|
||||||
Maybe there are TER, performance fees data in the context, the TER reported name could be:
|
Maybe there are TER, performance fees data in the context, the TER reported name could be:
|
||||||
Total Expense Ratio, TER, Annualised TER including performance fees,etc.
|
Total Expense Ratio, TER, Annualised TER including performance fees,etc.
|
||||||
The performance fees reported name could be:
|
The performance fees reported name could be:
|
||||||
performance fees, performance fees ratio, etc.
|
performance fees, performance fees ratio, etc.
|
||||||
If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be:
|
Special cases
|
||||||
TER including performance fees - TER excluding performance fees.
|
1. Performance fees is part of TER.
|
||||||
|
If both "TER including performance fees" (or "TER with performance") and "TER excluding performance fees" (or "TER without performance") exist,
|
||||||
|
The TER should be "TER including performance fees" or "TER with performance".
|
||||||
|
The performance fees should be:
|
||||||
|
"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees".
|
||||||
|
The performance fees value can be negative (less than 0), e.g., -0.27 or -0.18.
|
||||||
|
|
||||||
|
2. Combo TER value table.
|
||||||
|
2.1 Exist Feeder fund TER and Master fund TER.
|
||||||
|
The relevant table header is like this:
|
||||||
|
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
|
||||||
|
Please output separately as below:
|
||||||
|
- "feeder fund share class" and "TER feeder" values
|
||||||
|
- "Master fund" and "TER Master" values
|
||||||
|
Here is the example:
|
||||||
|
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
|
||||||
|
|
||||||
|
The output should be:
|
||||||
|
[
|
||||||
|
{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
|
||||||
|
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
|
||||||
|
]
|
||||||
|
|
||||||
The TER and performance fees value is percentage number, it means the value should be less than 100.
|
The TER and performance fees value is percentage number, it means the value should be less than 100.
|
||||||
|
The performance fees value can be negative, e.g. -0.2 or -0.67.
|
||||||
Most of cases, the data is in the table(s) of context.
|
Most of cases, the data is in the table(s) of context.
|
||||||
If there are multiple TER/performance fee values in the same row, please extract the latest one.
|
If there are multiple TER/performance fee values in the same row, please extract the latest one.
|
||||||
If possible, please extract fund name, share class name, TER or performance fees value as the output.
|
If possible, please extract fund name, share class name, TER or performance fees value as the output.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,147 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from utils.biz_utils import add_slash_to_text_as_regex\n",
|
||||||
|
"import json\n",
|
||||||
|
"import re"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"regex = r\"Turnover \\n\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Turnover\\\\s+\\\\n'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"add_slash_to_text_as_regex(regex)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 32,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<re.Match object; span=(141, 151), match='Turnover \\n'>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 32,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"re.search(regex, text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"TOR no match\n",
|
||||||
|
"Turnover\\*\\s+ no match\n",
|
||||||
|
"Turnover\\s+ match Turnover \n",
|
||||||
|
"Turnover\\s+Ratio no match\n",
|
||||||
|
"Turnover\\s+Rate no match\n",
|
||||||
|
"Portfolio\\s+Turnover no match\n",
|
||||||
|
"Portfolio\\s+turnover\\s+ratio no match\n",
|
||||||
|
"Portfolio\\s+turnover\\s+rate no match\n",
|
||||||
|
"PTR no match\n",
|
||||||
|
"Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
|
||||||
|
" datapoint_keywords_config = json.load(file)\n",
|
||||||
|
"\n",
|
||||||
|
"tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
|
||||||
|
"\n",
|
||||||
|
"for tor_regex in tor_regex_list:\n",
|
||||||
|
" regex = add_slash_to_text_as_regex(tor_regex)\n",
|
||||||
|
" search = re.search(regex, text)\n",
|
||||||
|
" if search:\n",
|
||||||
|
" print(f\"{regex} match {search.group()}\")\n",
|
||||||
|
" else:\n",
|
||||||
|
" print(f\"{regex} no match\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "torch2_real",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.11"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
|
|
@ -12,3 +12,11 @@ def add_slash_to_text_as_regex(text: str):
|
||||||
text = re.sub(replace, replace, text)
|
text = re.sub(replace, replace, text)
|
||||||
text = re.sub(r"\s+", r"\\s+", text)
|
text = re.sub(r"\s+", r"\\s+", text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text: str) -> str:
    r"""Lower-case *text* and normalise its space characters.

    The steps run in this order:

    1. Lower-case the whole text.
    2. Replace every literal escape sequence (a backslash, ``u`` and four
       hex digits, e.g. the six characters ``\u2004``) with one space.
    3. Replace every real Unicode space separator (no-break space,
       en/em/thin spaces, ideographic space, ...) with one space.
    4. Strip leading and trailing whitespace.
    5. Collapse runs of two or more spaces into a single space.

    Newlines and tabs inside the text are kept as they are; only space
    characters are collapsed.

    Args:
        text: Raw text to normalise.

    Returns:
        The normalised, lower-cased text.
    """
    text = text.lower()
    # Literal escapes left in the text. The text is already lower-cased, so
    # hex digits are 0-9/a-f; using a-z here would also eat ordinary text
    # such as "\users".
    text = re.sub(r"\\u[0-9a-f]{4}", " ", text)
    # Real Unicode space separators (category Zs other than U+0020). Line
    # breaks are deliberately not listed, so they survive.
    # NOTE(review): presumably newline-anchored keywords rely on this -- confirm.
    text = re.sub(r"[\u00a0\u1680\u2000-\u200a\u202f\u205f\u3000]", " ", text)
    return re.sub(r" {2,}", " ", text.strip())
|
||||||
Loading…
Reference in New Issue