diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json index 49fe551..23681c1 100644 --- a/configuration/datapoint_keyword.json +++ b/configuration/datapoint_keyword.json @@ -200,9 +200,9 @@ "tor": { "english": [ "TOR", - "Turnover*", - "Turnover", - "Turnover Ratio", + "Turnover* \\n", + "Turnover \\n", + "Turnover Ratio", "Turnover Rate", "Portfolio Turnover", "Portfolio turnover ratio", @@ -339,7 +339,9 @@ "Performance Fee", "Performance Fees", "performance-based fee", - "performance-related fee" + "performance-related fee", + "with performance)", + "with performance fee)" ], "spanish": [ "Comisión de Gestión sobre Resultados", diff --git a/core/page_filter.py b/core/page_filter.py index 7a0774a..a641b42 100644 --- a/core/page_filter.py +++ b/core/page_filter.py @@ -6,7 +6,7 @@ import pandas as pd from utils.pdf_util import PDFUtil from utils.sql_query_util import query_document_fund_mapping from utils.logger import logger -from utils.biz_utils import add_slash_to_text_as_regex +from utils.biz_utils import add_slash_to_text_as_regex, clean_text class FilterPages: @@ -96,11 +96,6 @@ class FilterPages: new_keyword = add_slash_to_text_as_regex(keyword) new_keywords.append(new_keyword) return new_keywords - - def clean_text(self, text: str) -> str: - text = text.lower() - text = re.sub(r"\s+", ' ', text.strip()) - return text def start_job(self) -> dict: logger.info(f"Start extracting datapoints from {self.pdf_file}") @@ -118,7 +113,7 @@ class FilterPages: for datapoint in self.datapoint_config.keys(): result[datapoint] = [] for page_num, page_text in self.page_text_dict.items(): - text = self.clean_text(page_text) + text = clean_text(page_text) for datapoint, keywords in self.datapoint_config.items(): # idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean]) for keyword in keywords: diff --git a/instructions/data_extraction_image_prompts.txt 
b/instructions/data_extraction_image_prompts.txt index 444a75d..150473f 100644 --- a/instructions/data_extraction_image_prompts.txt +++ b/instructions/data_extraction_image_prompts.txt @@ -11,36 +11,48 @@ The markdown table(s) will be as output with key: "table_contents". 3. Extract data from upon parsed text and table(s) contents. 3.1 The upon parsed text and table(s) contents as context. 3.2 Data Extraction from parsed table contents -Maybe there are TER, performance fees data in the parsed table(s) contents. -The TER reported name could be:Total Expense Ratio, TER, Annualised TER including performance fees,etc. -The performance fees reported name could be:performance fees, performance fees ratio, etc. -If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be: -TER including performance fees - TER excluding performance fees. +Maybe there are TER, performance fees data in the context, the TER reported name could be: +Total Expense Ratio, TER, Annualised TER including performance fees,etc. +The performance fees reported name could be: +performance fees, performance fees ratio, etc. +Special cases +1. Performance fees is part of TER. +If exist both of "TER including performance fees" or "TER with performance" and "TER excluding performance fees" or "TER without performance", +The TER should be "TER including performance fees" or "TER with performance". +The performance fees should be: +"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees". +The performance fees value can be negative or less than 0, e.g., -0.27 or -0.18. + +2. Combo TER value table. +2.1 Exist Feeder fund TER and Master fund TER. 
+The relevant table header is like this: +Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal +Please output separately as below: +- "feeder fund share class" and "TER feeder" values +- "Master fund" and "TER Master" values +Here is the example: +Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n + +The output should be: +[ +{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]}, +{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]} +] + The TER and performance fees value is percentage number, it means the value should be less than 100. Most of cases, the data is in the table(s) of context. - -3.3 Fund name/ share class name extraction from upon context -Please extract fund name and share class name from the context. -If can't find fund name or share class name from table contents, -please try to find them from parsed text contents. - -3.4 Output +If with multiple TER/ performance fee values in same row, please extract the latest. If possible, please extract fund name, share class name, TER or performance fees value as the output. -One fund could be with multiple share classes and relevant TER or performance fees values. +One fund could be with multiple share classes and relevant TER values. 
The output should be JSON format, the format is like: -{ - "text": "text from image", - "table_contents": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"], - "data": - [{ - "fund name": "fund 1", - "share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}] - }, - { - "fund name": "fund 2", - "share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}] - }] -} +[{ + "fund name": "fund 1", + "share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2}] +}, +{ + "fund name": "fund 2", + "share data": [{"share name": "share a", "ter": 1.16, "performance fees": 0.5},{"share name": "share b", "ter": 1.45, "performance fees": 1.1}] +}] Only output JSON data. If can't find share class name in context, please output empty JSON data: [] diff --git a/instructions/data_extraction_prompts.txt b/instructions/data_extraction_prompts.txt index 6ba469d..f251ab9 100644 --- a/instructions/data_extraction_prompts.txt +++ b/instructions/data_extraction_prompts.txt @@ -1,15 +1,38 @@ Context: {page_text} +Instructions: Read the context carefully. Maybe there are TER, performance fees data in the context, the TER reported name could be: Total Expense Ratio, TER, Annualised TER including performance fees,etc. The performance fees reported name could be: performance fees, performance fees ratio, etc. -If exist both of "TER including performance fees" and "TER excluding performance fees", the performance fees should be: -TER including performance fees - TER excluding performance fees. +Special cases +1. Performance fees is part of TER. 
+If exist both of "TER including performance fees" or "TER with performance" and "TER excluding performance fees" or "TER without performance", +The TER should be "TER including performance fees" or "TER with performance". +The performance fees should be: +"TER including performance fees - TER excluding performance fees" or "TER with performance fees - TER without performance fees". +The performance fees value can be negative or less than 0, e.g., -0.27 or -0.18. + +2. Combo TER value table. +2.1 Exist Feeder fund TER and Master fund TER. +The relevant table header is like this: +Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal +Please output separately as below: +- "feeder fund share class" and "TER feeder" values +- "Master fund" and "TER Master" values +Here is the example: +Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution \u2013 Balanced Class X\n0.1475%\n0.7025%\n0.850%\n + +The output should be: +[ +{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]}, +{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]} +] The TER and performance fees value is percentage number, it means the value should be less than 100. +The performance fees value can be negative, e.g. -0.2 or -0.67. Most of cases, the data is in the table(s) of context. If with multiple TER/ performance fee values in same row, please extract the latest. If possible, please extract fund name, share class name, TER or performance fees value as the output. 
diff --git a/playground.ipynb b/playground.ipynb new file mode 100644 index 0000000..e7bff58 --- /dev/null +++ b/playground.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "from utils.biz_utils import add_slash_to_text_as_regex\n", + "import json\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "regex = r\"Turnover \\n\"" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Turnover\\\\s+\\\\n'" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_slash_to_text_as_regex(regex)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"What was the share of investments made in transitional and enabling activities? 
\\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\"" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re.search(regex, text)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TOR no match\n", + "Turnover\\*\\s+ no match\n", + "Turnover\\s+ match Turnover \n", + "Turnover\\s+Ratio no match\n", + "Turnover\\s+Rate no match\n", + "Portfolio\\s+Turnover no match\n", + "Portfolio\\s+turnover\\s+ratio no match\n", + "Portfolio\\s+turnover\\s+rate no match\n", + "PTR no match\n", + "Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n" + ] + } + ], + "source": [ + "with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n", + " datapoint_keywords_config = json.load(file)\n", + "\n", + "tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n", + "\n", + "for tor_regex in tor_regex_list:\n", + " regex = add_slash_to_text_as_regex(tor_regex)\n", + " search = re.search(regex, text)\n", + " if search:\n", + " print(f\"{regex} match {search.group()}\")\n", + " else:\n", + " print(f\"{regex} no match\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch2_real", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 72346a7..deddead 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -11,4 +11,12 @@ def add_slash_to_text_as_regex(text: str): if replace not in text: text = re.sub(replace, replace, text) text = re.sub(r"\s+", r"\\s+", text) + return text + + +def clean_text(text: str) -> str: + text = text.lower() + # update the specical character which begin with \u, e.g \u2004 or \u00a0 to be space + text = re.sub(r"\\u[0-9a-z]{4}", ' ', text) + text = re.sub(r"( ){2,}", ' ', text.strip()) return text \ No newline at end of file