{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Turnover keyword regex check\n",
    "\n",
    "Checks which `tor` / `english` patterns from `./configuration/datapoint_keyword.json` match a sample text after each pattern is passed through `add_slash_to_text_as_regex`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "\n",
    "from utils.biz_utils import add_slash_to_text_as_regex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-written pattern used for the manual checks in the next cells.\n",
    "regex = r\"Turnover \\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Turnover\\\\s+\\\\n'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show how the helper rewrites the hand-written pattern.\n",
    "add_slash_to_text_as_regex(regex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample text that every pattern below is searched against.\n",
    "text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Search the sample text with the raw, un-rewritten pattern.\n",
    "re.search(regex, text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative path: resolved against the kernel's current working directory.\n",
    "datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TOR no match\n",
      "Turnover\\*\\s+ no match\n",
      "Turnover\\s+ match Turnover \n",
      "Turnover\\s+Ratio no match\n",
      "Turnover\\s+Rate no match\n",
      "Portfolio\\s+Turnover no match\n",
      "Portfolio\\s+turnover\\s+ratio no match\n",
      "Portfolio\\s+turnover\\s+rate no match\n",
      "PTR no match\n",
      "Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
     ]
    }
   ],
   "source": [
    "with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
    "    datapoint_keywords_config = json.load(file)\n",
    "\n",
    "# Missing \"tor\" or \"english\" keys fall back to an empty list, so the loop simply does nothing.\n",
    "tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
    "\n",
    "# Loop-local names keep the notebook-level `regex` defined earlier intact,\n",
    "# so the cells above give the same result when re-run after this one.\n",
    "for tor_regex in tor_regex_list:\n",
    "    tor_pattern = add_slash_to_text_as_regex(tor_regex)\n",
    "    tor_match = re.search(tor_pattern, text)\n",
    "    if tor_match:\n",
    "        print(f\"{tor_pattern} match {tor_match.group()}\")\n",
    "    else:\n",
    "        print(f\"{tor_pattern} no match\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch2_real",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}