dc-ml-emea-ar/playground.ipynb

148 lines
3.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils.biz_utils import add_slash_to_text_as_regex\n",
"import json\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"regex = r\"Turnover \\n\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Turnover\\\\s+\\\\n'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_slash_to_text_as_regex(regex)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<re.Match object; span=(141, 151), match='Turnover \\n'>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re.search(regex, text)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TOR no match\n",
"Turnover\\*\\s+ no match\n",
"Turnover\\s+ match Turnover \n",
"Turnover\\s+Ratio no match\n",
"Turnover\\s+Rate no match\n",
"Portfolio\\s+Turnover no match\n",
"Portfolio\\s+turnover\\s+ratio no match\n",
"Portfolio\\s+turnover\\s+rate no match\n",
"PTR no match\n",
"Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
]
}
],
"source": [
"# Try every English \"tor\" keyword pattern from the config file against the sample `text`\n",
"# and print, per converted pattern, either the matched substring or \"no match\".\n",
"with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as config_file:\n",
"    datapoint_keywords_config = json.load(config_file)\n",
"\n",
"# Missing \"tor\" or \"english\" keys fall back to an empty list, so the loop simply does nothing.\n",
"tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
"\n",
"for tor_regex in tor_regex_list:\n",
"    # Cell-local names on purpose: assigning to `regex` here would overwrite the\n",
"    # notebook-level `regex` that the earlier re.search(regex, text) cell reads,\n",
"    # so re-running that cell afterwards would silently test a different pattern.\n",
"    # NOTE(review): add_slash_to_text_as_regex appears to turn whitespace into \\s+\n",
"    # (see the output of the earlier cell) - confirm against utils.biz_utils.\n",
"    tor_pattern = add_slash_to_text_as_regex(tor_regex)\n",
"    tor_match = re.search(tor_pattern, text)\n",
"    if tor_match:\n",
"        print(f\"{tor_pattern} match {tor_match.group()}\")\n",
"    else:\n",
"        print(f\"{tor_pattern} no match\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "torch2_real",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}