148 lines
3.2 KiB
Plaintext
148 lines
3.2 KiB
Plaintext
|
|
{
|
||
|
|
"cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 27,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"from utils.biz_utils import add_slash_to_text_as_regex\n",
|
||
|
|
"import json\n",
|
||
|
|
"import re"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 29,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"regex = r\"Turnover \\n\""
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 30,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/plain": [
|
||
|
|
"'Turnover\\\\s+\\\\n'"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 30,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"add_slash_to_text_as_regex(regex)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 42,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover \\nreflects the\\n\""
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 32,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/plain": [
|
||
|
|
"<re.Match object; span=(141, 151), match='Turnover \\n'>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 32,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"re.search(regex, text)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 35,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 43,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"TOR no match\n",
|
||
|
|
"Turnover\\*\\s+ no match\n",
|
||
|
|
"Turnover\\s+ match Turnover \n",
|
||
|
|
"Turnover\\s+Ratio no match\n",
|
||
|
|
"Turnover\\s+Rate no match\n",
|
||
|
|
"Portfolio\\s+Turnover no match\n",
|
||
|
|
"Portfolio\\s+turnover\\s+ratio no match\n",
|
||
|
|
"Portfolio\\s+turnover\\s+rate no match\n",
|
||
|
|
"PTR no match\n",
|
||
|
|
"Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
"# Try each English \"tor\" keyword pattern from the config file against `text`\n",
"# and print, per pattern, whether it matched and which substring it matched.\n",
"with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
"    datapoint_keywords_config = json.load(file)\n",
"\n",
"# A missing \"tor\" or \"english\" key yields an empty list, so the loop below is skipped.\n",
"tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
"\n",
"for tor_regex in tor_regex_list:\n",
"    # Loop-local name on purpose: assigning to `regex` here would overwrite the\n",
"    # notebook-level `regex` that the `re.search(regex, text)` cell reads, so\n",
"    # re-running that cell after this one would silently test a different pattern.\n",
"    tor_pattern = add_slash_to_text_as_regex(tor_regex)\n",
"    search = re.search(tor_pattern, text)\n",
"    if search:\n",
"        print(f\"{tor_pattern} match {search.group()}\")\n",
"    else:\n",
"        print(f\"{tor_pattern} no match\")"
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": []
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"metadata": {
|
||
|
|
"kernelspec": {
|
||
|
|
"display_name": "torch2_real",
|
||
|
|
"language": "python",
|
||
|
|
"name": "python3"
|
||
|
|
},
|
||
|
|
"language_info": {
|
||
|
|
"codemirror_mode": {
|
||
|
|
"name": "ipython",
|
||
|
|
"version": 3
|
||
|
|
},
|
||
|
|
"file_extension": ".py",
|
||
|
|
"mimetype": "text/x-python",
|
||
|
|
"name": "python",
|
||
|
|
"nbconvert_exporter": "python",
|
||
|
|
"pygments_lexer": "ipython3",
|
||
|
|
"version": "3.10.11"
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 2
|
||
|
|
}
|