# %%
from utils.biz_utils import add_slash_to_text_as_regex
import json
import re

# %%
# Probe pattern: a keyword followed by a space and a line break.
regex = r"Turnover \n"

# %%
# Show how the project helper rewrites the probe pattern.
# The recorded output for this cell is 'Turnover\\s+\\n'.
add_slash_to_text_as_regex(regex)

# %%
# Sample page text in which "Turnover" is followed by more words rather than
# by a line break.
text = "What was the share of investments made in transitional and enabling activities? \nTaxonomy-aligned\nactivities are expressed \nas a share of\n\u2022\t Turnover reflects the\n"

# %%
# Search the sample text with the raw (un-normalised) probe pattern.
re.search(regex, text)

# %%
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"

# %%
# Load the keyword configuration and pull the English patterns stored under
# the "tor" key; missing keys fall back to an empty list.
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
    datapoint_keywords_config = json.load(file)

tor_regex_list = datapoint_keywords_config.get("tor", {}).get("english", [])

# Try every configured keyword against the sample text and report the result.
# Loop-local names are used on purpose: rebinding `regex` here would overwrite
# the probe pattern defined above and make the earlier cells non-repeatable.
for tor_regex in tor_regex_list:
    tor_pattern = add_slash_to_text_as_regex(tor_regex)
    tor_match = re.search(tor_pattern, text)
    if tor_match:
        print(f"{tor_pattern} match {tor_match.group()}")
    else:
        print(f"{tor_pattern} no match")

# %%
from utils.sql_query_util import query_investment_by_provider, query_document_fund_mapping
import pandas as pd

# %%
# Fetch the investment mapping for a single provider (company) id.
provider_mapping = query_investment_by_provider(company_id="0C00008QVP")
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ProviderIdProviderNameFundIdFundNameISINSecIdCurrencyIdShareClassNameShareClassStatus
8400C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH4T. Rowe Price Funds Series II SICAV - Credit O...LU1053597990F000010MEEUSDT. Rowe Price Funds Series II SICAV - Credit O...0
8410C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH4T. Rowe Price Funds Series II SICAV - Credit O...LU1053597727F000010MEFUSDT. Rowe Price Funds Series II SICAV - Credit O...0
8420C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LU0993574440F000010MEGUSDT. Rowe Price Funds Series II SICAV - Floating...1
8430C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LU1805616171F000010PUNCHFT. Rowe Price Funds Series II SICAV - Floating...0
8440C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LU1076358073F000010MEHEURT. Rowe Price Funds Series II SICAV - Floating...0
8450C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LU2046740358F0000143Y8USDT. Rowe Price Funds Series II SICAV - Floating...0
8460C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LU2046740432F0000143Y9USDT. Rowe Price Funds Series II SICAV - Floating...0
8470C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LU0993569101F00001564HUSDT. Rowe Price Funds Series II SICAV - Floating...0
8480C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LU2122516821F000014UPKAUDT. Rowe Price Funds Series II SICAV - Floating...0
\n", "
" ], "text/plain": [ " ProviderId ProviderName FundId \\\n", "840 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n", "841 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n", "842 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "843 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "844 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "845 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "846 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "847 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "848 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "\n", " FundName ISIN \\\n", "840 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597990 \n", "841 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597727 \n", "842 T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n", "843 T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n", "844 T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n", "845 T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n", "846 T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n", "847 T. Rowe Price Funds Series II SICAV - Floating... LU0993569101 \n", "848 T. Rowe Price Funds Series II SICAV - Floating... LU2122516821 \n", "\n", " SecId CurrencyId ShareClassName \\\n", "840 F000010MEE USD T. Rowe Price Funds Series II SICAV - Credit O... \n", "841 F000010MEF USD T. Rowe Price Funds Series II SICAV - Credit O... \n", "842 F000010MEG USD T. Rowe Price Funds Series II SICAV - Floating... \n", "843 F000010PUN CHF T. Rowe Price Funds Series II SICAV - Floating... \n", "844 F000010MEH EUR T. Rowe Price Funds Series II SICAV - Floating... \n", "845 F0000143Y8 USD T. Rowe Price Funds Series II SICAV - Floating... \n", "846 F0000143Y9 USD T. 
# %%
# Keep only the rows whose FundName contains the umbrella fund name.
# regex=False: the name is a literal; with the default regex matching each "."
#              would match any character.
# na=False:    rows without a FundName are treated as non-matching instead of
#              putting NA into the boolean mask.
provider_mapping[
    provider_mapping["FundName"].str.contains(
        "T. Rowe Price Funds Series II SICAV", regex=False, na=False
    )
]

# %%
# Fetch the fund / share-class mapping for one document id.
document_mapping = query_document_fund_mapping(doc_id="486378555")
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentIdEffectiveDateDocumentTypeFormatLanguageDocumentStatusProviderIdProviderNameFundIdFundNameDomicileSecIdCurrencyIdShareClassNameISIN
04863785552022-06-304PDF0L0000012210C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LUXF000010MEGUSDT. Rowe Price Funds Series II SICAV - Floating...LU0993574440
14863785552022-06-304PDF0L0000012210C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LUXF000010PUNCHFT. Rowe Price Funds Series II SICAV - Floating...LU1805616171
24863785552022-06-304PDF0L0000012210C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LUXF000010MEHEURT. Rowe Price Funds Series II SICAV - Floating...LU1076358073
34863785552022-06-304PDF0L0000012210C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LUXF0000143Y8USDT. Rowe Price Funds Series II SICAV - Floating...LU2046740358
44863785552022-06-304PDF0L0000012210C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LUXF0000143Y9USDT. Rowe Price Funds Series II SICAV - Floating...LU2046740432
54863785552022-06-304PDF0L0000012210C00008QVPT. Rowe Price (Luxembourg) Management S.à r.l.FS0000DUH5T. Rowe Price Funds Series II SICAV - Floating...LUXF000014UPKAUDT. Rowe Price Funds Series II SICAV - Floating...LU2122516821
\n", "
" ], "text/plain": [ " DocumentId EffectiveDate DocumentType Format Language DocumentStatus \\\n", "0 486378555 2022-06-30 4 PDF 0L00000122 1 \n", "1 486378555 2022-06-30 4 PDF 0L00000122 1 \n", "2 486378555 2022-06-30 4 PDF 0L00000122 1 \n", "3 486378555 2022-06-30 4 PDF 0L00000122 1 \n", "4 486378555 2022-06-30 4 PDF 0L00000122 1 \n", "5 486378555 2022-06-30 4 PDF 0L00000122 1 \n", "\n", " ProviderId ProviderName FundId \\\n", "0 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "1 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "2 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "3 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "4 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "5 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", "\n", " FundName Domicile SecId \\\n", "0 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEG \n", "1 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010PUN \n", "2 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEH \n", "3 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y8 \n", "4 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y9 \n", "5 T. Rowe Price Funds Series II SICAV - Floating... LUX F000014UPK \n", "\n", " CurrencyId ShareClassName ISIN \n", "0 USD T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n", "1 CHF T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n", "2 EUR T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n", "3 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n", "4 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n", "5 AUD T. Rowe Price Funds Series II SICAV - Floating... 
# %%
# Display the mapping returned for the document.
document_mapping

# %%
# Distinct ShareClassName values of the mapping, as a plain Python list.
[*document_mapping["ShareClassName"].unique()]

# %%
import pymupdf4llm
def get_fund_name(fund_name: str, fund_feature: str) -> str:
    """Return the last fund name found in a run of concatenated fund names.

    ``fund_name`` is split on ``fund_feature`` (e.g. ``"Fund"``). The last
    non-blank piece, stripped of surrounding whitespace, is re-joined with the
    feature as ``"<piece> <fund_feature>"``.

    Args:
        fund_name: Text that may contain several fund names, e.g.
            ``"A Fund B Fund C Fund"``.
        fund_feature: Marker word that the fund names share.

    Returns:
        The rebuilt last fund name. ``fund_name`` is returned unchanged when
        ``fund_feature`` is empty or does not occur in it. ``fund_feature``
        alone is returned when the text holds nothing but the feature and
        whitespace.
    """
    # str.split("") raises ValueError; without a marker there is nothing to isolate.
    if not fund_feature:
        return fund_name

    segments = fund_name.split(fund_feature)
    if len(segments) == 1:
        # The feature does not occur in the text.
        return fund_name

    # Scan from the end for the first piece that still has content. A trailing
    # feature leaves an empty last piece, and repeated features
    # ("A Fund Fund") can leave several blank pieces in a row.
    last_fund = next(
        (piece.strip() for piece in reversed(segments) if piece.strip()),
        "",
    )
    if not last_fund:
        # Only the feature itself was present; avoid emitting " <feature>".
        return fund_feature
    return f"{last_fund} {fund_feature}"
"mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 2 }