support filter pages by data point keywords

parent 993664cf78
commit 6519dc23d4
.gitignore
@@ -2,3 +2,4 @@
 /log
 /utils/__pycache__
 /__pycache__/*.pyc
+/core/__pycache__/*.pyc
configuration/datapoint_keyword.json (new file)
@@ -0,0 +1,468 @@
{
    "ISIN": {
        "english": ["ISIN", "ISIN Code"],
        "spanish": ["ISIN", "ISIN Code"],
        "german": ["ISIN", "ISIN Code"],
        "dutch": ["ISIN", "ISIN Code"],
        "french": ["ISIN", "ISIN Code"],
        "finnish": ["ISIN", "ISIN Code"],
        "swedish": ["ISIN", "ISIN Code"],
        "danish": ["ISIN", "ISIN Code"],
        "norwegian": ["ISIN", "ISIN Code"],
        "lithuanian": ["ISIN", "ISIN Code"],
        "polish": ["ISIN", "ISIN Code"],
        "latvian": ["ISIN", "ISIN Code"],
        "india(keywords)": ["ISIN", "ISIN Code"],
        "estonian": ["ISIN", "ISIN Code"],
        "malay": ["ISIN", "ISIN Code"],
        "italian": ["ISIN", "ISIN Code"],
        "portuguese": ["ISIN", "ISIN Code"]
    },
    "ter": {
        "english": [
            "Synthetic TER", "Fund TER", "TER", "T.E.R", "TER_REF",
            "Total Expense Ratio", "Total Expense Ratios", "Expense ratio",
            "Total Fund Charge", "Gross Expense Ratio", "Gross Expense Ratios",
            "all-in-fee", "all-in fee", "all in fee",
            "Total Net Expense Ratio", "Total Net Expense Ratios",
            "Total Operating Expense", "Expense Ratio",
            "Expense Ratio -Direct", "Expense Ratio -Regular",
            "month End Expense ratio", "expense ratio", "Expenses Ratios",
            "Weighted AverageExpense Ratio", "expenses ratio",
            "Total Expense as % of AAuM", "Total Expenses as a % of AAuM",
            "Recurring Expenses as a percentage to Average Net Assets",
            "Total Expenses as % of AAuM", "Income and Expenditure",
            "Expenditure at Plan level as %",
            "Total Expenses Inclusive of Management Fees of Direct Plan",
            "Total Expenses Inclusive of Management Fees of Regular Plan"
        ],
        "spanish": ["Rácio da despesa total", "Ratio Total de Gastos", "TER", "Ratio de gastos totales"],
        "german": ["Gesamtgebühren", "Kostenpauschale", "Gesamtkostenquote", "Gesamtaufwandsquoten", "Betriebskostenquote des Fonds", "TER", "Total Expense Ratio", "Total Expense Ratios"],
        "dutch": ["Totale-kostenpercentage", "Totale Kostenratio", "TKR", "Total Expense Ratio", "Totale kostenpercentage", "Totaal kostenpercentage"],
        "french": ["Total des frais sur encours", "TFE", "Ratios des charges totales", "Frais sur encours", "RCT", "Total des frais sur encours", "TER", "Ratio des dépenses totales", "Ratio de dépenses totales"],
        "finnish": ["palkkiot yhteensä", "total expence ratio"],
        "swedish": ["Totalkostnadsandel", "TER"],
        "danish": ["Administrationsomk", "Omkostningsprocent"],
        "norwegian": ["TER", "Kostnadsratio"],
        "lithuanian": ["Bendrųjų išlaidų koeficientas", "Bendrasis metinis išlaidų rodikli"],
        "polish": ["Współczynnik kosztów całkowitych", "WKC"],
        "latvian": ["Kopējo izdevumu koeficients", "KIK"],
        "india(keywords)": ["Expenditure", "expense ratio", "ratio of", "gross expense"],
        "estonian": ["Kogukulude suhe", "Kogukulude suhe aasta lõikes"],
        "malay": ["NPP", "Nisbah Perbelanjaan Pengurusan"],
        "italian": ["TER", "Total Expense Ratio", "Coefficienti di spesa totale", "Coefficiente di spesa totale"],
        "portuguese": ["Taxa Global de custos", "Quocientes de Despesa Total", "Totale kostenpercentage"],
        "hungarian": ["Összes ráfordítás aránya"]
    },
    "tor": {
        "english": ["TOR", "Turnover*", "Turnover", "Turnover Ratio", "Turnover Rate", "Portfolio Turnover", "Portfolio turnover ratio", "Portfolio turnover rate", "PTR", "Annual Portfolio Turnover Ratio"],
        "india": ["Aggregate Value of Purchase and Sale", "The aggregate value of investments", "The aggregate value of purchases", "the aggregate of sales", "Aggregate value of investments purchased and sold", "The aggregate value of purchases and sales"],
        "spanish": ["Rotación de la Cartera", "Índice de rotación de la cartera", "Ratio de rotación de la cartera"],
        "german": ["Rotation", "Umschlagshaufigkeit", "Portfolioumschlagshäufigkeit", "Umschlagshäufigkeit", "PTR", "Portfolio Turnover Rate", "Portefeuilleumsatz", "Portfolio Turnover Ratio", "Umsatz", "Portfolioumschlagsra", "Umschlagkennziffer", "Portfolioumschlag", "Portfolioumschlagsrate"],
        "dutch": ["Omloopsnelheid", "Omloopfactor", "Omloopsnelheid", "Turnover rate", "Rotatie van de portefeuille", "POF", "Turnover ratio"],
        "french": ["taux de rotation", "Taux de rotation du portefeuille", "Rotation de portefeuille", "Ratio de rotation du portefeuille"],
        "finnish": ["salkun kiertonopeus"],
        "swedish": ["Omsättningshastighet"],
        "danish": ["Omsætningshastighed"],
        "norwegian": ["Omløpshastighet"],
        "india(keywords)": ["Aggregate value"],
        "malay": ["PGP", "Pusing Ganti Portfolio"],
        "italian": ["Tasso di movimentazione del portafoglio", "Turnover del portafoglio", "Indice di Rotazione del Portafoglio"],
        "portuguese": ["Rotação média da carteira", "Índice de Movimentação da Carteira de Investimento"]
    },
    "ogc": {
        "english": ["Synthetic Ongoing Charges excluding", "On-going Charge", "Ongoing Charge", "ongoing charges", "On-going Fee", "Ongoing fee", "OGC", "OGF", "Operation Charge", "On Going Charges", "OC", "Ongoing Charge Figure OCF", "Ongoing Fund Charge", "Operating Charge", "Operating Charges", "Operating expenses", "Operating, Administrative and Servicing Expenses"],
        "spanish": ["Gastos Corrientes", "Gastos Recurrentes"],
        "german": ["Laufende Kosten", "OGC", "Ongoing Charge", "laufende kosten in prozent", "laufenden Kosten", "Betriebskosten", "Betriebsgebühren"],
        "dutch": ["Lopende kosten", "Lopende kosten factor", "LKF", "Ongoing Charge", "OCF"],
        "french": ["Frais courants", "Commission de frais opérationels"],
        "italian": ["Spese Correnti"],
        "portuguese": ["Encargos Correntes", "Custos correntes"]
    },
    "performance_fee": {
        "english": ["Performance Fee", "Performance Fees", "performance-based fee", "performance-related fee"],
        "spanish": ["Comisión de Gestión sobre Resultados", "Comisión sobre Resultados", "Comisión de Rentabilidad", "Comisiones de éxito", "Comisión de Éxito", "Comisión por resultados", "comisión de rentabilidad"],
        "german": ["Erfolgsabhängige Vergütung", "Erfolgsbezogene Vergütung", "Performancegebühren", "An die Wertentwicklung des Fonds gebundene Gebühren", "Performancegebühr", "Performance-gebühr", "Erfolgshonorare", "Erfolgsabhän-giger Vergütung", "Erfolgshonorar", "Performance-Fee", "Erfolgsgebühr", "perfolgsabhängige Verwaltungsvergütung", "performanceabhängige Vergütung", "Performance- gebühren"],
        "dutch": ["Prestatievergoeding", "Performance Fee"],
        "french": ["Les commissions de surperformance", "Commission de performance", "Commissions de surperformance", "frais de performance"],
        "italian": ["Commissioni di performance", "Commissioni legate al rendimento", "Commissioni d’incentivo"],
        "portuguese": ["Comissão de desempenho", "Custos de performance", "Comissão de Gestão Variável"],
        "estonian": [" Edukustasud aasta lõikes"],
        "latvian": ["Gada prēmijas par sasniegtajiem rezultātiem"],
        "Lithuanian": ["Metinis mokestis už veiklos rezultatu"]
    },
    "trading expense ratio": {
        "english": ["Trading expense ratio", "Trading Expense Ratio10"]
    },
    "mer": {
        "english": ["Management expense ratio", "Management expense ratio after taxes", "Expense ratio"]
    },
    "MgtFee": {
        "english": ["Management Fee as % of AAuM", "Management Fee including GST as % of AAuM", "Management Fees", "Management fee inclusive of service tax GST at annualised average rate", "Management and Trusteeship Fees", "Investment Management and Trusteeship fees", "Investment management fees "]
    },
    "max_management_fee": {
        "english": ["management fee", "Periodic Charge", "Advisory", "max_management_fee"]
    },
    "max_front_load": {
        "english": ["Sales charge", "subscription fee", "subscription charge", "subscription commission", "sales fee", "entry fee", "initial charge", "preliminary charge", "preliminary fee", "Entry Charge", "Initial Sales Charge", "max_front_load"]
    },
    "min_initial_purchase": {
        "english": ["Minimum Initial Subscription", "Minimum Subscription", "Minimum Subscription Amount", "Minimum initial investment", "min_initial_purchase"]
    },
    "min_subsequent_purchase": {
        "english": ["Minimum Additional", "Minimum Additional Subscription Amount", "Minimum initial and subsequence subscription", "Minimum Additional Subscription", "Minimum Subsequent Investment", "Minimum Subsequent Purchase", "additional", "min_subsequent_purchase"]
    }
}
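This file maps each data point to per-language keyword lists that the page filter later turns into regular expressions. A minimal sketch of how one entry is read, assuming the file sits under ./configuration/ as the loader in core/page_filter.py below expects (the chosen data point and language are only examples):

import json

with open("./configuration/datapoint_keyword.json", "r", encoding="utf-8") as f:
    keyword_config = json.load(f)

# all configured German spellings of the total expense ratio, e.g. "Gesamtkostenquote"
ter_keywords_de = keyword_config.get("ter", {}).get("german", [])
print(ter_keywords_de)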
configuration/domicile_datapoints.json (new file)
@@ -0,0 +1,30 @@
{
    "CAN": {
        "ar": ["mer", "tor", "trading expense ratio"]
    },
    "IND": {
        "ar": ["ter", "MgtFee", "tor"]
    },
    "default": {
        "ar": ["tor", "ter", "ogc", "performance_fee"],
        "prospectus": ["max_management_fee", "max_front_load", "min_initial_purchase", "min_subsequent_purchase"]
    }
}
configuration/language.json (new file)
@@ -0,0 +1,22 @@
{
    "0L00000122": "english",
    "0LMIX00001": "english",
    "0LMIX00002": "english",
    "0L00000482": "english",
    "0LMIX00003": "german",
    "0L00000152": "german",
    "0L00000114": "dutch",
    "0L00000138": "french",
    "0L00000203": "italian",
    "0L00000408": "spanish",
    "0L00000348": "portuguese",
    "0L00000135": "Finnish",
    "0L00000415": "Swedish",
    "0L00000104": "Danish",
    "0L00000320": "Norwegian",
    "0L00000254": "Lithuanian",
    "0L00000347": "Polish",
    "0L00000250": "Latvian",
    "0L00000127": "Estonian",
    "0L00000273": "Malay"
}
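Taken together, the three configuration files resolve a document to its keyword lists: language.json maps a language id to a language name, domicile_datapoints.json picks which data points to look for given the domicile and document type (falling back to "default"), and datapoint_keyword.json supplies the search terms. A condensed sketch of that lookup, mirroring the fallbacks in FilterPages.get_datapoint_config below (the example id, domicile, and paths are illustrative):

import json

def load(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

language_cfg = load("./configuration/language.json")
domicile_cfg = load("./configuration/domicile_datapoints.json")
keyword_cfg = load("./configuration/datapoint_keyword.json")

language = language_cfg.get("0L00000138", "english")   # -> "french"
domicile, doc_type = "LUX", "ar"
if domicile not in domicile_cfg:                        # unknown domiciles fall back to "default"
    domicile = "default"
for datapoint in domicile_cfg[domicile][doc_type]:      # ["tor", "ter", "ogc", "performance_fee"]
    keywords = keyword_cfg.get(datapoint, {}).get(language, [])
    if language != "english":                           # non-English documents also keep the English terms
        keywords += keyword_cfg.get(datapoint, {}).get("english", [])
    print(datapoint, len(keywords))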
core/page_filter.py (new file)
@@ -0,0 +1,129 @@
import os
import json
import re
import fitz
import pandas as pd
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping
from utils.logger import logger
from utils.biz_utils import add_slash_to_text_as_regex


class FilterPages:
    def __init__(
        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame
    ) -> None:
        self.doc_id = doc_id
        self.pdf_file = pdf_file
        self.page_text_dict = self.get_pdf_page_text_dict()
        if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
            self.document_mapping_info_df = query_document_fund_mapping(doc_id)
        else:
            self.document_mapping_info_df = document_mapping_info_df
        self.get_configuration_from_file()
        self.doc_info = self.get_doc_info()
        self.datapoint_config = self.get_datapoint_config()

    def get_pdf_page_text_dict(self) -> dict:
        # extract the text of every page, keyed by page number
        pdf_util = PDFUtil(self.pdf_file)
        success, text, page_text_dict = pdf_util.extract_text()
        return page_text_dict

    def get_configuration_from_file(self) -> None:
        language_config_file = r"./configuration/language.json"
        domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
        datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
        with open(language_config_file, "r", encoding="utf-8") as file:
            self.language_config = json.load(file)
        with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:
            self.domicile_datapoint_config = json.load(file)
        with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
            self.datapoint_keywords_config = json.load(file)

    def get_doc_info(self) -> dict:
        if len(self.document_mapping_info_df) == 0:
            # fall back to an English Luxembourg annual report when no mapping is found
            return {
                "effective_date": None,
                "document_type": "ar",
                "language_id": "0L00000122",
                "language": "english",
                "domicile": "LUX",
            }
        effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
        document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
        if document_type in [4, 5]:
            document_type = "ar"
        elif document_type == 1:
            document_type = "prospectus"
        language_id = self.document_mapping_info_df["Language"].iloc[0]
        language = self.language_config.get(language_id, None)
        domicile = self.document_mapping_info_df["Domicile"].iloc[0]
        return {
            "effective_date": effective_date,
            "document_type": document_type,
            "language_id": language_id,
            "language": language,
            "domicile": domicile,
        }

    def get_datapoint_config(self) -> dict:
        domicile = self.doc_info.get("domicile", None)
        document_type = self.doc_info.get("document_type", None)
        language = self.doc_info.get("language", None)
        if language is None:
            language = "english"
        if self.domicile_datapoint_config.get(domicile, None) is None:
            domicile = "default"
        if self.domicile_datapoint_config[domicile].get(document_type, None) is None:
            document_type = "ar"
        datapoint_list = self.domicile_datapoint_config[domicile][document_type]
        datapoint_keywords = {}
        for datapoint in datapoint_list:
            keywords = self.datapoint_keywords_config.get(datapoint, {}).get(language, [])
            if len(keywords) > 0:
                keywords = self.optimize_keywords_regex(keywords)
            datapoint_keywords[datapoint] = keywords
            if language != "english":
                # non-English documents also keep the English keywords as a fallback
                english_keywords = self.datapoint_keywords_config.get(datapoint, {}).get("english", [])
                if len(english_keywords) > 0:
                    english_keywords = self.optimize_keywords_regex(english_keywords)
                    datapoint_keywords[datapoint] += english_keywords
        return datapoint_keywords

    def optimize_keywords_regex(self, keywords: list) -> list:
        new_keywords = []
        for keyword in keywords:
            new_keyword = add_slash_to_text_as_regex(keyword)
            new_keywords.append(new_keyword)
        return new_keywords

    def clean_text(self, text: str) -> str:
        text = text.lower()
        text = re.sub(r"\s+", ' ', text.strip())
        return text

    def start_job(self) -> dict:
        """
        1. Iterate the document pages.
        2. Filter them by data point keywords.
        3. The result looks like this:
        {
            "doc_id": "445256897",
            "ter": [5, 6, 10, 15],
            "tor": [6, 8, 10]
        }
        """
        logger.info(f"Start extracting datapoints from {self.pdf_file}")
        result = {"doc_id": self.doc_id}
        for datapoint in self.datapoint_config.keys():
            result[datapoint] = []
        for page_num, page_text in self.page_text_dict.items():
            text = self.clean_text(page_text)
            for datapoint, keywords in self.datapoint_config.items():
                # idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
                for keyword in keywords:
                    search_regex = r"\b{0}\d*\b\s*".format(keyword)
                    if re.search(search_regex, text, re.IGNORECASE):
                        result[datapoint].append(page_num)
                        break
        return result
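A short usage sketch of the class above; the PDF path is a placeholder, the document id is reused from the docstring example, and passing None for the mapping DataFrame makes the constructor fetch the mapping itself via query_document_fund_mapping:

from core.page_filter import FilterPages

filter_pages = FilterPages("445256897", "/data/emea_ar/pdf/445256897.pdf", None)
page_info = filter_pages.start_job()
# e.g. {"doc_id": "445256897", "tor": [6, 8, 10], "ter": [5, 6, 10, 15], ...}
print(page_info)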
main.py
@@ -1,5 +1,65 @@
-def main():
-    print("Hello World!")
+import os
+import json
+import pandas as pd
+from glob import glob
+from tqdm import tqdm
+import time
+from utils.logger import logger
+from utils.pdf_download import download_pdf_from_documents_warehouse
+from utils.sql_query_util import query_document_fund_mapping
+from core.page_filter import FilterPages
+
+
+class EMEA_AR_Parsing:
+    def __init__(self, doc_id: str, pdf_folder: str = r"/data/emea_ar/pdf/") -> None:
+        self.doc_id = doc_id
+        self.pdf_folder = pdf_folder
+        os.makedirs(self.pdf_folder, exist_ok=True)
+        self.pdf_file = self.download_pdf()
+        self.document_mapping_info_df = query_document_fund_mapping(doc_id)
+        self.datapoint_page_info = self.get_datapoint_page_info()
+
+    def download_pdf(self) -> str:
+        pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
+        return pdf_file
+
+    def get_datapoint_page_info(self) -> dict:
+        filter_pages = FilterPages(
+            self.doc_id, self.pdf_file, self.document_mapping_info_df
+        )
+        datapoint_page_info = filter_pages.start_job()
+        return datapoint_page_info
+
+
+def filter_pages(doc_id: str, pdf_folder: str) -> dict:
+    logger.info(f"Parsing EMEA AR for doc_id: {doc_id}")
+    emea_ar_parsing = EMEA_AR_Parsing(doc_id, pdf_folder)
+    return emea_ar_parsing.datapoint_page_info
+
+
+def batch_filter_pdf_files(pdf_folder: str, output_folder: str) -> None:
+    pdf_files = glob(pdf_folder + "*.pdf")
+    result_list = []
+    for pdf_file in tqdm(pdf_files):
+        pdf_base_name = os.path.basename(pdf_file)
+        doc_id = pdf_base_name.split(".")[0]
+        datapoint_page_info = filter_pages(doc_id=doc_id, pdf_folder=pdf_folder)
+        result_list.append(datapoint_page_info)
+    result_df = pd.DataFrame(result_list)
+    result_df.reset_index(drop=True, inplace=True)
+
+    logger.info(f"Saving the result to {output_folder}")
+    os.makedirs(output_folder, exist_ok=True)
+    time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
+    output_file = os.path.join(
+        output_folder,
+        f"datapoint_page_info_{len(result_df)}_documents_{time_stamp}.xlsx",
+    )
+    with pd.ExcelWriter(output_file) as writer:
+        result_df.to_excel(writer, index=False)
+
 
 if __name__ == "__main__":
-    main()
+    pdf_folder = r"/data/emea_ar/small_pdf/"
+    output_folder = r"/data/emea_ar/output/filter_pages/"
+    batch_filter_pdf_files(pdf_folder, output_folder)
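Because each call to filter_pages returns one dict per document, pd.DataFrame(result_list) in batch_filter_pdf_files yields one row per PDF with the page-number lists as cell values. A rough illustration (the ids and page numbers are made up for the example):

import pandas as pd

rows = [
    {"doc_id": "445256897", "ter": [5, 6, 10, 15], "tor": [6, 8, 10]},
    {"doc_id": "445256898", "ter": [], "tor": [12]},
]
df = pd.DataFrame(rows)   # one row per document, one column per data point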
prepare_data.py
@@ -48,7 +48,7 @@ def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str):
     # download pdfs
     logger.info(f"Start downloading {len(doc_id_list)} pdfs")
     os.makedirs(pdf_path, exist_ok=True)
-    for doc_id in tqdm.tqdm(doc_id_list):
+    for doc_id in tqdm(doc_id_list):
         logger.info(f"Downloading pdf for docid: {doc_id}")
         download_pdf_from_documents_warehouse(pdf_directory=pdf_path, doc_id=doc_id)
         time.sleep(1)
@@ -565,17 +565,53 @@ def statistics_document_fund_share_count(provider_mapping_data_file: str):
     describe_stat_df_list = []
     # statistics document mapping information
     doc_mapping_data = pd.read_excel(provider_mapping_data_file, sheet_name="all_data")
+    # set noTor to 0 if the tor column is not nan, 1 otherwise
+    doc_mapping_data["noTor"] = doc_mapping_data["tor"].apply(
+        lambda x: 0 if pd.notna(x) else 1
+    )
+    # set share_noTer to 0 if the share_ter column is not nan, 1 otherwise
+    doc_mapping_data["share_noTer"] = doc_mapping_data["share_ter"].apply(
+        lambda x: 0 if pd.notna(x) else 1
+    )
+    # set share_noOgc to 0 if the share_ogc column is not nan, 1 otherwise
+    doc_mapping_data["share_noOgc"] = doc_mapping_data["share_ogc"].apply(
+        lambda x: 0 if pd.notna(x) else 1
+    )
+    # set share_noPerfFee to 0 if the share_perfFee column is not nan, 1 otherwise
+    doc_mapping_data["share_noPerfFee"] = doc_mapping_data["share_perfFee"].apply(
+        lambda x: 0 if pd.notna(x) else 1
+    )
+
     # statistics doc_mapping_data for counting FundId count based on DocumentId
     logger.info(
         "statistics doc_mapping_data for counting FundId count based on DocumentId"
     )
-    doc_fund_id_df = doc_mapping_data[["DocumentId", "CompanyId", "CompanyName", "FundId"]].drop_duplicates()
+    doc_fund_id_df = doc_mapping_data[["DocumentId", "EffectiveDate", "CompanyId", "CompanyName", "FundId"]].drop_duplicates()
     doc_fund_count = (
-        doc_fund_id_df.groupby(["DocumentId", "CompanyId", "CompanyName"]).size().reset_index(name="fund_count")
+        doc_fund_id_df.groupby(["DocumentId", "EffectiveDate", "CompanyId", "CompanyName"]).size().reset_index(name="fund_count")
     )
     # order by fund_count in descending order
     doc_fund_count = doc_fund_count.sort_values(by="fund_count", ascending=True)
+
+    # set with_ar_data to True if noTor == 0, share_noTer == 0, share_noOgc == 0 or share_noPerfFee == 0
+    doc_fund_count["with_ar_data"] = False
+    for index, row in doc_fund_count.iterrows():
+        document_id = row["DocumentId"]
+        ar_data = doc_mapping_data[
+            (doc_mapping_data["DocumentId"] == document_id)
+            & (
+                (
+                    (doc_mapping_data["noTor"] == 0)
+                    | (doc_mapping_data["share_noTer"] == 0)
+                    | (doc_mapping_data["share_noOgc"] == 0)
+                    | (doc_mapping_data["share_noPerfFee"] == 0)
+                )
+            )
+        ]
+        if len(ar_data) > 0:
+            doc_fund_count.loc[index, "with_ar_data"] = True
+
+
     # statistics fund_count in doc_fund_count by describe and transform to DataFrame
     doc_fund_count_stat_df = get_describe_stat(
         doc_fund_count, "fund_count", "doc_fund_count"
@@ -587,10 +623,10 @@ def statistics_document_fund_share_count(provider_mapping_data_file: str):
         "statistics doc_mapping_data for counting FundClassId count based on DocumentId"
     )
     doc_share_class_id_df = doc_mapping_data[
-        ["DocumentId", "CompanyId", "CompanyName", "FundClassId"]
+        ["DocumentId", "EffectiveDate", "CompanyId", "CompanyName", "FundClassId"]
     ].drop_duplicates()
     doc_share_class_count = (
-        doc_share_class_id_df.groupby(["DocumentId", "CompanyId", "CompanyName"])
+        doc_share_class_id_df.groupby(["DocumentId", "EffectiveDate", "CompanyId", "CompanyName"])
         .size()
         .reset_index(name="share_class_count")
     )
@@ -598,6 +634,24 @@ def statistics_document_fund_share_count(provider_mapping_data_file: str):
     doc_share_class_count = doc_share_class_count.sort_values(
         by="share_class_count", ascending=True
     )
+    # set with_ar_data to True if noTor == 0, share_noTer == 0, share_noOgc == 0 or share_noPerfFee == 0
+    doc_share_class_count["with_ar_data"] = False
+    for index, row in doc_share_class_count.iterrows():
+        document_id = row["DocumentId"]
+        ar_data = doc_mapping_data[
+            (doc_mapping_data["DocumentId"] == document_id)
+            & (
+                (
+                    (doc_mapping_data["noTor"] == 0)
+                    | (doc_mapping_data["share_noTer"] == 0)
+                    | (doc_mapping_data["share_noOgc"] == 0)
+                    | (doc_mapping_data["share_noPerfFee"] == 0)
+                )
+            )
+        ]
+        if len(ar_data) > 0:
+            doc_share_class_count.loc[index, "with_ar_data"] = True
+
     # statistics share_class_count in doc_share_class_count by describe and transform to DataFrame
     doc_share_class_count_stat_df = get_describe_stat(
         doc_share_class_count, "share_class_count", "doc_share_class_count"
@@ -648,6 +702,116 @@ def get_describe_stat(df: pd.DataFrame, column_name: str, stat_type_name: str):
     return stat_df
 
 
+def pickup_document_from_top_100_providers():
+    """
+    Pick up 100 documents from the top 100 providers.
+    The documents have fewer than 10 share classes.
+    The purpose is to analyze the document structure and content of small documents.
+    """
+    provider_mapping_data_file = (
+        r"/data/emea_ar/basic_information/English/provider_mapping_data_statistics.xlsx"
+    )
+    top_100_provider_document_file = (
+        r"/data/emea_ar/basic_information/English/lux_english_ar_from_top_100_provider_since_2020.xlsx"
+    )
+    provider_share_count = pd.read_excel(
+        provider_mapping_data_file, sheet_name="provider_share_count"
+    )
+    # add a new column named share_count_rank to provider_share_count
+    provider_share_count["share_count_rank"] = provider_share_count[
+        "share_class_count"
+    ].rank(method="min", ascending=False)
+
+    top_100_provider_document_all_data = pd.read_excel(
+        top_100_provider_document_file, sheet_name="all_data"
+    )
+
+    top_100_provider_document_share_count = pd.read_excel(
+        top_100_provider_document_file, sheet_name="doc_share_class_count"
+    )
+    top_100_provider_document_share_count = \
+        top_100_provider_document_share_count[top_100_provider_document_share_count["with_ar_data"] == True]
+    top_100_provider_document_share_count.reset_index(drop=True, inplace=True)
+
+    # add the share_count_rank column to top_100_provider_document_share_count by merging with provider_share_count
+    top_100_provider_document_share_count = pd.merge(
+        top_100_provider_document_share_count,
+        provider_share_count,
+        on=["CompanyId"],
+        how="left",
+    )
+    # keep columns: DocumentId, CompanyId, CompanyName_x, share_class_count_x, share_count_rank
+    top_100_provider_document_share_count = top_100_provider_document_share_count[
+        ["DocumentId", "CompanyId", "CompanyName_x", "share_class_count_x", "share_count_rank"]
+    ]
+    # rename column share_class_count_x to share_class_count
+    top_100_provider_document_share_count.rename(
+        columns={"share_class_count_x": "share_class_count",
+                 "CompanyName_x": "Company_Name",
+                 "share_count_rank": "provider_share_count_rank"}, inplace=True
+    )
+    top_100_provider_document_share_count = top_100_provider_document_share_count.sort_values(
+        by=["provider_share_count_rank", "share_class_count"], ascending=True
+    )
+
+    # According to share_count_rank, from 1 to 10,
+    # randomly pick one document with 1 to 10 share classes for each rank
+    data_filter = top_100_provider_document_share_count[
+        (top_100_provider_document_share_count["share_class_count"] <= 10)
+        & (top_100_provider_document_share_count["share_class_count"] >= 1)
+    ]
+    data_filter = data_filter.sort_values(
+        by=["provider_share_count_rank", "share_class_count"], ascending=[True, True]
+    )
+    unique_rank_list = top_100_provider_document_share_count["provider_share_count_rank"].unique().tolist()
+    random_pickup_document_data_list = []
+    for rank in unique_rank_list:
+        data_filter_rank = data_filter[data_filter["provider_share_count_rank"] == rank]
+        if len(data_filter_rank) == 0:
+            # get the first document with this rank from top_100_provider_document_share_count
+            data_filter_rank = top_100_provider_document_share_count[
+                top_100_provider_document_share_count["provider_share_count_rank"] == rank
+            ].head(1)
+        data_filter_rank = data_filter_rank.sample(n=1, random_state=88)
+        random_pickup_document_data_list.append(data_filter_rank)
+    random_pickup_document_data = pd.concat(random_pickup_document_data_list)
+    # sort by provider_share_count_rank in ascending order
+    random_pickup_document_data = random_pickup_document_data.sort_values(
+        by="provider_share_count_rank", ascending=True
+    )
+    random_pickup_document_data.reset_index(drop=True, inplace=True)
+
+    random_pickup_document_mini_data = random_pickup_document_data[
+        ["DocumentId", "provider_share_count_rank"]
+    ]
+    # get all data from top_100_provider_document_all_data by merging with random_pickup_document_mini_data
+    random_pickup_document_all_data = pd.merge(
+        random_pickup_document_mini_data,
+        top_100_provider_document_all_data,
+        on=["DocumentId"],
+        how="left",
+    )
+    # sort random_pickup_document_all_data by provider_share_count_rank, FundLegalName, FundClassLegalName in ascending order
+    random_pickup_document_all_data = random_pickup_document_all_data.sort_values(
+        by=["provider_share_count_rank", "FundLegalName", "FundClassLegalName"], ascending=True
+    )
+
+    random_small_document_data_file = (
+        r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
+    )
+    with pd.ExcelWriter(random_small_document_data_file) as writer:
+        top_100_provider_document_share_count.to_excel(
+            writer, sheet_name="all_doc_with_ar_data", index=False
+        )
+        random_pickup_document_data.to_excel(
+            writer, sheet_name="random_small_document", index=False
+        )
+
+        random_pickup_document_all_data.to_excel(
+            writer, sheet_name="random_small_document_all_data", index=False
+        )
+
+
 if __name__ == "__main__":
     doc_provider_file_path = (
         r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
@@ -664,7 +828,14 @@ if __name__ == "__main__":
     output_folder = r"/data/emea_ar/output/"
     # get_unique_docids_from_doc_provider_data(doc_provider_file_path)
     # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
-    # output_pdf_page_text(pdf_folder, output_folder)
+    pdf_folder = r"/data/emea_ar/small_pdf/"
+    output_folder = r"/data/emea_ar/small_pdf_txt/"
+    random_small_document_data_file = (
+        r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
+    )
+    download_pdf(random_small_document_data_file, 'random_small_document', pdf_folder)
+    output_pdf_page_text(pdf_folder, output_folder)
+
     # extract_pdf_table(pdf_folder, output_folder)
     # analyze_json_error()
@@ -674,4 +845,5 @@ if __name__ == "__main__":
     # provider_mapping_data_file=provider_mapping_data_file,
     # output_folder=basic_info_folder,
     # )
-    statistics_document_fund_share_count(doc_mapping_from_top_100_provider_file)
+    # statistics_document_fund_share_count(doc_mapping_from_top_100_provider_file)
+    # pickup_document_from_top_100_providers()
utils/biz_utils.py (new file)
@@ -0,0 +1,14 @@
import re


def add_slash_to_text_as_regex(text: str):
    if text is None or len(text) == 0:
        return text
    # escape every non-alphanumeric, non-whitespace character so the keyword
    # can be used safely as a regular-expression pattern
    special_char_iter = re.finditer(r"\W", text)
    for special_iter in special_char_iter:
        if len(special_iter.group().strip()) == 0:
            continue
        replace = r"\{0}".format(special_iter.group())
        if replace not in text:
            text = re.sub(replace, replace, text)
    # let any run of whitespace match where the keyword contains spaces
    text = re.sub(r"\s+", r"\\s+", text)
    return text
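For intuition, a few of the configured keywords and the patterns the helper above produces for them (re.escape would be the standard-library alternative, but it does not add the \s+ flexibility for spaces):

from utils.biz_utils import add_slash_to_text_as_regex

print(add_slash_to_text_as_regex("T.E.R"))                 # T\.E\.R
print(add_slash_to_text_as_regex("Total Expense Ratio"))   # Total\s+Expense\s+Ratio
print(add_slash_to_text_as_regex("all-in fee"))            # all\-in\s+fee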
utils/sql_query_util.py (new file)
@@ -0,0 +1,80 @@
import json
import time
from urllib import request
import pandas as pd
import os
import dotenv

# loads the .env file that provides SQL_PASS_KEY
dotenv.load_dotenv()


def query_document_fund_mapping(doc_id):
    count = 1
    while True:
        try:
            document_mapping_info_df = query_data_by_biz_type(
                biztype="getFundInfoByDocId", para=doc_id, return_df=True
            ).drop_duplicates()
            if len(document_mapping_info_df) == 0:
                return document_mapping_info_df
            document_mapping_info_df = document_mapping_info_df.sort_values(
                by=["FundName", "ShareClassName"]
            ).reset_index(drop=True)
            return document_mapping_info_df
        except Exception as e:
            print(e)
            time.sleep(3)
            if count == 5:
                break
            count += 1


def query_investment_by_provider(company_id: str):
    count = 1
    while True:
        try:
            investment_by_provider_df = query_data_by_biz_type(biztype='getInvestmentByProvider',
                                                               para=company_id,
                                                               return_df=True).drop_duplicates()
            investment_by_provider_df = investment_by_provider_df \
                .sort_values(by=['FundName', 'ShareClassName']) \
                .reset_index(drop=True)
            return investment_by_provider_df
        except Exception as e:
            print(e)
            time.sleep(3)
            if count == 5:
                break
            count += 1


def query_data_by_biz_type(biztype: str, para, return_df: bool):
    sqlpass_url = "https://api.morningstar.com/sqlpassapi/v1/sql"
    url = sqlpass_url + "?sqlName={0}&params={1}".format(biztype, str(para))
    headers = {"ApiKey": os.getenv("SQL_PASS_KEY")}
    if return_df:
        return pd.DataFrame(query_data_by_url(url, headers))
    else:
        return query_data_by_url(url, headers)


def query_data_by_url(url, headers):
    res = None
    count = 1
    while True:
        try:
            req = request.Request(url=url, headers=headers)
            res = request.urlopen(req)
            res = res.read().decode(encoding="utf-8", errors="ignore")
            break
        except Exception as e:
            print(e)
            time.sleep(3)
            if count == 5:
                break
            count += 1
    if res is not None:
        dic = json.loads(res)
        return dic["result"]
    else:
        return None
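A minimal sketch of calling the wrapper directly; it assumes SQL_PASS_KEY is set in the environment (or .env) and that the caller handles the None returned after five failed attempts (the document id is the same illustrative one used earlier):

from utils.sql_query_util import query_document_fund_mapping

mapping_df = query_document_fund_mapping("445256897")
if mapping_df is None or len(mapping_df) == 0:
    print("no fund/share-class mapping found")
else:
    print(mapping_df[["FundName", "ShareClassName"]].head())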