supplement EMEA AR configuration files

This commit is contained in:
Blade He 2025-01-16 11:30:44 -06:00
parent 9f0e77a11e
commit db0827435b
19 changed files with 2104 additions and 187 deletions

View File

@ -0,0 +1,17 @@
{
"ISIN": {
"english": []
},
"ter": {
"english": []
},
"tor": {
"english": []
},
"ogc": {
"english": ["operating expenses paid"]
},
"performance_fee": {
"english": ["Performance fees payable"]
}
}

View File

@ -0,0 +1,526 @@
{
"ISIN": {
"english": [
"ISIN",
"ISIN Code"
],
"spanish": [
"ISIN",
"ISIN Code"
],
"german": [
"ISIN",
"ISIN Code"
],
"dutch": [
"ISIN",
"ISIN Code"
],
"french": [
"ISIN",
"ISIN Code"
],
"finnish": [
"ISIN",
"ISIN Code"
],
"swedish": [
"ISIN",
"ISIN Code"
],
"danish": [
"ISIN",
"ISIN Code"
],
"norwegian": [
"ISIN",
"ISIN Code"
],
"lithuanian": [
"ISIN",
"ISIN Code"
],
"polish": [
"ISIN",
"ISIN Code"
],
"latvian": [
"ISIN",
"ISIN Code"
],
"indiakeywords": [
"ISIN",
"ISIN Code"
],
"estonian": [
"ISIN",
"ISIN Code"
],
"malay": [
"ISIN",
"ISIN Code"
],
"italian": [
"ISIN",
"ISIN Code"
],
"portuguese": [
"ISIN",
"ISIN Code"
]
},
"ter": {
"english": [
"Synthetic TER",
"Fund TER",
"TER",
"T.E.R",
"TER_REF",
"Total Expense Ratio",
"Total Expense Ratios",
"Expense ratio",
"Total Fund Charge",
"Gross Expense Ratio",
"Gross Expense Ratios",
"Capped Expense Ratio",
"all-in-fee",
"all-in fee",
"all in fee",
"Total Net Expense Ratio",
"Total Net Expense Ratios",
"Total Operating Expense",
"Expense Ratio",
"Expense Ratio -Direct",
"Expense Ratio -Regular",
"month End Expense ratio",
"expense ratio",
"Expenses Ratios",
"Weighted AverageExpense Ratio",
"Synthetic total Expense Ratio",
"Annualised TER including performance fees",
"TER (en %) (with performance)",
"Annualised TER % (with fee waiver)",
"expenses ratio",
"Total Expense as % of AAuM",
"Total Expenses as a % of AAuM",
"Recurring Expenses as a percentage to Average Net Assets",
"Total Expenses as % of AAuM",
"Income and Expenditure",
"Expenditure at Plan level as %",
"Total Expenses Inclusive of Management Fees of Direct Plan",
"Total Expenses Inclusive of Management Fees of Regular Plan"
],
"spanish": [
"Rácio da despesa total",
"Ratio Total de Gastos",
"Ratio de gastos totales",
"Porcentaje de gastos totales",
"Ratio de gastos totales"
],
"german": [
"Gesamtgebühren",
"Kostenpauschale",
"Gesamtkostenquote",
"Gesamtaufwandsquoten",
"Pauschalvergütung",
"GESAMTKOSTENANTEIL",
"kostenquote",
"Gesamt kostenquote",
"Betriebskostenquote des Fonds",
"Pauschalgebühr",
"Total Expense Ratio in Prozent",
"Annualisierte TER in % (Mit Gebührenverzicht)"
],
"dutch": [
"Totale-kostenpercentage",
"Totale Kostenratio",
"TKR",
"Totale kostenpercentage",
"Totaal kostenpercentage"
],
"french": [
"Le ratio de dépenses totales",
"Total des frais sur encours",
"TFE",
"Ratios des charges totales",
"Frais sur encours",
"RCT",
"Ratios des charges totales",
"Total des frais sur encours",
"TER",
"Ratio des dépenses totales",
"Ratio de dépenses totales",
"coefficienti di spesa totale",
"Total des commissions et frais imputés à"
],
"finnish": [
"palkkiot yhteensä",
"total expence ratio"
],
"swedish": [
"coefficienti di spesa totale",
"Totalkostnadsandel",
"TER"
],
"danish": [
"Administrationsomk",
"Omkostningsprocent",
"Totalkostnadsandel"
],
"norwegian": [
"Administrationsomk",
"Kostnadsratio",
"Kostnadsratioer for året",
"Omkostningsprocent"
],
"lithuanian": [
"Bendrųjų išlaidų koeficientas",
"Bendrasis metinis išlaidų rodikli"
],
"polish": [
"Współczynnik kosztów całkowitych",
"Bendrųjų išlaidų koeficientas",
"WKC"
],
"latvian": [
"Kopējo izdevumu koeficients",
"KIK",
"Kostnadsratio"
],
"indiakeywords": [
"Expenditure",
"expense ratio",
"ratio of",
"gross expense"
],
"estonian": [
"Kogukulude suhe",
"Kogukulude suhe aasta lõikes"
],
"malay": [
"NPP",
"Nisbah Perbelanjaan Pengurusan"
],
"italian": [
"Commissione forfetaria di gestione",
"Coefficienti di spesa totale",
"Coefficiente di spesa totale"
],
"portuguese": [
"Taxa Global de custos",
"Quocientes de Despesa Total",
"rácios de despesa total"
],
"hungarian": [
"Összes ráfordítás aránya"
]
},
"tor": {
"english": [
"TOR",
"Turnover Ratio",
"Turnover Rate",
"Portfolio Turnover",
"Portfolio turnover ratio",
"Portfolio turnover rate",
"PTR",
"Rotation",
"Annual Portfolio Turnover Ratio",
"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid"
],
"india": [
"Aggregate Value of Purchase and Sale",
"The aggregate value of investments",
"The aggregate value of purchases",
"the aggregate of sales",
"Aggregate value of investments purchased and sold",
"The aggregate value of purchases and sales"
],
"spanish": [
"Rotación de la Cartera",
"Índice de rotación de la cartera",
"Ratio de rotación de la cartera"
],
"german": [
"Umschlagshaufigkeit",
"Portfolioumschlagshäufigkeit",
"Umschlagshäufigkeit",
"Portefeuilleumsatz",
"Portfolio Turnover Ratio",
"Umsatz",
"Portfolioumschlagsra",
"Umschlagkennziffer",
"Portfolioumschlag",
"Portfolioumschlagsrate"
],
"dutch": [
"Omloopsnelheid",
"Omloopfactor",
"Omloopsnelheid",
"Turnover van de portefeuille",
"Rotatie van de portefeuille",
"POF",
"Portefeuille omloop factor",
"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid"
],
"french": [
"taux de rotation",
"Taux de rotation du portefeuille",
"Rotation de portefeuille",
"Ratio de rotation du portefeuille"
],
"finnish": [
"salkun kiertonopeus"
],
"swedish": [
"Omsättningshastighet"
],
"danish": [
"Omsætningshastighed"
],
"norwegian": [
"Omløpshastighet"
],
"indiakeywords": [
"Aggregate value"
],
"malay": [
"PGP",
"Pusing Ganti Portfolio"
],
"italian": [
"Tasso di movimentazione del portafoglio",
"Turnover del portafoglio",
"Indice di Rotazione del Portafoglio"
],
"portuguese": [
"Rotação média da carteira",
"Índice de Movimentação da Carteira de Investimento"
]
},
"ogc": {
"english": [
"Synthetic Ongoing Charges excluding",
"On-going Charge",
"Ongoing Charge",
"ongoing charges",
"On-going Fee",
"Ongoing fee",
"OGC",
"OGF",
"Operation Charge",
"On Going Charges",
"OC",
"Ongoing Charge Figure OCF",
"OCF Cap Rate",
"Ongoing Fund Charge",
"Operating Charge",
"Operating Charges",
"Operating, Administrative and Servicing Expenses"
],
"spanish": [
"Gastos Corrientes",
"Gastos Recurrentes"
],
"german": [
"Laufende Kosten",
"OCF",
"Ongoing Charge",
"Laufende Gebühren",
"laufende kosten in prozent",
"laufenden Kosten",
"Betriebskosten",
"Betriebsgebühren",
"custos correntes",
"Betriebliche Aufwendungen"
],
"dutch": [
"Lopende kosten",
"Lopende kosten factor",
"LKF",
"custos correntes",
"OCF",
"Lopende kosten factor"
],
"french": [
"Frais courants",
"Commission de frais opérationels"
],
"italian": [
"Spese Correnti",
"Durante il funzionamento Addebiti"
],
"portuguese": [
"Encargos Correntes",
"Custos correntes"
],
"swedish": [
"Årliga avgifter",
"pågående avgifter"
],
"danish": [
"Årlig avgift"
],
"norwegian": [
"Løpende gebyrer"
],
"malay": [
"Τρέχουσες επιβαρύνσεις"
]
},
"performance_fee": {
"english": [
"Performance Fee",
"Performance Fees",
"performance-based fee",
"performance-related fee",
"Performance- related Fee",
"perform- mance fees",
"per- formance fees",
"with performance",
"with performance fee",
"de Performance"
],
"spanish": [
"Comisión de Gestión sobre Resultados",
"Comisión sobre Resultados",
"Comisión de Rentabilidad",
"Comisiones de éxito",
"Comisión de Éxito",
"Comisión por resultados",
"comisión de rentabilidad",
"comisión de rendimiento"
],
"german": [
"Erfolgsabhängige Vergütung",
"Erfolgsabhängige Verwaltungsvergütung",
"Erfolgsbezogene Vergütung",
"Performancegebühren",
"Performancevergütung",
"Anlageerfolgsprämie",
"Anlageerfolgs-prämie",
"Anlageerfolgs- prämie",
"TER in % (inkl.",
"TER % (inkl.",
"TER in % (exkl.",
"TER % (exkl.",
"TER% (einschließlich",
"TER% (ohne",
"An die Wertentwicklung des Fonds gebundene Gebühren",
"Performancegebühr",
"Performance-gebühr",
"Erfolgshonorare",
"Erfolgsabhän-giger Vergütung",
"Erfolgshonorar",
"Performance-Fee",
"Erfolgsgebühr",
"perfolgsabhängige Verwaltungsvergütung",
"performanceabhängige Vergütung",
"Performance- gebühren"
],
"dutch": [
"Prestatievergoeding"
],
"french": [
"Les commissions de surperformance",
"Commission de performance",
"Commissions de surperformance",
"frais de performance"
],
"swedish": [
"Prestationsbaserad avgift",
"Performance-avgift"
],
"norwegian": [
"prestasjonsgebyr"
],
"italian": [
"Commissioni di performance",
"Commissioni legate al rendimento",
"Commissioni dincentivo"
],
"portuguese": [
"Comissão de desempenho",
"Custos de performance",
"Comissão de Gestão Variável"
],
"estonian": [
"Edukustasud aasta lõikes"
],
"latvian": [
"Gada prēmijas par sasniegtajiem rezultātiem"
],
"lithuanian": [
"Metinis mokestis už veiklos rezultatu"
]
},
"trading expense ratio": {
"english": [
"Trading expense ratio",
"Trading Expense Ratio10"
]
},
"mer": {
"english": [
"Management expense ratio",
"Management expense ratio after taxes",
"Expense ratio"
]
},
"MgtFee": {
"english": [
"Management Fee as % of AAuM",
"Management Fee including GST as % of AAuM",
"Management Fees",
"Management fee inclusive of service tax GST at annualised average rate",
"Management and Trusteeship Fees",
"Investment Management and Trusteeship fees",
"Investment management fees"
]
},
"max_management_fee": {
"english": [
"management fee",
"Periodic Charge",
"Advisory",
"max_management_fee"
]
},
"max_front_load": {
"english": [
"Sales charge",
"subscription fee",
"subscription charge",
"subscription commission",
"sales fee",
"entry fee",
"initial charge",
"preliminary charge",
"preliminary fee",
"Entry Charge",
"Initial Sales Charge",
"max_front_load"
]
},
"min_initial_purchase": {
"english": [
"Minimum Initial Subscription",
"Minimum Subscription",
"Minimum Subscription Amount",
"Minimum initial investment",
"min_initial_purchase"
]
},
"min_subsequent_purchase": {
"english": [
"Minimum Additional",
"Minimum Additional Subscription Amount",
"Minimum initial and subsequence subscription",
"Minimum Additional Subscription",
"Minimum Subsequent Investment",
"Minimum Subsequent Purchase",
"additional",
"min_subsequent_purchase"
]
}
}

View File

@ -0,0 +1,6 @@
{
"tor": "fund_level",
"ogc": "share_level",
"ter": "share_level",
"performance_fee": "share_level"
}

View File

@ -0,0 +1,6 @@
{
"tor": "TOR",
"ogc": "OGC",
"ter": "TER",
"performance_fee": "performance fees"
}

View File

@ -0,0 +1,392 @@
{
"ter": {
"english": [
"Synthetic TER",
"Fund TER",
"TER",
"TFE",
"T.E.R",
"TER_REF",
"Total Expense Ratio",
"Total Expense Ratios",
"Total Fund Charge",
"Gross Expense Ratio",
"Gross Expense Ratios",
"Capped Expense Ratio",
"all-in-fee",
"all in fee",
"Total Net Expense Ratio",
"Total Net Expense Ratios",
"Total Operating Expense",
"Expense Ratio",
"month End Expense ratio",
"Expenses Ratios",
"Weighted AverageExpense Ratio",
"Synthetic total Expense Ratio",
"Annualised TER including performance fees",
"TER (en %) (with performance)",
"Annualised TER % (with fee waiver)"
],
"spanish": [
"Rácio da despesa total",
"Ratio Total de Gastos",
"Ratio de gastos totales",
"Porcentaje de gastos totales",
"Ratio de gastos totales"
],
"german": [
"Mit anteiliger Performance Fee in %",
"TER inkl. Performance-Fee in % **)",
"TER% (einschließlich Anlageerfolgsprämie)",
"TER % (inkl. Anlageerfolgsprämie)",
"Gesamtgebühren",
"Kostenpauschale",
"Gesamtkostenquote",
"Gesamtaufwandsquoten",
"Pauschalvergütung",
"GESAMTKOSTENANTEIL",
"kostenquote",
"Gesamt kostenquote",
"Betriebskostenquote des Fonds",
"Total Expense Ratio in Prozent",
"Annualisierte TER in % (Mit Gebührenverzicht)"
],
"dutch": [
"Totale-kostenpercentage",
"Totale Kostenratio",
"TKR",
"Totale kostenpercentage",
"Totaal kostenpercentage"
],
"french": [
"Le ratio de dépenses totales",
"Total des frais sur encours",
"Ratios des charges totales",
"Frais sur encours",
"RCT",
"Ratios des charges totales",
"Total des frais sur encours",
"Ratio des dépenses totales",
"Ratio de dépenses totales",
"coefficienti di spesa totale",
"Total des commissions et frais imputés à"
],
"finnish": [
"palkkiot yhteensä",
"total expence ratio"
],
"swedish": [
"coefficienti di spesa totale",
"Totalkostnadsandel",
"TER"
],
"danish": [
"Administrationsomk",
"Omkostningsprocent",
"Totalkostnadsandel"
],
"norwegian": [
"Administrationsomk",
"Kostnadsratio",
"Kostnadsratioer for året",
"Omkostningsprocent"
],
"lithuanian": [
"Bendrųjų išlaidų koeficientas",
"Bendrasis metinis išlaidų rodikli"
],
"polish": [
"Współczynnik kosztów całkowitych",
"Bendrųjų išlaidų koeficientas",
"WKC"
],
"latvian": [
"Kopējo izdevumu koeficients",
"KIK",
"Kostnadsratio"
],
"indiakeywords": [
"Expenditure",
"expense ratio",
"ratio of",
"gross expense"
],
"estonian": [
"Kogukulude suhe",
"Kogukulude suhe aasta lõikes"
],
"malay": [
"NPP",
"Nisbah Perbelanjaan Pengurusan"
],
"italian": [
"Commissione forfetaria di gestione",
"Coefficienti di spesa totale",
"Coefficiente di spesa totale"
],
"portuguese": [
"Taxa Global de custos",
"Quocientes de Despesa Total",
"rácios de despesa total"
],
"hungarian": [
"Összes ráfordítás aránya"
]
},
"tor": {
"english": [
"TOR",
"Turnover Ratio",
"Turnover Rate",
"Portfolio Turnover",
"Portfolio turnover ratio",
"Portfolio turnover rate",
"PTR",
"Rotation",
"Annual Portfolio Turnover Ratio",
"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid"
],
"india": [
"Aggregate Value of Purchase and Sale",
"The aggregate value of investments",
"The aggregate value of purchases",
"the aggregate of sales",
"Aggregate value of investments purchased and sold",
"The aggregate value of purchases and sales"
],
"spanish": [
"Rotación de la Cartera",
"Índice de rotación de la cartera",
"Ratio de rotación de la cartera"
],
"german": [
"Umschlagshaufigkeit",
"Portfolioumschlagshäufigkeit",
"Umschlagshäufigkeit",
"Portefeuilleumsatz",
"Umsatz",
"Portfolioumschlagsra",
"Umschlagkennziffer",
"Portfolioumschlag",
"Portfolioumschlagsrate"
],
"dutch": [
"Omloopsnelheid",
"Omloopfactor",
"Omloopsnelheid",
"Turnover van de portefeuille",
"Rotatie van de portefeuille",
"POF",
"Portefeuille omloop factor",
"Taux de rotation - Omloopsnelheid"
],
"french": [
"taux de rotation",
"Taux de rotation du portefeuille",
"Rotation de portefeuille",
"Ratio de rotation du portefeuille"
],
"finnish": [
"salkun kiertonopeus"
],
"swedish": [
"Omsättningshastighet"
],
"danish": [
"Omsætningshastighed"
],
"norwegian": [
"Omløpshastighet"
],
"indiakeywords": [
"Aggregate value"
],
"malay": [
"PGP",
"Pusing Ganti Portfolio"
],
"italian": [
"Tasso di movimentazione del portafoglio",
"Turnover del portafoglio",
"Indice di Rotazione del Portafoglio"
],
"portuguese": [
"Rotação média da carteira",
"Índice de Movimentação da Carteira de Investimento"
]
},
"ogc": {
"english": [
"Synthetic Ongoing Charges excluding",
"On-going Charge",
"Ongoing Charge",
"ongoing charges",
"On-going Fee",
"Ongoing fee",
"OGC",
"OGF",
"OCF",
"Operation Charge",
"On Going Charges",
"OC",
"Ongoing Charge Figure OCF",
"OCF Cap Rate",
"Ongoing Fund Charge",
"Operating Charge",
"Operating Charges",
"Operating, Administrative and Servicing Expenses"
],
"spanish": [
"Gastos Corrientes",
"Gastos Recurrentes",
"Gastos corrientes en porcentaje",
"Gastos Corrientes 1)",
"Gastos Recurrentes 2)",
"Gastos corrientes en porcentaje 3)"
],
"german": [
"Ongoing Charges inkl.Performance-Fee in % **)",
"Laufende Kosten",
"Laufende Kosten in Prozent",
"Laufende Kosten 1)",
"Laufende Kosten in Prozent 2)",
"Laufende Gebühren",
"Betriebskosten",
"Betriebsgebühren",
"custos correntes",
"Betriebliche Aufwendungen"
],
"dutch": [
"Lopende kosten",
"Lopende kosten factor",
"Lopende kosten in procent",
"Lopende kosten factor 1)",
"Lopende kosten in procent 2)",
"LKF",
"custos correntes",
"OCF",
"Lopende kosten factor"
],
"french": [
"Frais courants",
"Frais courants exprimés en pourcentage",
"Frais courants 1)",
"Frais courants exprimés en pourcentage 2)",
"Commission de frais opérationels"
],
"italian": [
"Spese Correnti",
"Durante il funzionamento Addebiti",
"Spese Correnti 1)",
"Durante il funzionamento Addebiti 2)"
],
"portuguese": [
"Encargos Correntes",
"Custos correntes",
"Encargos Correntes 1)",
"Custos correntes 2)"
],
"swedish": [
"Årliga avgifter",
"pågående avgifter",
"Årliga avgifter 1)",
"pågående avgifter 2)"
],
"danish": [
"Årlig avgift",
"Årlig avgift 1)"
],
"norwegian": [
"Løpende gebyrer",
"Løpende gebyrer 1)"
],
"malay": [
"Τρέχουσες επιβαρύνσεις",
"Τρέχουσες επιβαρύνσεις 2)"
]
},
"performance_fee": {
"english": [
"Performance Fee",
"Performance Fees",
"performance-based fee",
"performance-related fee"
],
"spanish": [
"Comisión de Gestión sobre Resultados",
"Comisión sobre Resultados",
"Comisión de Gestión sobre Resultados 1)",
"Comisión sobre Resultados 2)",
"Comisión de Rentabilidad",
"Comisiones de éxito",
"Comisión de Éxito",
"Comisión por resultados",
"comisión de rentabilidad",
"comisión de rendimiento"
],
"german": [
"Erfolgsabhängige Vergütung",
"Erfolgsabhängige Verwaltungsvergütung",
"Erfolgsbezogene Vergütung",
"Performancegebühren",
"Erfolgsbezogene Vergütung 1)",
"Performancevergütung in Prozent",
"Performancevergütung in Prozent 2)",
"Anlageerfolgsprämie",
"An die Wertentwicklung des Fonds gebundene Gebühren",
"Performancegebühr",
"Erfolgshonorare",
"Erfolgshonorar",
"Erfolgsgebühr",
"perfolgsabhängige Verwaltungsvergütung",
"performanceabhängige Vergütung"
],
"dutch": [
"Prestatievergoeding",
"Prestatievergoeding 1)"
],
"french": [
"Les commissions de surperformance",
"Commission de performance",
"Commissions de surperformance",
"frais de performance",
"Commission de performance exprimée en pourcentage 2)"
],
"swedish": [
"Prestationsbaserad avgift",
"Performance-avgift",
"Prestationsbaserad avgift 1)",
"Performance-avgift 2)"
],
"norwegian": [
"prestasjonsgebyr",
"prestasjonsgebyr 1)"
],
"italian": [
"Commissioni di performance",
"Commissioni legate al rendimento",
"Commissioni di performance 1)",
"Commissioni legate al rendimento 2)",
"Commissioni dincentivo"
],
"portuguese": [
"Comissão de desempenho",
"Custos de performance",
"Comissão de desempenho 1)",
"Custos de performance 2)",
"Comissão de Gestão Variável"
],
"estonian": [
"Edukustasud aasta lõikes",
"Edukustasud aasta lõikes 1)"
],
"latvian": [
"Gada prēmijas par sasniegtajiem rezultātiem",
"Gada prēmijas par sasniegtajiem rezultātiem 1)"
],
"lithuanian": [
"Metinis mokestis už veiklos rezultatu",
"Metinis mokestis už veiklos rezultatu 2)"
]
}
}

View File

@ -0,0 +1,2 @@
{
}

View File

@ -0,0 +1,30 @@
{
"CAN": {
"ar": [
"mer",
"tor",
"trading expense ratio"
]
},
"IND": {
"ar": [
"ter",
"MgtFee",
"tor"
]
},
"default": {
"ar": [
"tor",
"ter",
"ogc",
"performance_fee"
],
"prospectus": [
"max_management_fee",
"max_front_load",
"min_initial_purchase",
"min_subsequent_purchase"
]
}
}

View File

@ -0,0 +1,22 @@
{
"0L00000122": "english",
"0LMIX00001": "english",
"0LMIX00002": "english",
"0L00000482": "english",
"0LMIX00003": "german",
"0L00000152": "german",
"0L00000114": "dutch",
"0L00000138": "french",
"0L00000203": "italian",
"0L00000408": "spanish",
"0L00000348": "portuguese",
"0L00000135": "finnish",
"0L00000415": "swedish",
"0L00000104": "danish",
"0L00000320": "norwegian",
"0L00000254": "lithuanian",
"0L00000347": "polish",
"0L00000250": "latvian",
"0L00000127": "estonian",
"0L00000273": "malay"
}

View File

@ -0,0 +1,3 @@
{
"apply_pdf2html": false
}

View File

@ -0,0 +1,7 @@
{
"objective_strategy":
{
"start": "\\n[0-9\\W\\s]*(investment\\s*objective|objective|fund\\s*objective|investment\\s*objective(s)?\\s*(and|\\&)\\s*(policy|policies|investment)|Investment\\s*(Policy|policies)\\s*and\\s*Objective(s)?\\s*of\\s*the\\s*Trust|investment\\s*objective(s)?\\s*(and|\\&)\\s*policy\\W*and\\s*investment\\s*restriction|Investment\\s*Objective\\s*and\\s*Investment\\s*Policy\\s*and\\s*Strategy|What\\s*the\\s*Fund\\s*Aims\\s*to Deliver\\s*(\\WFund\\s*Objective\\W)?)(s)?(\\W)*\\s*\\n",
"end": "\\n[0-9\\W\\s]*(uk\\s*ucits\\s*investment\\s*and\\s*borrowing\\s*powers|risk\\s*consideration|risk\\s*factor|fund\\s*risk|investor(s)?\\s*profile|final\\s*accounting\\s*date|dealing\\s*cut\\s*off\\s*point|cut\\s*off\\s* point|class(es)?\\s*of\\s*share(s)?\\s*available|class(es)?\\s*of\\s*share(s)?\\s*which\\s*may\\s*be\\s*issue(d)?|manager.*charge|investment\\s*style|profile\\s*of\\s*the\\s*typical\\s*investor|typical\\s*investor(s)?\\s*profile|accounting\\s*reference\\s*date.*|specific\\s*fund\\s*risk\\s*factor|change(s)?\\s*to\\s*the\\s*investment\\s*objective\\s*and(\\/or)?\\s*investment\\s*policy|accounting\\s*and\\s*record\\s*date|share\\s*class(es)?\\s*established\\s*as\\s*at\\s*the\\s*date\\s*of\\s*this\\s*prospectus|isa|class(es)?\\s*for\\s*investment\\s*in\\s*the\\s*catholic\\s*investment\\s*fund|fund\\s*detail|derivative(s)?\\s*and\\s*technique|investment\\s*(restriction|approach)|Tracking\\s*Error|Characteristics\\s*of\\s*the\\s*Trust|investment\\s*style|Limit\\s*on\\s*investment\\s*in\\s*other\\s*collective\\s*investment\\s*scheme|Participation\\s*in\\s*the\\s*Fund|Initial\\s*Charge|other|Additional\\s*Information)(s)?(\\W)*\\s*\\n"
}
}

View File

@ -0,0 +1,134 @@
Instructions:
Please read the image carefully.
1. Identify the text in the PDF page image.
The text will be as output with key: "text".
2. Identify and format all of the tables in the PDF page image.
Table contents should be as markdown format,
ensuring the table structure and contents are exactly as in the PDF page image.
The format should be: |Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|
Each cell in the table(s) should be in the proper position of relevant row and column.
The markdown table(s) will be as output with key: "table_contents".
3. Extract data from the text and table(s) contents parsed above.
3.1 Use the text and table(s) contents parsed above as context.
3.2 Data Extraction from parsed table contents
Maybe there are TOR, TER, performance fees, OGC data in the context.
The TOR reported name could be:
TOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc.
The TER reported name could be:
TER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, etc.
The performance fees reported name could be:
performance fees, performance fees ratio, Performance, etc.
The OGC reported name could be:
OGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.
Data business features:
1. In most cases, the data is in the table(s) of context.
2. TOR is fund level data.
- The full fund name should be main fund name + sub-fund name, e.g., main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.
- The sub-fund name may be as the first column values in the table.
3. TER, performance fees, OGC are share class level data.
4. Their values are percentages.
- The TER, performance fees, OGC values should be less than 100.
- The TOR value could be more than 100, e.g. 126.33.
- The TOR and performance fees could be negative number, e.g. -7.99.
5. Special TER rule
5.1 If there are multiple TER value columns, here is the priority rules:
- With "TER with Performance Fee" and "Fund TER", pick up the values from "TER with Performance Fee".
- With "TER including Performance Fee" and "TER excluding Performance Fee", pick up the values from "TER including Performance Fee".
- With both of "Synthetic TER" and "Fund TER", if "Synthetic TER" with value(s), pick up the value(s) from "Synthetic TER", otherwise, pick up the value(s) from "Fund TER".
- With both of "Net TER (including reimbursement)" and "Capped Expense Ratio", the priority is "Capped Expense Ratio", please exclude the column: "Net TER (including reimbursement)", only pick up the values from "Capped Expense Ratio".
5.2 Please ignore TER values that are reported "with the exception of performance fees" or "excluding performance fees".
6. If with multiple data values in same row, please extract the latest.
7. Only output the values which have significant reported names.
- Please exclude below reported names and relevant values: "Management Fees", "Management", "Management Fees p.a.", "Taxe d Abonnement in % p.a.".
DON'T EXTRACT MANAGEMENT FEES!
8. One fund could be with multiple share classes and relevant TER, performance fees or OGC values.
Special cases:
1. Performance fees is part of TER.
If exist both of "TER including performance fees" and "TER excluding performance fees",
The TER should be "TER including performance fees".
The performance fees should be:
TER including performance fees - TER excluding performance fees.
Here is the example:
GAMAX FUNDS FCP\nClass\nTER (excluding Performance Fees)\nTER (including Performance Fees)\nGAMAX FUNDS - ASIA PACIFIC\nA\n2.07%\n2.07%\n
The output should be:
[
{"fund name": "GAMAX FUNDS - ASIA PACIFIC", "share data": [{"share name": "A", "ter": 2.07, "performance fees": 0}]}
]
The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0
2. Combo TER value table.
2.1 Exist Feeder fund TER and Master fund TER.
The relevant table header is like this:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
Please output separately as below:
- "feeder fund share class" and "TER feeder" values
- "Master fund" and "TER Master" values
Here is the example:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
The output should be:
[
{"fund name": "Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class TI", "ter": 0.1475}]},
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": [{"share name": "Balanced Class X", "ter": 0.7025}]}
]
3. Latest data with time series data
Some data table is with multiple date columns, please extract the data from the latest date column:
- Get dates from column header.
- Only extract data from the columns whose column header is the latest date.
The latest date-time column usually is the first "TOR, TER, performance fees, OGC" value column.
Here is the example:
PERFORMANCE\nHISTORICAL PERFORMANCE\nHISTORICAL PERFORMANCE\nFrom \n1 July \nFrom \n19 July \nFrom \n1 January \nFrom \n27 April \nFrom \n19 July \nFrom \n1 January \n2021\nFrom \n22 May \n2021\nFrom \n16 July \n2021\nFrom \n21 September \n2021\nto 30 June 2023\nto 31 December 2022\nto 31 December 2021\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\n6.73%\n \n-13.32%\n \n \n 6.04%\n \n \n \n
The output should be:
[
{"fund name": "Asia Total Return Fund", "share data": [{"share name": "Class I5 (CHF Hedged) Acc", "performance fees": 6.73}]}
]
The keyword for performance fees is PERFORMANCE, the value 6.73 is the first number with the latest date-time.
4. TER reported name priority
If both Expense Ratio and Synthetic total Expense Ratio exist, please extract the value of Synthetic total Expense Ratio.
Output requirement:
1. If possible, please extract fund name, share name, TOR, TER, performance fees, OGC values as the output.
2. The required output items are "fund name" and "share name".
3. Only output the data points which have a relevant value.
4. fund level data: ("fund name" and "TOR") and share level data: ("fund name", "share name", "ter", "performance fees", "ogc") should be output separately.
5. The output should be JSON format, the format is like:
[{
"fund name": "fund 1",
"TOR": 35.26
},
{
"fund name": "fund 2",
"TOR": -28.26
},
{
"fund name": "fund 3",
"TOR": 115.52
},
{
"fund name": "fund 1",
"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2, "ogc": 0.05},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2, "ogc": 1.16}]
},
{
"fund name": "fund 2",
"share data": [{"share name": "share a", "ter": 1.16, "performance fees": -0.15},{"share name": "share b", "ter": 1.45}]
},
{
"fund name": "fund 3",
"share data": [{"share name": "share a", "performance fees": 0.57, "ogc": 0.18},{"share name": "share b", "performance fees": -0.11}]
}]
Only output JSON data.
Don't output values which do not exist in the context, especially for the fund level datapoint: TOR.
If can't find share class name in context, please output empty JSON data: []
Answer:

View File

@ -0,0 +1,161 @@
Context:
{page_text}
Instructions:
Read the context carefully.
Maybe there are TOR, TER, performance fees, OGC data in the context.
The TOR reported name could be:
TOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc.
The TER reported name could be:
TER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, etc.
The performance fees reported name could be:
performance fees, performance fees ratio, Performance, etc.
The OGC reported name could be:
OGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.
Data business features:
1. Most of cases, the data is in the table(s) of context.
2. TOR is fund level data.
- The full fund name should be main fund name + sub-fund name, e.g. main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.
- The sub-fund name may be as the first column values in the table.
3. TER, performance fees, OGC are share class level data.
4. Their values are belong to percentage number.
- The TER, performance fees, OGC values should be less than 100.
- The TOR value could be more than 100, e.g. 126.33.
- The TOR and performance fees could be negative number, e.g. -7.99.
5. Special TER rule
5.1 If there are multiple TER value columns, here is the priority rules:
- With "TER with Performance Fee" and "Fund TER", pick up the values from "TER with Performance Fee".
- With "TER including Performance Fee" and "TER excluding Performance Fee", pick up the values from "TER including Performance Fee".
- With both of "Synthetic TER" and "Fund TER", if "Synthetic TER" with value(s), pick up the value(s) from "Synthetic TER", otherwise, pick up the value(s) from "Fund TER".
- With both of "Net TER (including reimbursement)" and "Capped Expense Ratio", the priority is "Capped Expense Ratio", please exclude the column: "Net TER (including reimbursement)", only pick up the values from "Capped Expense Ratio".
5.2 Please ignore TER values which with the exception of performance fees or excluded performance fees.
6. If with multiple data values in same row, please extract the latest.
7. Only output the values which with significant reported names.
- Please exclude below reported names and relevant values: "Management Fees", "Management", "Management Fees p.a.", "Taxe d Abonnement in % p.a.".
DON'T EXTRACT MANAGEMENT FEES!
8. One fund could be with multiple share classes and relevant TER, performance fees or OGC values.
Special cases:
1. Performance fees is part of TER.
If exist both of "TER including performance fees" and "TER excluding performance fees",
The TER should be "TER including performance fees".
The performance fees should be:
TER including performance fees - TER excluding performance fees.
Here is the example:
GAMAX FUNDS FCP\nClass\nTER (excluding Performance Fees)\nTER (including Performance Fees)\nGAMAX FUNDS - ASIA PACIFIC\nA\n2.07%\n2.07%\n
The output should be:
{
"data":
[
{"fund name": "GAMAX FUNDS - ASIA PACIFIC", "share name": "A", "ter": 2.07, "performance fees": 0}
]
}
The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0
2. Combo TER value table.
2.1 Exist Feeder fund TER and Master fund TER.
The relevant table header is like this:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal
Please output separately as below:
- "feeder fund share class" and "TER feeder" values
- "Master fund" and "TER Master" values
Here is the example:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
The output should be:
{
"data":
[
{"fund name": "Global Portfolio Solution DKK", "share name": "Balanced Class TI", "ter": 0.1475},
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share name": "Balanced Class X", "ter": 0.7025}
]
}
3. Latest data with time series data
Some data table is with multiple date columns, please extract the data from the latest date column:
- Get dates from column header.
- Only extract data from the columns which column header is as the latest date.
The latest date-time column usually is the first "TOR, TER, performance fees, OGC" value column.
Here is the example:
PERFORMANCE\nHISTORICAL PERFORMANCE\nHISTORICAL PERFORMANCE\nFrom \n1 July \nFrom \n19 July \nFrom \n1 January \nFrom \n27 April \nFrom \n19 July \nFrom \n1 January \n2021\nFrom \n22 May \n2021\nFrom \n16 July \n2021\nFrom \n21 September \n2021\nto 30 June 2023\nto 31 December 2022\nto 31 December 2021\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\n6.73%\n \n-13.32%\n \n \n 6.04%\n \n \n \n
The output should be:
{
"data":
[
{"fund name": "Asia Total Return Fund", "share name": "Class I5 (CHF Hedged) Acc", "performance fees": 6.73}
]
}
The keyword for performance fees is PERFORMANCE, the value 6.73 is the first number with the latest date-time.
4. TER reported name priority
If exists both of Expense Ratio and Synthetic total Expense Ratio, please extract the value of Synthetic total Expense Ratio.
Output requirement:
1. If possible, please extract fund name, share name, TOR, TER, performance fees, OGC values as the output.
2. If find share name, and exist relevant currency, please output share name + currency, e.g. share name is "Class A", currency is "USD", the output share name should be: "Class A USD".
3. Only output the data points which have a relevant value.
4. fund level data: ("fund name" and "TOR") and share level data: ("fund name", "share name", "ter", "performance fees", "ogc") should be output separately.
5. The output should be JSON format, the format is like:
{
"data":
[{
"fund name": "fund 1",
"TOR": 35.26
},
{
"fund name": "fund 2",
"TOR": -28.26
},
{
"fund name": "fund 3",
"TOR": 115.52,
},
{
"fund name": "fund 1",
"share name": "share 1",
"ter": 1.23,
"performance fees": 0.2,
"ogc": 0.05
},
{
"fund name": "fund 1",
"share name": "share 2",
"ter": 2.56,
"performance fees": 1.2,
"ogc": 1.16
},
{
"fund name": "fund 2",
"share name": "share a",
"ter": 1.16,
"performance fees": -0.15
},
{
"fund name": "fund 2",
"share name": "share b",
"ter": 1.45
},
{
"fund name": "fund 3",
"share name": "share a",
"performance fees": 0.57,
"ogc": 0.18
},
{
"fund name": "fund 3",
"share name": "share b",
"performance fees": -0.11
}
]
}
Only output JSON data.
Don't output the value which not exist in context, especially for fund level datapoint: TOR.
If can't find share class name in context, please output empty JSON data: {"data": []}
Answer:

View File

@ -0,0 +1,387 @@
{
"summary": "Read the context carefully.\nMaybe exists {} data in the context.\n",
"summary_image": "Read the image carefully.\nMaybe exists {} data in the image.\n",
"get_image_text": "Instructions:\nYou are given an image of a page from a PDF document. Extract **all visible text** from the image while preserving the original order, structure, and any associated context as closely as possible. Ensure that:\n\n1. **All textual elements are included**, such as headings, body text, tables, and labels.\n2. **Numerical data, symbols, and special characters** are preserved accurately.\n3. Text in structured formats (e.g., tables, lists) is retained in a logical and readable format.\n4. Any text embedded in graphical elements, if clearly readable, is also included.\n5. The text is clean, readable, and free of formatting artifacts or errors.\n\nDo not include non-textual elements such as images or graphics unless they contain text that can be meaningfully extracted.\n\n### Output Format:\nOutput the result as JSON format, here is the example: \n{\"text\": \"Text from image\"}\n\nAnswer: \n[Extracted Text Here, retaining logical structure and all content]",
"image_features":
[
"1. Identify the text in the PDF page image.",
"2. Identify and format the all of tables in the PDF page image.",
"Table contents should be as markdown format,",
"ensuring the table structure and contents are exactly as in the PDF page image.",
"The format should be: |Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|",
"Each cell in the table(s) should be in the proper position of relevant row and column.",
" 3. Extract data from upon parsed text and table(s) contents.",
"3.1 The upon parsed text and table(s) contents as context.",
"3.2 Please extract data from the context."
],
"reported_name": {
"tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, Taux de rotation corrigé - Gecorrigeerde omloopsnelheid, etc.",
"ogc": "The OGC reported name could be:\nOGC, OGF, OCF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, OCF Cap Rate, Ongoing Charges Figure, Frais courants, Lopende kosten, Laufende Kosten, Årliga avgifter, Laufende Gebühren, Gastos Corrientes, Gastos Recurrentes, etc.",
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), Net TER, Total Expense Ratio in Prozent, Annualisierte TER in % (Mit Gebührenverzicht), Annualised TER % (with fee waiver), kostenquote, Gesamt kostenquote, etc.",
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."
},
"multilingual_reported_name": {
"describe": "Please be careful to extract relevant data from multilingual Context.",
"regular_example_template": "{datapoint} Example {number}:\nLanguage: {language}\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name}\n{value}\n---Context End-----\nAnswer: {answer}",
"special_example_template_none": "{datapoint} Example {number}:\nLanguage: {language}\nIf value is belong to \"-, *, **, N/A, N/A%, N/A %, NONE\", ignore it\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name} 2)\n-\n---Context End-----\nAnswer: {answer}",
"value_examples": ["1,98", "3.25", "2.16", "1,73", "4,53"],
"fund_example": "Fund 1",
"share_example": "Share 1"
},
"data_business_features": {
"common": [
"General rules:",
"- Most of cases, the data is in the table(s) of context.",
"- Fund name: ",
"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.",
"b. The sub-fund name may be as the first column or first row values in the table.",
"b.1 fund name example:",
"---- Example Start ----",
"Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
"---- Example End ----",
"Fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
"\n",
"c. If with multiple fund names in context, please retrieve the fund name closest above the numerical value.",
"c.1 fund name example:",
"---- Example Start ----",
"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26% \n1.29%",
"---- Example End ----",
"Correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
"\n",
"- Only extract the latest data from context:",
"If with multiple data values in same row, please extract the latest.",
"\n",
"- Reported names:",
"Only output the values which with significant reported names.",
"Please exclude below reported names and relevant values: \"Management Fees\", \"Management\", \"Management Fees p.a.\", \"Taxe d Abonnement in % p.a.\".\nDON'T EXTRACT MANAGEMENT FEES!",
"One fund could be with multiple share classes and relevant share class level data values."
],
"investment_level": {
"tor": "TOR is fund level data.",
"ogc": "OGC is share class level data",
"ter": "TER is share class level data.",
"performance_fee": "Performance fees is share class level data."
},
"data_value_range": {
"tor": "TOR is belong to percentage number, the value could be more than 100, e.g. 126.33.\nTOR could be negative number, e.g. -7.99",
"ogc": "OGC is belong to percentage number, the value should be less than 100.",
"ter": "TER is belong to percentage number, the value should be less than 100.",
"performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.\nPerformance fees could be negative number, e.g. -0.56"
},
"special_rule": {
"tor": [
"If there are multiple TOR reported names, here is the priority rules:",
"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation - Omloopsnelheid\".",
"- With \"Omloopsnelheid\" and \"Gecorrigeerde omloopsnelheid\", pick up the values from \"Omloopsnelheid\"."
],
"ter": [
"If there are multiple TER value columns, here is the priority rules:",
"- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".",
"- With \"TER (en %) (with performance)\" and \"TER(en %) (without performance)\", pick up the values from \"TER (en %) (with performance)\".",
"- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".",
"- With \"TER inkl. Performance-Fee in % **)\" and \"TER exkl. Performance-Fee in % **)\", pick up the values from \"TER inkl. Performance-Fee in % **)\".",
"- With \"TER inkl. Performance-Fee in % **)\" and \"TER inkl. Performance-Fee in % (inkl. Zielfonds)\", pick up the values from \"TER inkl. Performance-Fee in % **)\".",
"- With \"Mit anteiliger Performance Fee in %\" and \"Ohne anteilige Performance-Fee in %\", pick up the values from \"Mit anteiliger Performance Fee in %\".",
"- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".",
"- With both of \"Net TER\" and \"Capped Expense Ratio\", the priority is \"Net TER\", please exclude the column: \"Capped Expense Ratio\", only pick up the values from \"Net TER\".",
"- With \"Gross TER\", \"Waiver\", \"Net TER\", \"Capped Expense Ratio\" as column titles, pick up the values from \"Net TER\".",
"- If exist Gross TER as column title, please ignore this title",
"Please ignore TER values which with the exception of performance fees or excluded performance fees."
],
"ogc": [
"If there are multiple OGC value columns, here is the priority rules:",
"- With \"Ongoing Charges inkl. Performance-Fee in % **)\" and \"Ongoing Charges exkl. Performance-Fee in % **)\", pick up the values from \"Ongoing Charges inkl. Performance-Fee in % **)\".",
"- With \"Ongoing Charges inkl. Performance-Fee in % **)\" and \"Ongoing Charges inkl. Performance-Fee in % (inkl. Zielfonds)\", pick up the values from \"Ongoing Charges inkl. Performance-Fee in % **)\"."
],
"performance_fee": [
"The performance fees should not be the presence of the rates at which the performance fees are calculated.",
"The reported of performance fees should not be \"% based on the NAV at the end of the accounting period\""
]
}
},
"special_cases": {
"common": [
{
"title": "Latest data with time series data:",
"contents": [
"Case 1:",
"Some data table is with multiple date columns, please extract the data from the latest date column:",
"- Get dates from column header.",
"- Only extract data from the columns which column header is as the latest date.",
"-- commone case",
"The latest date-time column usually is the first datapoint value column.",
"-- special case",
"If with several value columns with same latest date, if one of these column titles with \"(c)\", please extract the data from this column.",
"---Example 1 Start---",
"Columns: \"For the year ended 31 Dec 23\", \"For the year ended 31 Dec 23\", \"For the year ended 31 Dec 23 (a)\", \"For the year ended 31 Dec 23 (b)\", \"For the year ended 31 Dec 23 (c)\", \"For the year ended 31 Dec 22\", \"For the year ended 31 Dec 21\", please extract the data from \"For the year ended 31 Dec 23 (c)\" column.",
"---Example 1 End---",
"---Example 2 Start---",
"Columns: \"For the period ended 31 Dec 23\", \"For the period ended 31 Dec 23\", \"For the period ended 31 Dec 23 (a)\", \"For the period ended 31 Dec 23 (b)\", \"For the period ended 31 Dec 23 (c)\", \"For the period ended 31 Dec 22\", \"For the period ended 31 Dec 21\", please extract the data from \"For the period ended 31 Dec 23 (c)\" column.",
"---Example 2 End---",
"More examples for extracting data from the latest date column:",
"-----Example Start-----",
"performance fees\\nhistorical performance fees\\nhistorical performance fees\\nFrom \\n1 July \\nFrom \\n19 July \\nFrom \\n1 January \\nFrom \\n27 April \\nFrom \\n19 July \\nFrom \\n1 January \\n2021\\nFrom \\n22 May \\n2021\\nFrom \\n16 July \\n2021\\nFrom \\n21 September \\n2021\\nto 30 June 2023\\nto 31 December 2022\\nto 31 December 2021\\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\\n1.73%\\n \\n-1.32%\\n \\n \\n 2.04%\\n \\n \\n \\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Asia Total Return Fund\", \"share name\": \"Class I5 (CHF Hedged) Acc\", \"performance_fee\": 1.73}]}",
"The keywords are performance fees, the value 1.73 is the first number with the latest date-time.",
"Case 2:",
"Some table with messy text as header, please extract the data from the first 1 - 2 data value columns:",
"Example context:",
"-----Example Start-----",
"1RWHV WR WKH ILQDQFLDO VWDWHPHQWV Notes aux tats financiers\nLO Funds - 30/09/2023\n678 \n,6,1 &RGH \n6XE )XQGV \n6KDUH &ODVV \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \nCompartiments \nClasse \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de la \nComm. de \nPerformance \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de \nla Comm. de \nPerformance \n \f \n \f \n \f \n \f \n \f \n \f \n\b \n\b \n\b \n\b\n\b\n\b\nLU2376083999 \nTerreNeuve \nN A EUR SH X1 \n1.60 \n1.61 \n0.01 \n1.58 \n1.58 \n- \nLU1858044701 \nTerreNeuve \nN D GBP SH \n1.85 \n1.85 \n- \n1.84 \n1.86 \n- \n",
"-----Example End-----",
"Although the table is with messy text as header, but the latest date columns are the first 2 value columns, they are \"TER du Fonds\" and \"TER avec \nComm. de \nPerformance4\".",
"The TER value is from TER avec \nComm. de \nPerformance4, the performance fees value is from \"TER avec \nComm. de \nPerformance4\" - \"TER du Fonds\", e.g. 1.61 - 1.60 = 0.01, 1.85 - 1.85 = 0.",
"The output should be:",
"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}",
"Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).",
"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns.",
"Case 3:",
"If the value of column with latest date is N/A or -, please ignore.",
"-----Example Start-----",
"I-class income shares\n\n31.10.22\n30.04.22\n30.04.21\n30.04.20\n\npence per share\npence per share\npence per share\npence per share\nOther information\nOperating charges**\nN/A\n—\n0.90%\n0.90%",
"-----Example End-----",
"The output should be:",
"{\"data\": []}"
]
},
{
"title": "Don't fetch data with \"up to\" statement",
"contents":[
"If the value is with \"up to\" statement, please ignore the value.",
"Example 1:",
"-----Example Start-----",
"A-Class\nB-Class\nC-Class\n",
"TER\nUp to 1.00%\nUp to 1.20%\nUp to 1.50%\n",
"-----Example End-----",
"The output should be:",
"{\"data\": []}",
"Example 2:",
"-----Example Start-----",
"A-Aktien\nB-Aktien\nC-Aktien\n",
"TER\nbis zu 1,20 % p.a.\nbis zu 2,20 % p.a.\nbis zu 1,00 % p.a.\n",
"-----Example End-----",
"The output should be:",
"{\"data\": []}"
]
}
],
"ter": [
{
"title": "Combo TER value table:",
"contents": [
"Exist Feeder fund TER and Master fund TER.",
"The relevant table header is like this:",
"Feeder fund (share class)\\nMaster fund\\nTER\\nFeeder\\nTER Master\\nTotal",
"Please output separately as below:",
"- \"feeder fund share class\" and \"TER feeder\" values",
"- \"Master fund\" and \"TER Master\" values",
"Here is the example:",
"-----Example Start-----",
"Feeder fund (share class)\\nMaster fund\\nTER\\nFeeder\\nTER Master\\nTotal\\nGlobal Portfolio Solution DKK -\\nBalanced Class TI\\nDanske Invest SICAV Global Portfolio\\nSolution Balanced Class X\\n0.1475%\\n0.7025%\\n0.850%\\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Global Portfolio Solution DKK\", \"share name\": \"Balanced Class TI\", \"ter\": 0.1475}, {\"fund name\": \"Danske Invest SICAV Global Portfolio Solution DKK\", \"share name\": \"Balanced Class X\", \"ter\": 0.7025}]}"
]
},
{
"title": "TER reported name priority:",
"contents": [
"If exists both of Expense Ratio and Synthetic total Expense Ratio, please extract the value of Synthetic total Expense Ratio."
]
}
],
"performance_fee": [
{
"title": "Performance fees is part of TER:",
"contents": [
"Case 1:",
"If exist both of \"TER including performance fees\" and \"TER excluding performance fees\",",
"The TER should be \"TER including performance fees\".",
"The performance fees should be:",
"TER including performance fees - TER excluding performance fees.",
"Here is the example:",
"Example 1:",
"-----Example Start-----",
"GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance_fee\": 0}]}",
"The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0",
"Example 2:",
"-----Example Start-----",
"D/S Strategie ausgewogen\\nErgänzende Angaben für Anleger in der Schweiz zum 31. Dezember 2020 (ungeprüft)\\nFonds\\nTER exkl. \\nPerformance-Fee in % **)\\nTER inkl. \\nPerformance-Fee in % **)\\nTER inkl. \\nPerformance-Fee in % (inkl. Zielfonds)\\n1,15\\n1,63\\n1,15\\n1,63\\nTER exkl.\\nPerformance-Fee in % (inkl. Zielfonds)",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}",
"The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0",
"Example 3:",
"-----Example Start-----",
"TER % \n(inkl. \nAnlageerfolgsprämie)\nTER %\n(exkl. \nAnlageerfolgsprämie)\nPIANO 400 Fund\n0,58 %\n0,58 %\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
"The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
"Example 4:",
"-----Example Start-----",
"Fonds \nTER % \n(einschließlich \nAnlageerfolgs- \nprämie) \nTER % \n(ohne \nAnlageerfolgs-\nprämie) \ndb x-trackers EUR Liquid Corporate 12.5 UCITS ETF \n \n \nKlasse 1C \n0,35 % \n0,35 %",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"db x-trackers EUR Liquid Corporate 12.5 UCITS ETF\", \"share name\": \"Klasse 1C\", \"ter\": 0.35, \"performance_fee\": 0}]}",
"The performance fees value is TER % (einschließlich Anlageerfolgsprämie) - TER % (ohne Anlageerfolgsprämie) = 0,35 - 0,35 = 0",
"or TER % (einschließlich Anlageerfolgs- \nprämie) - TER % (ohne Anlageerfolgs- \nprämie) = 0,35 - 0,35 = 0",
"Case 2:",
"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
"The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",
"But it's incorrect, according to this issue, please still extract performance fees from \"TER including performance fees\" - \"TER excluding performance fees\".",
"To make sure performance fees is with actual value.",
"Case 3:",
"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"SYNTHETIC TER\", ",
"The performace fee value is still \"TER including performance fees\" - \"TER excluding performance fees\", ",
"For this scenario, please ignore the \"SYNTHETIC TER\" column.",
"Here is the example:",
"-----Example Start-----",
"As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}",
"Attention: Please always output performance fee value including 0 after calculation as (TER including performance fees - TER excluding performance fees), although the value is 0, but it's with actual value."
]
},
{
"title": "Performance fees is part of OGC:",
"contents": [
"If exist both of \"Ongoing Charges including Performance Fee\" and \"Ongoing Charges excluding Performance Fee\",",
"The OGC should be \"Ongoing Charges including Performance Fee\".",
"The performance fees should be:",
"Ongoing Charges including Performance Fee - Ongoing Charges excluding Performance Fee.",
"Here is the example:",
"Example 1:",
"-----Example Start-----",
"GAMAX FUNDS FCP\\nClass\\Ongoing Charges (excluding Performance Fees)\\Ongoing Charges (including Performance Fees)\\nFund 1\\nShare A\\n1.50%\\n1.58%\\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Fund 1\", \"share name\": \"Share 1\", \"ogc\": 1.58, \"performance_fee\": 0.08}]}",
"The performance fees value is Ongoing Charges including Performance Fee - Ongoing Charges excluding Performance Fee = 1.58 - 1.50 = 0.08",
"Example 2:",
"-----Example Start-----",
"Fund1\\nOngoing Charges exkl. \\nPerformance-Fee in % **)\\nOngoing Charges exkl. \\nPerformance-Fee in % (inkl. Zielfonds)\\nOngoing Charges inkl. \\nPerformance-Fee in % **)\\nOngoing Charges inkl. \\nPerformance-Fee in % (inkl. Zielfonds)\\n1,15\\n1,35\\n1,20\\n1,35\\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Fund1\", \"ogc\": 1.20, \"performance_fee\": 0.05}]}",
"The performance fees value is Ongoing Charges inkl. Performance-Fee in % **) - Ongoing Charges exkl. Performance-Fee in % **) = 1.20 - 1.15 = 0.05"
]
}
],
"tor": [
{
"title": "TOR with TER and multiple years:",
"contents": [
"TOR and TER are in same table and with multiple years, please extract the TER and TOR value from the latest year column.",
"---Example 1 Start---",
"APPENDIX 1 TOTAL EXPENSE RATIOS AND PORTFOLIO TURNOVER RATIOS\nTotal Expense Ratios are based on the trading 12 months preceding the dates listed below. \nTER \nPTR* \nFor the period/year ended \n2024\n2023\n2024\n2023\nYacktman \nClass A US$ \n1.70%\n1.71%\nTotal Sub-Fund \n(5.94)%\n(5.57)%\nDriehaus Emerging \nClass A US$ \n1.76%\n1.89%\nTotal Sub-Fund \n101.51%\n89.41%",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Yacktman\", \"share name\": \"Class A US$\", \"ter\": 1.70}, {\"fund name\": \"Yacktman\", \"tor\": -5.94}, {\"fund name\": \"Driehaus Emerging\", \"share name\": \"Class A US$\", \"ter\": 1.76}, {\"fund name\": \"Driehaus Emerging\", \"tor\": 101.51}]}"
]
}
],
"extreme_complex": [
{
"title": "Complex Data Table Structure",
"regex": "([A-Z]{1,2}\\,\\s?){3,}",
"contents": [
"Complex Data Table Structure",
"Table structure: the first column is fund name, for each table title, there are a lot of share class names in it.",
"Please split these share class names and extract all of relevant data as fund name, share name, data point and value one by one from the table.",
"-----Example Start-----",
"Charges and expenses (continued) ",
"d) Operating, Administrative and Servicing Expenses / Operating Currency Hedged Share Class Fees (continued)",
"The following table shows the rates of Operating, Administrative and Servicing Expenses:",
"Class A, B, E, ",
"M,O ",
"EQUITY SUB-FUNDS ",
"a) Equity sub-funds ",
"Fund 1",
"0.35",
"Fund 2",
"0.26",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Fund 1\", \"share name\": \"A\", \"ogc\": 0.35},",
"{\"fund name\": \"Fund 1\", \"share name\": \"B\", \"ogc\": 0.35},",
"{\"fund name\": \"Fund 1\", \"share name\": \"E\", \"ogc\": 0.35},",
"{\"fund name\": \"Fund 1\", \"share name\": \"M\", \"ogc\": 0.35},",
"{\"fund name\": \"Fund 1\", \"share name\": \"O\", \"ogc\": 0.35}",
"{\"fund name\": \"Fund 2\", \"share name\": \"A\", \"ogc\": 0.26},",
"{\"fund name\": \"Fund 2\", \"share name\": \"B\", \"ogc\": 0.26},",
"{\"fund name\": \"Fund 2\", \"share name\": \"E\", \"ogc\": 0.26},",
"{\"fund name\": \"Fund 2\", \"share name\": \"M\", \"ogc\": 0.26},",
"{\"fund name\": \"Fund 2\", \"share name\": \"O\", \"ogc\": 0.26}]}"
]
}
]
},
"output_requirement": {
"common": [
"If possible, please extract fund name, share name, TOR, TER, performance fees, OGC values as the output.",
"If find share name, and exist relevant currency, please output share name + currency, e.g. share name is \"Class A\", currency is \"USD\", the output share name should be: \"Class A USD\".",
"If find fund name, and exist sub fund name, please output fund name + sub fund name, e.g. fund name is \"Black Rock European\", sub fund name is \"Growth\", the output fund name should be: \"Black Rock European Growth\".",
"Only output the data point which with relevant value.",
"Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
"Please also output the data point reported name in context.",
"Example:",
"-----Example Start-----",
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
"-----Example End-----",
"Output:",
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}], \"dp_reported_name\": {\"ter\": \"TER\", \"performance_fee\": \"Performance\nfees\"}}",
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
"The output should be JSON format, the format is like below example(s):"
],
"fund_level": [
"[{\"fund name\": \"fund 1 - sub fund name 1\",\"tor\": 35.26}, {\"fund name\": \"fund 2 - sub fund name 2\",\"tor\": -28.26}, {\"fund name\": \"fund 3\",\"tor\": 115.52,}]"
],
"share_level": {
"fund_name": [
"fund 1",
"fund 2",
"fund 3"
],
"share_name": [
"share 1",
"share 2",
"share 3"
],
"ogc_value": [
0.05,
1.08,
0.17
],
"ter_value": [
1.23,
2.56,
1.16
],
"performance_fee_value": [
0,
-0.15,
0.11
]
},
"dp_reported_name" : {
"tor": "TOR",
"ogc": "OGC",
"ter": "TER",
"performance_fee": "Performance fees"
}
},
"end": [
"Only output JSON data.",
"Don't output the value which not exist in context, especially for fund level datapoint: TOR.",
"If can't find share class name in context, please output empty JSON data: {\"data\": []}"
]
}

View File

@ -0,0 +1,35 @@
Smith is a professional who processes financial reports.
He wants to extract table(s) from a PDF and output them in markdown format.
He decides to ask ChatGPT4o to help him with this.
Smith's prompt is as below:
--------------------------------------Smith's prompts start--------------------------------------
Instructions:
Please read the image carefully.
Answer below questions:
1. Please find the table or tables in the image.
2. Output the table contents as markdown format, it's like:
|name|age|hobby|
|Annie|18|music|
The contents should be exactly precise as the image contents.
3. Please output the results as JSON format, the result member is with legal markdown table format, the example is:
{
"tables": ["
|name|age|hobby|
|Annie|18|music|
"]
}
4. Only output JSON with tables
Here is the answer from ChatGPT4o:
--------------------------------------ChatGPT4o start--------------------------------------
|Share Class|TER for the year (Note 6)|\n|---|---|\n|AI - Shares| |\n|BF - Shares| |\n|BI - Shares| |\n|BP - Shares| |\n|E - Shares|0.30%|\n|HAF - SEK Shares|0.84%|\n|HAI - SEK Shares|1.59%|\n|HB - EUR Shares| |\n|HB - SEK Shares| |\n|HBC - EUR Shares|0.65%|\n|HBF - EUR Shares| |\n|HBF - NOK Shares| |\n|HBF - SEK Shares| |\n|HBI - DKK Shares| |\n|HBI - EUR Shares| |\n|HBI - NOK Shares| |\n|HBI - SEK Shares| |\n|HY - DKK Shares| |\n|HY - EUR Shares| |\n|HY - SEK Shares| |\n|LE - Shares| |\n|LP - Shares| |\n|X - Shares| |\n|Y - Shares|0.09%|
--------------------------------------ChatGPT4o end--------------------------------------
But it's incorrect, the correct answer is as below:
--------------------------------------correct answer start--------------------------------------
|Share Class|TER for the year (Note 6)|\n|---|---|\n|AI - Shares| |\n|BF - Shares| |\n|BI - Shares|0.30%|\n|BP - Shares|0.84%|\n|E - Shares|1.59%|\n|HAF - SEK Shares| |\n|HAI - SEK Shares| |\n|HB - EUR Shares| |\n|HB - SEK Shares| |\n|HBC - EUR Shares|0.65%|\n|HBF - EUR Shares| |\n|HBF - NOK Shares| |\n|HBF - SEK Shares| |\n|HBI - DKK Shares| |\n|HBI - EUR Shares| |\n|HBI - NOK Shares| |\n|HBI - SEK Shares| |\n|HY - DKK Shares| |\n|HY - EUR Shares| |\n|HY - SEK Shares| |\n|LE - Shares| |\n|LP - Shares| |\n|X - Shares| |\n|Y - Shares|0.09%|
--------------------------------------correct answer end--------------------------------------
Please analyze the image, the incorrect answer, and the correct answer, then help Mr. Smith optimize the instructions and output them in JSON format: {"Instructions": "optimized instructions"}
Answer:

View File

@ -0,0 +1,18 @@
Instructions:
Please read the image carefully.
Answer below questions:
1. Please find the table or tables in the image.
2. Output the table contents as markdown format, it's like:
|name|age|hobby|
|Annie|18|music|
The contents should be exactly precise as the image contents.
3. Please output the results as JSON format, the result member is with legal markdown table format, the example is:
{
"tables": ["
|name|age|hobby|
|Annie|18|music|
"]
}
4. Only output JSON with tables
Answer:

View File

@ -0,0 +1,11 @@
Instructions:
Please read the image carefully.
Answer the following questions:
1. Identify the table or tables in the image.
2. Output the table contents in markdown format, ensuring the table structure and contents are exactly as in the image.
The format should be: |Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|
3. Output the results in JSON format with the key 'tables' containing the markdown table(s).
The format should be:
{"tables": ["|Column1|Column2|\n|---|---|\n|Row1Col1|Row1Col2|"]}
4. Only output JSON with tables.
Answer:

View File

@ -0,0 +1,21 @@
Context:
{page_text}
Instructions:
Please read the context carefully.
Answer the questions below:
1. Please find the table or tables in the context.
2. Output the table contents as markdown format, it's like:
|name|age|hobby|
|Annie|18|music|
The contents should be exactly precise as the context.
3. Please output the results as JSON format, the result member is with legal markdown table format, the example is:
{
"tables": ["
|name|age|hobby|
|Annie|18|music|
"]
}
4. Only output JSON with tables
Answer:

View File

@ -0,0 +1,11 @@
Instructions:
Please read the image carefully.
Answer the following questions:
1. Identify the text contents in the image.
2. Output the text contents, ensuring the contents are exactly as in the image.
The text should follow exactly the same order as it appears in the image.
3. Output the results in JSON format with the key 'text' containing the extracted text.
The format should be:
{"text": "image contents text"}
4. Only output JSON with text.
Answer:

318
main.py
View File

@ -77,9 +77,13 @@ class EMEA_AR_Parsing:
try:
os.makedirs(output_pdf_text_folder, exist_ok=True)
if self.filter_pages.apply_pdf2html:
output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pdf2html/")
output_pdf_text_folder = os.path.join(
output_pdf_text_folder, "pdf2html/"
)
else:
output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pymupdf/")
output_pdf_text_folder = os.path.join(
output_pdf_text_folder, "pymupdf/"
)
os.makedirs(output_pdf_text_folder, exist_ok=True)
self.page_text_file = os.path.join(
output_pdf_text_folder, f"{self.doc_id}_page_text.json"
@ -167,7 +171,9 @@ class EMEA_AR_Parsing:
if page_index == -1:
continue
extract_data_list = data.get("extract_data", {}).get("data", [])
dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
dp_reported_name_dict = data.get("extract_data", {}).get(
"dp_reported_name", {}
)
highlighted_value_list = []
for extract_data in extract_data_list:
for data_point, value in extract_data.items():
@ -181,7 +187,7 @@ class EMEA_AR_Parsing:
"data_point": data_point,
"parent_text_block": None,
"value": value,
"annotation_attribute": {}
"annotation_attribute": {},
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(value)
@ -196,13 +202,15 @@ class EMEA_AR_Parsing:
"data_point": data_point,
"parent_text_block": None,
"value": reported_name,
"annotation_attribute": {}
"annotation_attribute": {},
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(reported_name)
drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
output_pdf_folder=self.drilldown_folder)
drilldown_result = pdf_util.batch_drilldown(
drilldown_data_list=drilldown_data_list,
output_pdf_folder=self.drilldown_folder,
)
annotation_list = []
if len(drilldown_result) > 0:
logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
@ -212,23 +220,37 @@ class EMEA_AR_Parsing:
if self.drilldown_folder is not None and len(self.drilldown_folder) > 0:
drilldown_data_folder = os.path.join(self.drilldown_folder, "data/")
os.makedirs(drilldown_data_folder, exist_ok=True)
drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")
drilldown_file = os.path.join(
drilldown_data_folder, f"{doc_id}_drilldown.xlsx"
)
drilldown_source_df = pd.DataFrame(drilldown_data_list)
annotation_list_df = pd.DataFrame(annotation_list)
# set drilldown_result_df column order as doc_id, pdf_file, page_index,
# data_point, value, matching_val_area, normalized_bbox
try:
annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
"data_point", "value", "matching_val_area",
"normalized_bbox"]]
annotation_list_df = annotation_list_df[
[
"doc_id",
"pdf_file",
"page_index",
"data_point",
"value",
"matching_val_area",
"normalized_bbox",
]
]
except Exception as e:
logger.error(f"Error: {e}")
logger.info(f"Writing drilldown data to {drilldown_file}")
try:
with pd.ExcelWriter(drilldown_file) as writer:
drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")
drilldown_source_df.to_excel(
writer, index=False, sheet_name="source_data"
)
annotation_list_df.to_excel(
writer, index=False, sheet_name="drilldown_data"
)
except Exception as e:
logger.error(f"Error: {e}")
annotation_list = annotation_list_df.to_dict(orient="records")
@ -267,7 +289,9 @@ class EMEA_AR_Parsing:
def filter_pages(doc_id: str, pdf_folder: str, doc_source: str) -> None:
logger.info(f"Filter EMEA AR PDF pages for doc_id: {doc_id}")
emea_ar_parsing = EMEA_AR_Parsing(doc_id, doc_source=doc_source, pdf_folder=pdf_folder)
emea_ar_parsing = EMEA_AR_Parsing(
doc_id, doc_source=doc_source, pdf_folder=pdf_folder
)
datapoint_page_info, result_details = emea_ar_parsing.get_datapoint_page_info()
return datapoint_page_info, result_details
@ -315,7 +339,9 @@ def mapping_data(
extract_way=extract_way,
drilldown_folder=drilldown_folder,
)
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
re_run=re_run_extract_data
)
doc_mapping_data = emea_ar_parsing.mapping_data(
data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
)
@ -393,7 +419,7 @@ def batch_start_job(
re_run_mapping_data: bool = False,
force_save_total_data: bool = False,
calculate_metrics: bool = False,
total_data_prefix: str = None
total_data_prefix: str = None,
):
pdf_files = glob(pdf_folder + "*.pdf")
doc_list = []
@ -505,7 +531,9 @@ def batch_start_job(
# metrics_output_folder,
# )
logger.info(f"Calculating metrics for investment mapping by database document mapping")
logger.info(
f"Calculating metrics for investment mapping by database document mapping"
)
missing_error_list, metrics_list, metrics_file = get_metrics(
"document_mapping_in_db",
output_file,
@ -527,7 +555,10 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
sec_id_list = document_mapping["SecId"].unique().tolist()
id_list = fund_id_list + sec_id_list
# filter doc_mapping_data by id_list or empty id
filter_doc_mapping_data = doc_mapping_data[(doc_mapping_data["investment_id"].isin(id_list)) | (doc_mapping_data["investment_id"] == "")]
filter_doc_mapping_data = doc_mapping_data[
(doc_mapping_data["investment_id"].isin(id_list))
| (doc_mapping_data["investment_id"] == "")
]
data_in_mapping_df_list.append(filter_doc_mapping_data)
result_mapping_data_df = pd.concat(data_in_mapping_df_list)
result_mapping_data_df.reset_index(drop=True, inplace=True)
@ -616,7 +647,9 @@ def get_metrics(
ground_truth_sheet_name=ground_truth_sheet_name,
output_folder=output_folder,
)
missing_error_list, metrics_list, metrics_file = metrics.get_metrics(strict_model=False)
missing_error_list, metrics_list, metrics_file = metrics.get_metrics(
strict_model=False
)
return missing_error_list, metrics_list, metrics_file
@ -638,7 +671,12 @@ def test_auto_generate_instructions():
datapoint_list.remove("doc_id")
data_extraction = DataExtraction(
"emear_ar", doc_id, pdf_file, page_text_dict, datapoint_page_info, document_mapping_info_df
"emear_ar",
doc_id,
pdf_file,
page_text_dict,
datapoint_page_info,
document_mapping_info_df,
)
page_index_list = list(page_text_dict.keys())
if len(page_index_list) > 0:
@ -763,13 +801,14 @@ def test_mapping_raw_name():
raw_share_name=raw_share_name,
parent_id="FSGBR051XK",
matching_type="share",
process_cache=process_cache
process_cache=process_cache,
)
print(mapping_info)
def test_translate_pdf():
from core.data_translate import Translate_PDF
pdf_file = r"/data/emea_ar/pdf/451063582.pdf"
output_folder = r"/data/translate/output/"
translate_pdf = Translate_PDF(pdf_file, output_folder)
@ -778,7 +817,9 @@ def test_translate_pdf():
def test_replace_abbrevation():
from utils.biz_utils import replace_abbrevation
text_list= ["M&G European Credit Investment Fund A CHFH Acc",
text_list = [
"M&G European Credit Investment Fund A CHFH Acc",
"M&G European Credit Investment Fund A CHFHInc",
"M&G European Credit Investment Fund A USDHAcc",
"M&G European High Yield Credit Investment Fund E GBPHedgedAcc",
@ -787,7 +828,8 @@ def test_replace_abbrevation():
"M&G Total Return Credit Investment Fund Class WI GBPHedgedInc",
"M&G Total Return Credit Investment Fund Class W GBP HedgedInc",
"M&G Total Return Credit Investment Fund Class P CHF H Acc",
"M&G Total Return Credit Investment Fund P EUR Inc"]
"M&G Total Return Credit Investment Fund P EUR Inc",
]
for text in text_list:
result = replace_abbrevation(text)
logger.info(f"Original text: {text}, replaced text: {result}")
@ -795,6 +837,7 @@ def test_replace_abbrevation():
def test_calculate_metrics():
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
mapping_file = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/doc_ar_data_with_all_4_dp.xlsx"
@ -808,7 +851,10 @@ def test_calculate_metrics():
mapping_share_id = mapping_df["FundClassId"].unique().tolist()
mapping_id_list = mapping_fund_id + mapping_share_id
# filter data_df whether investment_id in mapping_id_list
filter_data_df = data_df[(data_df["investment_id"].isin(mapping_id_list)) | (data_df["investment_id"] == "")]
filter_data_df = data_df[
(data_df["investment_id"].isin(mapping_id_list))
| (data_df["investment_id"] == "")
]
# Investment mapping data
mapping_metrics = get_sub_metrics(filter_data_df, "investment_mapping")
@ -830,21 +876,36 @@ def test_calculate_metrics():
logger.info(f"OGC metrics: {ogc_metrics}")
# performance_fee data
performance_fee_data_df = filter_data_df[filter_data_df["datapoint"] == "performance_fee"]
performance_fee_metrics = get_sub_metrics(performance_fee_data_df, "performance_fee")
performance_fee_data_df = filter_data_df[
filter_data_df["datapoint"] == "performance_fee"
]
performance_fee_metrics = get_sub_metrics(
performance_fee_data_df, "performance_fee"
)
logger.info(f"Performance fee metrics: {performance_fee_metrics}")
metrics_df = pd.DataFrame([mapping_metrics, tor_metrics, ter_metrics, ogc_metrics, performance_fee_metrics])
metrics_df = pd.DataFrame(
[
mapping_metrics,
tor_metrics,
ter_metrics,
ogc_metrics,
performance_fee_metrics,
]
)
metrics_df.reset_index(drop=True, inplace=True)
output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/"
output_metrics_file = os.path.join(output_folder,
r"mapping_data_info_30_documents_all_4_datapoints_roughly_metrics.xlsx")
output_metrics_file = os.path.join(
output_folder,
r"mapping_data_info_30_documents_all_4_datapoints_roughly_metrics.xlsx",
)
with pd.ExcelWriter(output_metrics_file) as writer:
metrics_df.to_excel(writer, index=False, sheet_name="metrics")
def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
gt_list = [1] * len(data_df)
pre_list = data_df["check"].tolist()
# convert pre_list member to be integer
@ -867,54 +928,81 @@ def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict:
"Precision": precision,
"Recall": recall,
"Accuracy": accuracy,
"Support": support
"Support": support,
}
return metrics
def replace_rerun_data(new_data_file: str, original_data_file: str):
data_in_doc_mapping_sheet = "data_in_doc_mapping"
total_mapping_data_sheet = "total_mapping_data"
extract_data_sheet = "extract_data"
new_data_in_doc_mapping = pd.read_excel(new_data_file, sheet_name=data_in_doc_mapping_sheet)
new_total_mapping_data = pd.read_excel(new_data_file, sheet_name=total_mapping_data_sheet)
new_data_in_doc_mapping = pd.read_excel(
new_data_file, sheet_name=data_in_doc_mapping_sheet
)
new_total_mapping_data = pd.read_excel(
new_data_file, sheet_name=total_mapping_data_sheet
)
new_extract_data = pd.read_excel(new_data_file, sheet_name=extract_data_sheet)
document_list = new_data_in_doc_mapping["doc_id"].unique().tolist()
original_data_in_doc_mapping = pd.read_excel(original_data_file, sheet_name=data_in_doc_mapping_sheet)
original_total_mapping_data = pd.read_excel(original_data_file, sheet_name=total_mapping_data_sheet)
original_extract_data = pd.read_excel(original_data_file, sheet_name=extract_data_sheet)
original_data_in_doc_mapping = pd.read_excel(
original_data_file, sheet_name=data_in_doc_mapping_sheet
)
original_total_mapping_data = pd.read_excel(
original_data_file, sheet_name=total_mapping_data_sheet
)
original_extract_data = pd.read_excel(
original_data_file, sheet_name=extract_data_sheet
)
# remove data in original data by document_list
original_data_in_doc_mapping = original_data_in_doc_mapping[~original_data_in_doc_mapping["doc_id"].isin(document_list)]
original_total_mapping_data = original_total_mapping_data[~original_total_mapping_data["doc_id"].isin(document_list)]
original_extract_data = original_extract_data[~original_extract_data["doc_id"].isin(document_list)]
original_data_in_doc_mapping = original_data_in_doc_mapping[
~original_data_in_doc_mapping["doc_id"].isin(document_list)
]
original_total_mapping_data = original_total_mapping_data[
~original_total_mapping_data["doc_id"].isin(document_list)
]
original_extract_data = original_extract_data[
~original_extract_data["doc_id"].isin(document_list)
]
# merge new data to original data
new_data_in_doc_mapping = pd.concat([original_data_in_doc_mapping, new_data_in_doc_mapping])
new_data_in_doc_mapping = pd.concat(
[original_data_in_doc_mapping, new_data_in_doc_mapping]
)
new_data_in_doc_mapping.reset_index(drop=True, inplace=True)
new_total_mapping_data = pd.concat([original_total_mapping_data, new_total_mapping_data])
new_total_mapping_data = pd.concat(
[original_total_mapping_data, new_total_mapping_data]
)
new_total_mapping_data.reset_index(drop=True, inplace=True)
new_extract_data = pd.concat([original_extract_data, new_extract_data])
new_extract_data.reset_index(drop=True, inplace=True)
with pd.ExcelWriter(original_data_file) as writer:
new_data_in_doc_mapping.to_excel(writer, index=False, sheet_name=data_in_doc_mapping_sheet)
new_total_mapping_data.to_excel(writer, index=False, sheet_name=total_mapping_data_sheet)
new_data_in_doc_mapping.to_excel(
writer, index=False, sheet_name=data_in_doc_mapping_sheet
)
new_total_mapping_data.to_excel(
writer, index=False, sheet_name=total_mapping_data_sheet
)
new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)
def batch_run_documents(doc_source: str = "emea_ar",
def batch_run_documents(
doc_source: str = "emea_ar",
special_doc_id_list: list = None,
pdf_folder:str = r"/data/emea_ar/pdf/",
pdf_folder: str = r"/data/emea_ar/pdf/",
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
output_extract_data_child_folder:str = r"/data/emea_ar/output/extract_data/docs/",
output_extract_data_total_folder:str = r"/data/emea_ar/output/extract_data/total/",
output_mapping_child_folder:str = r"/data/emea_ar/output/mapping_data/docs/",
output_mapping_total_folder:str = r"/data/emea_ar/output/mapping_data/total/",
drilldown_folder: str = r"/data/emea_ar/output/drilldown/"):
sample_document_list_folder = r'./sample_documents/'
output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
):
sample_document_list_folder = r"./sample_documents/"
document_list_files = glob(sample_document_list_folder + "*.txt")
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -931,9 +1019,11 @@ def batch_run_documents(doc_source: str = "emea_ar",
file_base_name_candidates = []
for document_list_file in document_list_files:
file_base_name = os.path.basename(document_list_file).replace(".txt", "")
if (file_base_name_candidates is not None and
len(file_base_name_candidates) > 0 and
file_base_name not in file_base_name_candidates):
if (
file_base_name_candidates is not None
and len(file_base_name_candidates) > 0
and file_base_name not in file_base_name_candidates
):
continue
with open(document_list_file, "r", encoding="utf-8") as f:
doc_id_list = f.readlines()
@ -954,7 +1044,7 @@ def batch_run_documents(doc_source: str = "emea_ar",
re_run_mapping_data,
force_save_total_data=force_save_total_data,
calculate_metrics=calculate_metrics,
total_data_prefix=file_base_name
total_data_prefix=file_base_name,
)
else:
batch_start_job(
@ -976,28 +1066,34 @@ def batch_run_documents(doc_source: str = "emea_ar",
)
def batch_initial_document(sample_document_list_folder: str = r'./sample_documents/',
def batch_initial_document(
sample_document_list_folder: str = r"./sample_documents/",
document_list_file: str = "sample_document_complex.txt",
doc_source: str = "emea_ar",
pdf_folder: str = r"/data/emea_ar/pdf/",
output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/"):
document_list_file_path = os.path.join(sample_document_list_folder, document_list_file)
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
):
document_list_file_path = os.path.join(
sample_document_list_folder, document_list_file
)
with open(document_list_file_path, "r", encoding="utf-8") as f:
doc_id_list = f.readlines()
doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
for doc_id in tqdm(doc_id_list):
logger.info(f"Start to initial document: {doc_id}")
emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
emea_ar_parsing = EMEA_AR_Parsing(
doc_id=doc_id,
doc_source=doc_source,
pdf_folder=pdf_folder,
output_extract_data_folder=output_extract_data_child_folder,
output_mapping_data_folder=output_mapping_child_folder)
output_mapping_data_folder=output_mapping_child_folder,
)
def merge_output_data(data_file_path: str,
document_mapping_file: str,
output_data_file_path: str):
def merge_output_data(
data_file_path: str, document_mapping_file: str, output_data_file_path: str
):
data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date")
# set doc_id to be string type
@ -1013,13 +1109,17 @@ def merge_output_data(data_file_path: str,
"tor": "TurnoverRatio",
"ter": "NetExpenseRatio",
"ogc": "OngoingCharge",
"performance_fee": "PerformanceFee"
"performance_fee": "PerformanceFee",
}
total_data_list = []
for doc_id in tqdm(doc_id_list):
doc_data_list = []
doc_data_df = data_df[data_df["doc_id"] == doc_id]
doc_date = str(document_mapping_df[document_mapping_df["DocumentId"] == doc_id]["EffectiveDate"].values[0])[0:10]
doc_date = str(
document_mapping_df[document_mapping_df["DocumentId"] == doc_id][
"EffectiveDate"
].values[0]
)[0:10]
exist_raw_name_list = []
for index, row in doc_data_df.iterrows():
doc_id = str(row["doc_id"])
@ -1035,7 +1135,10 @@ def merge_output_data(data_file_path: str,
for exist_raw_name_info in exist_raw_name_list:
exist_raw_name = exist_raw_name_info["raw_name"]
exist_investment_type = exist_raw_name_info["investment_type"]
if exist_raw_name == raw_name and exist_investment_type == investment_type:
if (
exist_raw_name == raw_name
and exist_investment_type == investment_type
):
exist = True
break
if not exist:
@ -1050,13 +1153,18 @@ def merge_output_data(data_file_path: str,
"NetExpenseRatio": "",
"OngoingCharge": "",
"TurnoverRatio": "",
"PerformanceFee": ""
"PerformanceFee": "",
}
exist_raw_name_list.append({"raw_name": raw_name, "investment_type": investment_type})
exist_raw_name_list.append(
{"raw_name": raw_name, "investment_type": investment_type}
)
doc_data_list.append(data)
# find data from total_data_list by raw_name
for data in doc_data_list:
if data["RawName"] == raw_name and data["investment_type"] == investment_type:
if (
data["RawName"] == raw_name
and data["investment_type"] == investment_type
):
update_key = data_point_dict[datapoint]
data[update_key] = value
if page_index not in data["page_index"]:
@ -1069,9 +1177,9 @@ def merge_output_data(data_file_path: str,
total_data_df.to_excel(writer, index=False, sheet_name="total_data")
def merge_output_data_aus_prospectus(data_file_path: str,
document_mapping_file: str,
output_data_file_path: str):
def merge_output_data_aus_prospectus(
data_file_path: str, document_mapping_file: str, output_data_file_path: str
):
# TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date")
@ -1088,13 +1196,17 @@ def merge_output_data_aus_prospectus(data_file_path: str,
"tor": "TurnoverRatio",
"ter": "NetExpenseRatio",
"ogc": "OngoingCharge",
"performance_fee": "PerformanceFee"
"performance_fee": "PerformanceFee",
}
total_data_list = []
for doc_id in tqdm(doc_id_list):
doc_data_list = []
doc_data_df = data_df[data_df["doc_id"] == doc_id]
doc_date = str(document_mapping_df[document_mapping_df["DocumentId"] == doc_id]["EffectiveDate"].values[0])[0:10]
doc_date = str(
document_mapping_df[document_mapping_df["DocumentId"] == doc_id][
"EffectiveDate"
].values[0]
)[0:10]
exist_raw_name_list = []
for index, row in doc_data_df.iterrows():
doc_id = str(row["doc_id"])
@ -1110,7 +1222,10 @@ def merge_output_data_aus_prospectus(data_file_path: str,
for exist_raw_name_info in exist_raw_name_list:
exist_raw_name = exist_raw_name_info["raw_name"]
exist_investment_type = exist_raw_name_info["investment_type"]
if exist_raw_name == raw_name and exist_investment_type == investment_type:
if (
exist_raw_name == raw_name
and exist_investment_type == investment_type
):
exist = True
break
if not exist:
@ -1125,13 +1240,18 @@ def merge_output_data_aus_prospectus(data_file_path: str,
"NetExpenseRatio": "",
"OngoingCharge": "",
"TurnoverRatio": "",
"PerformanceFee": ""
"PerformanceFee": "",
}
exist_raw_name_list.append({"raw_name": raw_name, "investment_type": investment_type})
exist_raw_name_list.append(
{"raw_name": raw_name, "investment_type": investment_type}
)
doc_data_list.append(data)
# find data from total_data_list by raw_name
for data in doc_data_list:
if data["RawName"] == raw_name and data["investment_type"] == investment_type:
if (
data["RawName"] == raw_name
and data["investment_type"] == investment_type
):
update_key = data_point_dict[datapoint]
data[update_key] = value
if page_index not in data["page_index"]:
@ -1161,11 +1281,11 @@ if __name__ == "__main__":
# output_extract_data_child_folder=output_extract_data_child_folder,
# output_mapping_child_folder=output_mapping_child_folder)
# special_doc_id_list = ["553242411"]
doc_source = "aus_prospectus"
doc_source = "emea_ar"
if doc_source == "aus_prospectus":
special_doc_id_list: list = ["539790009",
special_doc_id_list: list = [
"539790009",
"542300403",
"542301117",
"542306317",
@ -1175,18 +1295,26 @@ if __name__ == "__main__":
"554431052",
"554851189",
"555377021",
"555654388"]
"555654388",
]
special_doc_id_list: list = ["554851189"]
pdf_folder:str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
output_extract_data_total_folder:str = r"/data/aus_prospectus/output/extract_data/total/"
output_mapping_child_folder:str = r"/data/aus_prospectus/output/mapping_data/docs/"
output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/"
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (
r"/data/aus_prospectus/output/extract_data/docs/"
)
output_extract_data_total_folder: str = (
r"/data/aus_prospectus/output/extract_data/total/"
)
output_mapping_child_folder: str = (
r"/data/aus_prospectus/output/mapping_data/docs/"
)
output_mapping_total_folder: str = (
r"/data/aus_prospectus/output/mapping_data/total/"
)
drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
elif doc_source == "emea_ar":
special_doc_id_list = ["553242411"]
batch_run_documents(doc_source=doc_source,
batch_run_documents(
doc_source=doc_source,
special_doc_id_list=special_doc_id_list,
pdf_folder=pdf_folder,
output_pdf_text_folder=output_pdf_text_folder,
@ -1196,6 +1324,11 @@ if __name__ == "__main__":
output_mapping_total_folder=output_mapping_total_folder,
drilldown_folder=drilldown_folder,
)
elif doc_source == "emea_ar":
special_doc_id_list = ["553242408"]
batch_run_documents(
doc_source=doc_source, special_doc_id_list=special_doc_id_list
)
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
@ -1234,8 +1367,3 @@ if __name__ == "__main__":
# output_extract_data_child_folder,
# extract_way,
# re_run_extract_data)