update for deployment
parent fb4a6402f0
commit f10ff8ee33

@@ -7,3 +7,9 @@
 /test_metrics
 /data
 /sample_documents/japan_prospectus.txt
+/pdf_table_extraction.py
+/playground.ipynb
+/playground.py
+/specific_calc_metrics.py
+/test_specific_biz_logic.py
+/drilldown_practice.py

@@ -22,7 +22,7 @@ swagger = Swagger(app, template=template)
 @app.route('/automation/api/model/emea_ar', methods=['POST'])
 @swag_from('yml/emea_ar.yml')
-def us_ar_data_extract():
+def emea_ar_data_extract():
     """
     Extract EMEA AR cost data from EMEA LUX PDF document
     input sample:

@@ -40,6 +40,7 @@ def us_ar_data_extract():
         return jsonify({"error": "doc_id is required"}), 400

     pdf_folder = r"./data/emea_ar/pdf/"
+    output_pdf_text_folder = r"./data/emea_ar/output/pdf_text/"
     output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
     output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
     drilldown_folder = r"./data/emea_ar/output/drilldown/"

@@ -62,6 +63,7 @@ def us_ar_data_extract():
     emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
                                       doc_source="emea_ar",
                                       pdf_folder=pdf_folder,
+                                      output_pdf_text_folder=output_pdf_text_folder,
                                       output_extract_data_folder=output_extract_data_folder,
                                       output_mapping_data_folder=output_mapping_data_folder,
                                       extract_way=extract_way,

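A minimal sketch of how this route might be exercised once deployed. The path and the doc_id requirement come from the code above; the host, port, JSON body format, and the example document id are assumptions for illustration only.

import requests  # hypothetical client-side call, not part of this commit

payload = {"doc_id": "486378555"}  # doc_id is required by the route; other fields are assumed optional
response = requests.post(
    "http://localhost:5000/automation/api/model/emea_ar",  # host and port are assumptions
    json=payload,
    timeout=600,
)
print(response.status_code, response.json())
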
@@ -4,6 +4,7 @@ import json_repair
 import re
 import fitz
 import pandas as pd
+from traceback import print_exc
 from utils.gpt_utils import chat
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider

@@ -294,6 +295,8 @@ class DataExtraction:
             for current_page_data in page_data_list:
                 if current_page_data in next_page_data_list:
                     next_page_data_list.remove(current_page_data)
+            if len(next_page_data_list) == 0:
+                break
             next_page_extract_data["extract_data"][
                 "data"
             ] = next_page_data_list

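The two added lines short-circuit the page-merging step: once every entry of the next page's data also appears on the current page, there is nothing left to carry over. A standalone sketch of that pattern follows; the function name and the early return are illustrative, not the repository's API (the real code breaks out of an enclosing loop instead).

# illustrative sketch of the duplicate-filtering pattern added above
def merge_next_page(page_data_list, next_page_data_list):
    """Drop next-page rows already seen on the current page; stop early if none remain."""
    for current_page_data in page_data_list:
        if current_page_data in next_page_data_list:
            next_page_data_list.remove(current_page_data)
    if not next_page_data_list:
        return []  # nothing new on the next page, so skip the merge entirely
    return next_page_data_list

print(merge_next_page([{"ter": "0.5%"}], [{"ter": "0.5%"}, {"ogc": "0.2%"}]))
# [{'ogc': '0.2%'}]
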
@@ -1,159 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import recall_score

from utils.logger import logger
from utils.pdf_util import PDFUtil


def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str):
    extract_files = glob(extract_data_folder + '*.json')

    for index, json_file in enumerate(tqdm(extract_files)):
        try:
            # doc_id = file.split('/')[-1].split('.')[0]
            json_base_name = os.path.basename(json_file)
            doc_id = json_base_name.split('.')[0]
            logger.info(f"Processing {doc_id}")
            pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
            if not os.path.exists(pdf_file):
                logger.error(f"PDF file not found for {doc_id}")
                continue
            with open(json_file, "r", encoding="utf-8") as f:
                data_from_gpt = json.load(f)
            drilldown_pdf_document(doc_id=doc_id,
                                   pdf_file=pdf_file,
                                   drilldown_folder=drilldown_folder,
                                   data_from_gpt=data_from_gpt)

        except Exception as e:
            print_exc()
            logger.error(f"Error in processing {doc_id}: {e}")


def drilldown_pdf_document(doc_id: str,
                           pdf_file: str,
                           drilldown_folder: str,
                           data_from_gpt: list) -> list:
    logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
    pdf_util = PDFUtil(pdf_file)
    drilldown_data_list = []
    for data in data_from_gpt:
        doc_id = str(data.get("doc_id", ""))
        # if doc_id != "506326520":
        #     continue
        page_index = data.get("page_index", -1)
        if page_index == -1:
            continue
        extract_data_list = data.get("extract_data", {}).get("data", [])
        dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
        if len(dp_reported_name_dict.keys()) == 0:
            continue
        highlighted_value_list = []
        for extract_data in extract_data_list:
            for data_point, value in extract_data.items():
                if value in highlighted_value_list:
                    continue
                if data_point in ["ter", "ogc", "performance_fee"]:
                    continue
                drilldown_data = {
                    "doc_id": doc_id,
                    "page_index": page_index,
                    "data_point": data_point,
                    "parent_text_block": None,
                    "value": value,
                    "annotation_attribute": {}
                }
                drilldown_data_list.append(drilldown_data)
                highlighted_value_list.append(value)

        for data_point, reported_name in dp_reported_name_dict.items():
            if reported_name in highlighted_value_list:
                continue
            data_point = f"{data_point}_reported_name"
            drilldown_data = {
                "doc_id": doc_id,
                "page_index": page_index,
                "data_point": data_point,
                "parent_text_block": None,
                "value": reported_name,
                "annotation_attribute": {}
            }
            drilldown_data_list.append(drilldown_data)
            highlighted_value_list.append(reported_name)
    drilldown_result = []
    if len(drilldown_data_list) > 0:
        drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                    output_pdf_folder=drilldown_folder)
    if len(drilldown_result) > 0:
        logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
        annotation_list = drilldown_result.get("annotation_list", [])
        for annotation in annotation_list:
            annotation["doc_id"] = doc_id
        if drilldown_folder is not None and len(drilldown_folder) > 0:
            drilldown_data_folder = os.path.join(drilldown_folder, "data/")
            os.makedirs(drilldown_data_folder, exist_ok=True)
            drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")

            drilldown_source_df = pd.DataFrame(drilldown_data_list)
            annotation_list_df = pd.DataFrame(annotation_list)
            # set drilldown_result_df column order as doc_id, pdf_file, page_index,
            # data_point, value, matching_val_area, normalized_bbox
            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
                                                     "data_point", "value", "matching_val_area", "normalized_bbox"]]
            logger.info(f"Writing drilldown data to {drilldown_file}")
            with pd.ExcelWriter(drilldown_file) as writer:
                drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
                annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")


def calculate_metrics():
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    drilldown_data_folder = os.path.join(drilldown_folder, "data/")
    drilldown_files = glob(drilldown_data_folder + '*.xlsx')
    y_true_list = []
    y_pred_list = []
    series_list = []
    for drilldown_file in drilldown_files:
        drilldown_file_base_name = os.path.basename(drilldown_file)
        if drilldown_file_base_name.startswith("~"):
            continue
        drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
        for index, row in drilldown_data.iterrows():
            matching_val_area = row["matching_val_area"]
            # transform matching_val_area to list
            if isinstance(matching_val_area, str):
                matching_val_area = eval(matching_val_area)
            y_true_list.append(1)
            if len(matching_val_area) > 0:
                y_pred_list.append(1)
            else:
                y_pred_list.append(0)
                series_list.append(row)
    recall = recall_score(y_true_list, y_pred_list)
    logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
    no_annotation_df = pd.DataFrame(series_list)
    no_annotation_df.reset_index(drop=True, inplace=True)
    metrics_folder = os.path.join(drilldown_folder, "metrics/")
    os.makedirs(metrics_folder, exist_ok=True)
    metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
    metrics_result = {
        "recall": recall,
        "support": len(y_true_list)
    }
    metrics_df = pd.DataFrame([metrics_result])
    with pd.ExcelWriter(metrics_file) as writer:
        metrics_df.to_excel(writer, index=False, sheet_name="metrics")
        no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")


if __name__ == "__main__":
    pdf_folder = r"/data/emea_ar/pdf/"
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
    drilldown_documents()
    # calculate_metrics()

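In calculate_metrics above, every row of the drilldown_data sheet is treated as a positive ground-truth case (y_true is always 1), and the prediction counts as 1 only when matching_val_area is non-empty, so the reported recall is effectively the hit rate of the highlight step. A small self-contained sketch of that convention, with made-up sample values:

from sklearn.metrics import recall_score

# each tuple: (value we tried to highlight, matching_val_area returned by the drilldown)
rows = [("1.25%", [[0.1, 0.2, 0.3, 0.4]]),    # located on the page
        ("0.75%", []),                         # not located
        ("Turnover", [[0.5, 0.1, 0.9, 0.2]])]

y_true = [1] * len(rows)                       # every row is a real value to locate
y_pred = [1 if area else 0 for _, area in rows]
print(recall_score(y_true, y_pred))            # 0.666...
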
@@ -1,110 +0,0 @@
import pandas as pd
import os
import tqdm
import json_repair
import json
from glob import glob
import fitz
import re
import time
import traceback

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from utils.gpt_utils import chat


class PDFTableExtraction:
    """
    Iterate PDF pages.
    Extract tables from PDF pages.
    Save these tables as markdown files.
    """
    def __init__(self,
                 pdf_file: str,
                 output_folder: str) -> None:
        self.pdf_file = pdf_file
        self.pdf_file_name = os.path.basename(pdf_file)
        self.table_extraction_prompts = self.get_table_extraction_prompts()

        self.output_folder = output_folder
        os.makedirs(output_folder, exist_ok=True)

        self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/')
        os.makedirs(self.prompts_output_folder, exist_ok=True)

        self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/')
        os.makedirs(self.json_output_folder, exist_ok=True)

        self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/')
        os.makedirs(self.table_md_output_folder, exist_ok=True)

    def get_table_extraction_prompts(self):
        instructions_file = r'./instructions/table_extraction_prompts.txt'
        with open(instructions_file, 'r', encoding='utf-8') as file:
            return file.read()

    def extract_tables(self):
        try:
            if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file):
                logger.error(f"Invalid pdf_file: {self.pdf_file}")
                return
            logger.info(f"Start processing {self.pdf_file}")
            pdf_util = PDFUtil(self.pdf_file)
            success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder)
            if success:
                logger.info(f"Successfully extracted text from {self.pdf_file}")

            for page_num, page_text in page_text_dict.items():
                try:
                    self.extract_tables_from_page(page_text, page_num)
                except Exception as e:
                    traceback.print_exc()
                    logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
        except Exception as e:
            logger.error(f"Error in extracting PDF tables: {str(e)}")

    def extract_tables_from_page(self, page_text: str, page_num: int):
        pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
        table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
        prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
        if os.path.exists(prompts_response_file):
            logger.info(f"Prompts response file already exists: {prompts_response_file}")
            return

        response, with_error = chat(table_extraction_prompts)
        if with_error:
            logger.error(f"Error in extracting tables from page")
            return

        json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response)
        if json_response is None:
            logger.info(f"Can't extract tables from page")
            return

        table_json_text = json_response.group(1)
        table_data = {"tables": []}
        try:
            table_data = json.loads(table_json_text)
        except:
            table_data = json_repair.loads(table_json_text)
        self.save_table_data(table_data, page_num)

        prompts_response = f'{table_extraction_prompts}\n\n{response}'
        with open(prompts_response_file, 'w', encoding='utf-8') as file:
            file.write(prompts_response)

    def save_table_data(self, table_data: dict, page_num: int):
        pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
        json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
        with open(json_output_file, 'w', encoding='utf-8') as file:
            file.write(json.dumps(table_data, indent=4))

        table_list = table_data.get('tables', [])
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
            table = re.sub(r'(\n)+', '\n', table)
            with open(table_md_file, 'w', encoding='utf-8') as file:
                file.write(table)

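Judging from the constructor and extract_tables above, the class was driven roughly as in the sketch below. The input path is an example taken from this repository's data folder, the output folder is an assumption, and the prompts file ./instructions/table_extraction_prompts.txt must exist for get_table_extraction_prompts to succeed.

# hypothetical driver for the deleted PDFTableExtraction class
extractor = PDFTableExtraction(
    pdf_file=r"./data/emea_ar/pdf/501380553.pdf",      # example document id seen elsewhere in this repo
    output_folder=r"./data/emea_ar/output/tables/",    # assumed output location
)
extractor.extract_tables()
# per page this writes: <name>_<page>.txt (prompt + GPT response), <name>_<page>.json (parsed tables),
# and <name>_<page>_<table>.md (one markdown file per extracted table)
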
playground.ipynb (713 lines)
@@ -1,713 +0,0 @@
# Notebook code cells (rendered DataFrame/progress-bar outputs and JSON scaffolding summarized inline):

# In [27]
from utils.biz_utils import add_slash_to_text_as_regex
import json
import re

# In [29]
regex = r"Turnover \n"

# In [30]  -> 'Turnover\\s+\\n'
add_slash_to_text_as_regex(regex)

# In [46]
text = "What was the share of investments made in transitional and enabling activities? \nTaxonomy-aligned\nactivities are expressed \nas a share of\n\u2022\t Turnover reflects the\n"

# In [32]  -> <re.Match object; span=(141, 151), match='Turnover \n'>
re.search(regex, text)

# In [35]
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"

# In [47]  -> prints "no match" for every tor regex: TOR, Turnover\*\s+\n, Turnover\s+\n,
#             Turnover\s+Ratio, Turnover\s+Rate, Portfolio\s+Turnover, Portfolio\s+turnover\s+ratio,
#             Portfolio\s+turnover\s+rate, PTR, Annual\s+Portfolio\s+Turnover\s+Ratio
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
    datapoint_keywords_config = json.load(file)

tor_regex_list = datapoint_keywords_config.get("tor", {}).get("english", [])

for tor_regex in tor_regex_list:
    regex = add_slash_to_text_as_regex(tor_regex)
    search = re.search(regex, text)
    if search:
        print(f"{regex} match {search.group()}")
    else:
        print(f"{regex} no match")

# In [8]
from utils.sql_query_util import query_investment_by_provider, query_document_fund_mapping
import pandas as pd

# In [3]
provider_mapping = query_investment_by_provider(company_id="0C00008QVP")

# In [6]  -> DataFrame rows 840-848: nine T. Rowe Price Funds Series II SICAV share classes
#            (Credit Opportunities and Floating Rate Loan funds, ISINs LU1053597990 ... LU2122516821)
provider_mapping[provider_mapping["FundName"].str.contains("T. Rowe Price Funds Series II SICAV")]

# In [9]
document_mapping = query_document_fund_mapping(doc_id="486378555")

# In [10]  -> DataFrame: six share classes of fund FS0000DUH5 (Floating Rate Loan Fund) mapped to
#             document 486378555, effective date 2022-06-30
document_mapping

# In [11]  -> the six Floating Rate Loan Fund share-class names: I Cap, Ih (CHF) Cap, Ih (EUR) Cap,
#             Q (USD) Cap, Qd (USD) Dis, Sdn (AUD) Dis
list(document_mapping["ShareClassName"].unique())

# In [1]
import pymupdf4llm

# In [2]  -> converts 47 pages and writes 107851 bytes
md_text = pymupdf4llm.to_markdown(r"./data/emea_ar/pdf/501380553.pdf")

# now work with the markdown text, e.g. store as a UTF8-encoded file
import pathlib
pathlib.Path(r"./data/emea_ar/output/markdown/501380553.md").write_bytes(md_text.encode())

# In [3]
def get_fund_name(fund_name: str, fund_feature: str):
    if not fund_name.endswith(fund_feature):
        return fund_name
    fund_feature = fund_feature + " "
    fund_name_split = fund_name.split(fund_feature)
    if len(fund_name_split) > 1:
        last_fund = fund_name_split[-1].strip()
        if len(last_fund) == 0:
            last_fund = fund_name_split[-2].strip()
        fund_name = f"{last_fund} {fund_feature}"
    return fund_name

# In [2]  -> 'C Fund'
get_fund_name("A Fund B Fund C Fund", "Fund")

# In [5]
fund_name = "JPMorgan Investment Fund - Global Income Conservative Fund"

# In [6]  -> '- Global Income Conservative Fund Fund '
get_fund_name(fund_name, "Fund")

# (empty trailing cell; notebook kernel: torch2_real, Python 3.10.11)

playground.py (137 lines)
@@ -1,137 +0,0 @@
import os
import json
import base64
import json_repair
from utils.pdf_util import PDFUtil
from utils.logger import logger
from utils.gpt_utils import chat


def get_base64_pdf_image_list(
    pdf_file: str, pdf_page_index_list: list, output_folder: str = None
) -> dict:
    if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None
    pdf_util = PDFUtil(pdf_file)
    if pdf_page_index_list is None or len(pdf_page_index_list) == 0:
        pdf_page_index_list = list(range(pdf_util.get_page_count()))
    if output_folder is not None and len(output_folder) > 0:
        os.makedirs(output_folder, exist_ok=True)
    pdf_image_info = pdf_util.extract_images(
        pdf_page_index_list=pdf_page_index_list, output_folder=output_folder
    )
    return pdf_image_info


def encode_image(image_path: str):
    if image_path is None or len(image_path) == 0 or not os.path.exists(image_path):
        return None
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def chat_with_image(
    pdf_file: str,
    pdf_page_index_list: list,
    image_instructions_file: str,
    image_folder: str,
    gpt_folder: str,
):
    if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None
    pdf_image_info = get_base64_pdf_image_list(
        pdf_file, pdf_page_index_list, image_folder
    )

    with open(image_instructions_file, "r", encoding="utf-8") as file:
        image_instructions = file.read()
    os.makedirs(gpt_folder, exist_ok=True)
    pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
    response_list = {}
    for page_index, data in pdf_image_info.items():
        logger.info(f"Processing image in page {page_index}")
        image_file = data.get("img_file", None)
        image_base64 = data.get("img_base64", None)
        response, error = chat(prompt=image_instructions, image_base64=image_base64)
        if error:
            logger.error(f"Error in processing image in page {page_index}")
            continue
        try:
            response_json = json.loads(response)
        except:
            response_json = json_repair.loads(response)
        response_json_file = os.path.join(
            gpt_folder, f"{pdf_base_name}_{page_index}.json"
        )
        with open(response_json_file, "w", encoding="utf-8") as file:
            json.dump(response_json, file, indent=4)
        logger.info(f"Response for image in page {page_index}: {response}")
    logger.info("Done")


if __name__ == "__main__":
    # Table extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/382366116.pdf"
    # pdf_page_index_list = [29, 35, 71, 77, 83, 89, 97, 103, 112, 121, 130, 140, 195, 250, 305]
    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
    # pdf_page_index_list = [29]
    # image_instructions_file = r"./instructions/table_extraction_image_prompts_v2.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/table/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )

    # Data extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
    # pdf_page_index_list = [29]
    pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    pdf_page_index_list = [13]
    image_output_folder = r"/data/emea_ar/small_pdf_image/"
    gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/"
    image_instructions_file = r"./instructions/data_extraction_image_prompts.txt"
    chat_with_image(
        pdf_file,
        pdf_page_index_list,
        image_instructions_file,
        image_output_folder,
        gpt_output_folder,
    )

    # Text extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # image_instructions_file = r"./instructions/text_extraction_image_prompts.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )

    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )

@ -1,277 +0,0 @@
|
||||||
from tqdm import tqdm
|
|
||||||
from glob import glob
|
|
||||||
import json
|
|
||||||
import pandas as pd
|
|
||||||
import os
|
|
||||||
from traceback import print_exc
|
|
||||||
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
|
|
||||||
|
|
||||||
from utils.logger import logger
|
|
||||||
|
|
||||||
|
|
||||||
def calculate_complex_document_metrics(verify_file_path: str, document_list: list = []):
|
|
||||||
data_df_1 = pd.read_excel(verify_file_path, sheet_name="data_in_doc_mapping")
|
|
||||||
# convert doc_id column to string
|
|
||||||
data_df_1["doc_id"] = data_df_1["doc_id"].astype(str)
|
|
||||||
data_df_1 = data_df_1[data_df_1["raw_check"].isin([0, 1])]
|
|
||||||
|
|
||||||
exclude_documents = ["532422548"]
|
|
||||||
# remove data by doc_id not in exclude_documents
|
|
||||||
data_df_1 = data_df_1[~data_df_1["doc_id"].isin(exclude_documents)]
|
|
||||||
|
|
||||||
if document_list is not None and len(document_list) > 0:
|
|
||||||
data_df_1 = data_df_1[data_df_1["doc_id"].isin(document_list)]
|
|
||||||
|
|
||||||
data_df_2 = pd.read_excel(verify_file_path, sheet_name="total_mapping_data")
|
|
||||||
data_df_2["doc_id"] = data_df_2["doc_id"].astype(str)
|
|
||||||
data_df_2 = data_df_2[data_df_2["raw_check"].isin([0, 1])]
|
|
||||||
|
|
||||||
data_df = pd.concat([data_df_1, data_df_2], ignore_index=True)
|
|
||||||
|
|
||||||
data_df.fillna("", inplace=True)
|
|
||||||
data_df.reset_index(drop=True, inplace=True)
|
|
||||||
|
|
||||||
metrics_df_list = []
|
|
||||||
doc_id_list = data_df["doc_id"].unique().tolist()
|
|
||||||
for doc_id in tqdm(doc_id_list):
|
|
||||||
try:
|
|
||||||
document_data_df = data_df[data_df["doc_id"] == doc_id]
|
|
||||||
document_metrics_df = calc_metrics(document_data_df, doc_id)
|
|
||||||
metrics_df_list.append(document_metrics_df)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error when calculating metrics for document {doc_id}")
|
|
||||||
print_exc()
|
|
||||||
|
|
||||||
total_metrics_df = calc_metrics(data_df, doc_id=None)
|
|
||||||
metrics_df_list.append(total_metrics_df)
|
|
||||||
|
|
||||||
all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
|
|
||||||
all_metrics_df.reset_index(drop=True, inplace=True)
|
|
||||||
|
|
||||||
output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
|
|
||||||
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
|
|
||||||
output_metrics_file = os.path.join(output_folder,
|
|
||||||
f"complex_{verify_file_name}_metrics_all.xlsx")
|
|
||||||
with pd.ExcelWriter(output_metrics_file) as writer:
|
|
||||||
all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")
|
|
||||||
|
|
||||||
|
|
||||||
def calc_metrics(data_df: pd.DataFrame, doc_id: str = None):
|
|
||||||
# tor data
|
|
||||||
tor_data_df = data_df[data_df["datapoint"] == "tor"]
|
|
||||||
if len(tor_data_df) > 0:
|
|
||||||
tor_metrics = get_sub_metrics(tor_data_df, "tor", doc_id)
|
|
||||||
logger.info(f"TOR metrics: {tor_metrics}")
|
|
||||||
else:
|
|
||||||
tor_metrics = None
|
|
||||||
|
|
||||||
# ter data
|
|
||||||
ter_data_df = data_df[data_df["datapoint"] == "ter"]
|
|
||||||
if len(ter_data_df) > 0:
|
|
||||||
ter_metrics = get_sub_metrics(ter_data_df, "ter", doc_id)
|
|
||||||
logger.info(f"TER metrics: {ter_metrics}")
|
|
||||||
else:
|
|
||||||
ter_metrics = None
|
|
||||||
|
|
||||||
# ogc data
|
|
||||||
ogc_data_df = data_df[data_df["datapoint"] == "ogc"]
|
|
||||||
if len(ogc_data_df) > 0:
|
|
||||||
ogc_metrics = get_sub_metrics(ogc_data_df, "ogc", doc_id)
|
|
||||||
logger.info(f"OGC metrics: {ogc_metrics}")
|
|
||||||
else:
|
|
||||||
ogc_metrics = None
|
|
||||||
|
|
||||||
# performance_fee data
|
|
||||||
performance_fee_data_df = data_df[data_df["datapoint"] == "performance_fee"]
|
|
||||||
if len(performance_fee_data_df) > 0:
|
|
||||||
performance_fee_metrics = get_sub_metrics(performance_fee_data_df, "performance_fee", doc_id)
|
|
||||||
logger.info(f"Performance fee metrics: {performance_fee_metrics}")
|
|
||||||
else:
|
|
||||||
performance_fee_metrics = None
|
|
||||||
|
|
||||||
metrics_candidates = [tor_metrics, ter_metrics, ogc_metrics, performance_fee_metrics]
|
|
||||||
metrics_list = [metrics for metrics in metrics_candidates if metrics is not None]
|
|
||||||
metrics_df = pd.DataFrame(metrics_list)
|
|
||||||
# add average metrics
|
|
||||||
if doc_id is not None and len(doc_id) > 0:
|
|
||||||
avg_metrics = {
|
|
||||||
"DocumentId": doc_id,
|
|
||||||
"DataPoint": "average",
|
|
||||||
"F1": metrics_df["F1"].mean(),
|
|
||||||
"Precision": metrics_df["Precision"].mean(),
|
|
||||||
"Recall": metrics_df["Recall"].mean(),
|
|
||||||
"Accuracy": metrics_df["Accuracy"].mean(),
|
|
||||||
"Support": metrics_df["Support"].sum()
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
avg_metrics = {
|
|
||||||
"DocumentId": "All",
|
|
||||||
"DataPoint": "average",
|
|
||||||
"F1": metrics_df["F1"].mean(),
|
|
||||||
"Precision": metrics_df["Precision"].mean(),
|
|
||||||
"Recall": metrics_df["Recall"].mean(),
|
|
||||||
"Accuracy": metrics_df["Accuracy"].mean(),
|
|
||||||
"Support": metrics_df["Support"].sum()
|
|
||||||
}
|
|
||||||
|
|
||||||
metrics_list.append(avg_metrics)
|
|
||||||
metrics_df = pd.DataFrame(metrics_list)
|
|
||||||
metrics_df.reset_index(drop=True, inplace=True)
|
|
||||||
return metrics_df
|
|
||||||
|
|
||||||
|
|
||||||
def get_sub_metrics(data_df: pd.DataFrame, data_point: str, doc_id: str = None) -> dict:
    data_df_raw_check_1 = data_df[data_df["raw_check"] == 1]
    gt_list = [1] * len(data_df_raw_check_1)
    pre_list = [1] * len(data_df_raw_check_1)

    data_df_raw_check_0 = data_df[data_df["raw_check"] == 0]
    for index, row in data_df_raw_check_0.iterrows():
        if row["raw_check_comment"] == "modify":
            gt_list.append(0)
            pre_list.append(1)

            gt_list.append(1)
            pre_list.append(0)
        elif row["raw_check_comment"] == "incorrect":
            gt_list.append(0)
            pre_list.append(1)
        elif row["raw_check_comment"] == "supplement":
            gt_list.append(1)
            pre_list.append(0)
        else:
            pass

    # calculate metrics
    accuracy = accuracy_score(gt_list, pre_list)
    precision = precision_score(gt_list, pre_list)
    recall = recall_score(gt_list, pre_list)
    f1 = f1_score(gt_list, pre_list)
    support = sum(gt_list)
    if doc_id is not None and len(doc_id) > 0:
        metrics = {
            "DocumentId": doc_id,
            "DataPoint": data_point,
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
            "Accuracy": accuracy,
            "Support": support
        }
    else:
        metrics = {
            "DocumentId": "All",
            "DataPoint": data_point,
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
            "Accuracy": accuracy,
            "Support": support
        }
    return metrics

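# Worked example (hypothetical rows, assuming the raw_check / raw_check_comment
# columns produced by the verification sheet): one verified-correct row plus one
# "modify" and one "supplement" correction.
#
#   sample_df = pd.DataFrame({
#       "raw_check": [1, 0, 0],
#       "raw_check_comment": ["", "modify", "supplement"],
#   })
#   get_sub_metrics(sample_df, "ogc", doc_id="123456789")  # doc_id is illustrative
#
# gt_list becomes [1, 0, 1, 1] and pre_list [1, 1, 0, 0]: a "modify" row counts as
# both a false positive and a false negative, and "supplement" as a false negative,
# giving Precision = 0.5, Recall = 1/3 and Support = 3.
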
def get_metrics_based_documents(metrics_file: str, document_list: list):
    metrics_df = pd.read_excel(metrics_file, sheet_name="metrics")
    metrics_df_list = []
    for doc_id in tqdm(document_list):
        try:
            document_metrics_df = metrics_df[metrics_df["DocumentId"] == doc_id]
            metrics_df_list.append(document_metrics_df)
        except Exception as e:
            logger.error(f"Error when calculating metrics for document {doc_id}")
            print_exc()
    metrics_document_df = pd.concat(metrics_df_list, ignore_index=True)

    stats_metrics_list = []
    tor_df = metrics_document_df[metrics_document_df["DataPoint"] == "tor"]
    if len(tor_df) > 0:
        tor_metrics = {
            "DocumentId": "All",
            "DataPoint": "tor",
            "F1": tor_df["F1"].mean(),
            "Precision": tor_df["Precision"].mean(),
            "Recall": tor_df["Recall"].mean(),
            "Accuracy": tor_df["Accuracy"].mean(),
            "Support": tor_df["Support"].sum()
        }
        stats_metrics_list.append(tor_metrics)
    ter_df = metrics_document_df[metrics_document_df["DataPoint"] == "ter"]
    if len(ter_df) > 0:
        ter_metrics = {
            "DocumentId": "All",
            "DataPoint": "ter",
            "F1": ter_df["F1"].mean(),
            "Precision": ter_df["Precision"].mean(),
            "Recall": ter_df["Recall"].mean(),
            "Accuracy": ter_df["Accuracy"].mean(),
            "Support": ter_df["Support"].sum()
        }
        stats_metrics_list.append(ter_metrics)
    ogc_df = metrics_document_df[metrics_document_df["DataPoint"] == "ogc"]
    if len(ogc_df) > 0:
        ogc_metrics = {
            "DocumentId": "All",
            "DataPoint": "ogc",
            "F1": ogc_df["F1"].mean(),
            "Precision": ogc_df["Precision"].mean(),
            "Recall": ogc_df["Recall"].mean(),
            "Accuracy": ogc_df["Accuracy"].mean(),
            "Support": ogc_df["Support"].sum()
        }
        stats_metrics_list.append(ogc_metrics)
    performance_fee_df = metrics_document_df[metrics_document_df["DataPoint"] == "performance_fee"]
    if len(performance_fee_df) > 0:
        performance_fee_metrics = {
            "DocumentId": "All",
            "DataPoint": "performance_fee",
            "F1": performance_fee_df["F1"].mean(),
            "Precision": performance_fee_df["Precision"].mean(),
            "Recall": performance_fee_df["Recall"].mean(),
            "Accuracy": performance_fee_df["Accuracy"].mean(),
            "Support": performance_fee_df["Support"].sum()
        }
        stats_metrics_list.append(performance_fee_metrics)
    average_df = metrics_document_df[metrics_document_df["DataPoint"] == "average"]
    if len(average_df) > 0:
        avg_metrics = {
            "DocumentId": "All",
            "DataPoint": "average",
            "F1": average_df["F1"].mean(),
            "Precision": average_df["Precision"].mean(),
            "Recall": average_df["Recall"].mean(),
            "Accuracy": average_df["Accuracy"].mean(),
            "Support": average_df["Support"].sum()
        }
        stats_metrics_list.append(avg_metrics)

    stats_metrics_df = pd.DataFrame(stats_metrics_list)
    metrics_df_list.append(stats_metrics_df)
    all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
    all_metrics_df.reset_index(drop=True, inplace=True)

    output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file_name = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_remain_7.xlsx"
    output_metrics_file = os.path.join(output_folder, verify_file_name)
    with pd.ExcelWriter(output_metrics_file) as writer:
        all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")

    return all_metrics_df

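# Input assumption (derived from the code above, not from separate documentation):
# the metrics workbook needs a "metrics" sheet with DocumentId, DataPoint, F1,
# Precision, Recall, Accuracy and Support columns, such as the file written by
# calculate_complex_document_metrics. For each DataPoint the per-document rows of
# the selected documents are averaged into an "All" row, and the combined result
# is written to the hard-coded output file above.
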
if __name__ == "__main__":
    file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx"
    verify_file_path = os.path.join(file_folder, verify_file)
    calculate_complex_document_metrics(verify_file_path=verify_file_path,
                                       document_list=None)
    document_list = ["492029971",
                     "510300817",
                     "512745032",
                     "514213638",
                     "527525440",
                     "534535767"]
    metrics_file = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_all.xlsx"
    metrics_file_path = os.path.join(file_folder, metrics_file)
    # get_metrics_based_documents(metrics_file=metrics_file_path,
    #                             document_list=document_list)

@ -1,70 +0,0 @@
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from utils.logger import logger
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction


def test_validate_extraction_data():
    document_id = "481482392"
    pdf_file = f"/data/emea_ar/pdf/481482392.pdf"
    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
    output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
    document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False)
    filter_pages = FilterPages(
        document_id, pdf_file, document_mapping_info_df
    )
    page_text_dict = filter_pages.page_text_dict
    datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
    datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)
    data_extraction = DataExtraction(
        doc_source="emea_ar",
        doc_id=document_id,
        pdf_file=pdf_file,
        output_data_folder=output_extract_data_child_folder,
        page_text_dict=page_text_dict,
        datapoint_page_info=datapoint_page_info,
        datapoints=datapoints,
        document_mapping_info_df=document_mapping_info_df,
        extract_way="text",
        output_image_folder=None
    )
    output_data_json_folder = os.path.join(
        r"/data/emea_ar/output/extract_data/docs/by_text/", "json/"
    )
    os.makedirs(output_data_json_folder, exist_ok=True)
    json_file = os.path.join(output_data_json_folder, f"{document_id}.json")
    data_from_gpt = None
    if os.path.exists(json_file):
        logger.info(
            f"The document: {document_id} has been parsed, loading data from {json_file}"
        )
        with open(json_file, "r", encoding="utf-8") as f:
            data_from_gpt = json.load(f)
    for extract_data in data_from_gpt:
        page_index = extract_data["page_index"]
        if page_index == 451:
            logger.info(f"Page index: {page_index}")
            raw_answer = extract_data["raw_answer"]
            raw_answer_json = json.loads(raw_answer)
            extract_data_info = data_extraction.validate_data(raw_answer_json)
            print(extract_data_info)


def get_datapoint_page_info(filter_pages) -> tuple:
    datapoint_page_info, result_details = filter_pages.start_job()
    return datapoint_page_info, result_details


def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list:
    datapoints = list(datapoint_page_info.keys())
    if "doc_id" in datapoints:
        datapoints.remove("doc_id")
    return datapoints


if __name__ == "__main__":
    test_validate_extraction_data()