update for deployment

parent fb4a6402f0
commit f10ff8ee33
.gitignore
@@ -7,3 +7,9 @@
/test_metrics
/data
/sample_documents/japan_prospectus.txt
/pdf_table_extraction.py
/playground.ipynb
/playground.py
/specific_calc_metrics.py
/test_specific_biz_logic.py
/drilldown_practice.py
@@ -22,7 +22,7 @@ swagger = Swagger(app, template=template)
@app.route('/automation/api/model/emea_ar', methods=['POST'])
@swag_from('yml/emea_ar.yml')
-def us_ar_data_extract():
+def emea_ar_data_extract():
    """
    Extract EMEA AR cost data from EMEA LUX PDF document
    input sample:
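A minimal request sketch for the renamed endpoint; the host/port and the sample payload are assumptions, and only doc_id is confirmed as required by the handler below (it returns 400 when doc_id is missing).

# Hypothetical client call; localhost:5000 and the sample doc_id are assumptions.
import requests

resp = requests.post(
    "http://localhost:5000/automation/api/model/emea_ar",
    json={"doc_id": "486378555"},  # a document id that appears elsewhere in this commit
)
print(resp.status_code, resp.json())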
@@ -40,6 +40,7 @@ def us_ar_data_extract():
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/emea_ar/pdf/"
    output_pdf_text_folder = r"./data/emea_ar/output/pdf_text/"
    output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
+   drilldown_folder = r"./data/emea_ar/output/drilldown/"
@@ -62,6 +63,7 @@ def us_ar_data_extract():
    emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
                                      doc_source="emea_ar",
                                      pdf_folder=pdf_folder,
                                      output_pdf_text_folder=output_pdf_text_folder,
                                      output_extract_data_folder=output_extract_data_folder,
                                      output_mapping_data_folder=output_mapping_data_folder,
                                      extract_way=extract_way,
core/data_extraction.py
@@ -4,6 +4,7 @@ import json_repair
import re
import fitz
import pandas as pd
from traceback import print_exc
from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider

@@ -294,6 +295,8 @@ class DataExtraction:
        for current_page_data in page_data_list:
            if current_page_data in next_page_data_list:
                next_page_data_list.remove(current_page_data)
+               if len(next_page_data_list) == 0:
+                   break
        next_page_extract_data["extract_data"][
            "data"
        ] = next_page_data_list
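A minimal sketch of the de-duplication behaviour added above, using toy page data; the field names and values are illustrative only.

# Rows already captured on the current page are dropped from the next page's
# candidate list, and the added early-exit stops the loop once that list is empty.
page_data_list = [{"fund": "A", "tor": "12%"}, {"fund": "B", "tor": "8%"}]
next_page_data_list = [{"fund": "A", "tor": "12%"}]

for current_page_data in page_data_list:
    if current_page_data in next_page_data_list:
        next_page_data_list.remove(current_page_data)
        if len(next_page_data_list) == 0:
            break

print(next_page_data_list)  # [] -> nothing left to keep for the next page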
drilldown_practice.py (file deleted)
@@ -1,159 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import recall_score

from utils.logger import logger
from utils.pdf_util import PDFUtil


def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str):
    extract_files = glob(extract_data_folder + '*.json')

    for index, json_file in enumerate(tqdm(extract_files)):
        try:
            # doc_id = file.split('/')[-1].split('.')[0]
            json_base_name = os.path.basename(json_file)
            doc_id = json_base_name.split('.')[0]
            logger.info(f"Processing {doc_id}")
            pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
            if not os.path.exists(pdf_file):
                logger.error(f"PDF file not found for {doc_id}")
                continue
            with open(json_file, "r", encoding="utf-8") as f:
                data_from_gpt = json.load(f)
            drilldown_pdf_document(doc_id=doc_id,
                                   pdf_file=pdf_file,
                                   drilldown_folder=drilldown_folder,
                                   data_from_gpt=data_from_gpt)

        except Exception as e:
            print_exc()
            logger.error(f"Error in processing {doc_id}: {e}")


def drilldown_pdf_document(doc_id: str,
                           pdf_file: str,
                           drilldown_folder: str,
                           data_from_gpt: list) -> list:
    logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
    pdf_util = PDFUtil(pdf_file)
    drilldown_data_list = []
    for data in data_from_gpt:
        doc_id = str(data.get("doc_id", ""))
        # if doc_id != "506326520":
        #     continue
        page_index = data.get("page_index", -1)
        if page_index == -1:
            continue
        extract_data_list = data.get("extract_data", {}).get("data", [])
        dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
        if len(dp_reported_name_dict.keys()) == 0:
            continue
        highlighted_value_list = []
        for extract_data in extract_data_list:
            for data_point, value in extract_data.items():
                if value in highlighted_value_list:
                    continue
                if data_point in ["ter", "ogc", "performance_fee"]:
                    continue
                drilldown_data = {
                    "doc_id": doc_id,
                    "page_index": page_index,
                    "data_point": data_point,
                    "parent_text_block": None,
                    "value": value,
                    "annotation_attribute": {}
                }
                drilldown_data_list.append(drilldown_data)
                highlighted_value_list.append(value)

        for data_point, reported_name in dp_reported_name_dict.items():
            if reported_name in highlighted_value_list:
                continue
            data_point = f"{data_point}_reported_name"
            drilldown_data = {
                "doc_id": doc_id,
                "page_index": page_index,
                "data_point": data_point,
                "parent_text_block": None,
                "value": reported_name,
                "annotation_attribute": {}
            }
            drilldown_data_list.append(drilldown_data)
            highlighted_value_list.append(reported_name)

    drilldown_result = []
    if len(drilldown_data_list) > 0:
        drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                    output_pdf_folder=drilldown_folder)
    if len(drilldown_result) > 0:
        logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
        annotation_list = drilldown_result.get("annotation_list", [])
        for annotation in annotation_list:
            annotation["doc_id"] = doc_id
        if drilldown_folder is not None and len(drilldown_folder) > 0:
            drilldown_data_folder = os.path.join(drilldown_folder, "data/")
            os.makedirs(drilldown_data_folder, exist_ok=True)
            drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")

            drilldown_source_df = pd.DataFrame(drilldown_data_list)
            annotation_list_df = pd.DataFrame(annotation_list)
            # set drilldown_result_df column order as doc_id, pdf_file, page_index,
            # data_point, value, matching_val_area, normalized_bbox
            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
                                                     "data_point", "value", "matching_val_area", "normalized_bbox"]]
            logger.info(f"Writing drilldown data to {drilldown_file}")
            with pd.ExcelWriter(drilldown_file) as writer:
                drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
                annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")


def calculate_metrics():
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    drilldown_data_folder = os.path.join(drilldown_folder, "data/")
    drilldown_files = glob(drilldown_data_folder + '*.xlsx')
    y_true_list = []
    y_pred_list = []
    series_list = []
    for drilldown_file in drilldown_files:
        drilldown_file_base_name = os.path.basename(drilldown_file)
        if drilldown_file_base_name.startswith("~"):
            continue
        drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
        for index, row in drilldown_data.iterrows():
            matching_val_area = row["matching_val_area"]
            # transform matching_val_area to list
            if isinstance(matching_val_area, str):
                matching_val_area = eval(matching_val_area)
            y_true_list.append(1)
            if len(matching_val_area) > 0:
                y_pred_list.append(1)
            else:
                y_pred_list.append(0)
            series_list.append(row)
    recall = recall_score(y_true_list, y_pred_list)
    logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
    no_annotation_df = pd.DataFrame(series_list)
    no_annotation_df.reset_index(drop=True, inplace=True)
    metrics_folder = os.path.join(drilldown_folder, "metrics/")
    os.makedirs(metrics_folder, exist_ok=True)
    metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
    metrics_result = {
        "recall": recall,
        "support": len(y_true_list)
    }
    metrics_df = pd.DataFrame([metrics_result])
    with pd.ExcelWriter(metrics_file) as writer:
        metrics_df.to_excel(writer, index=False, sheet_name="metrics")
        no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")


if __name__ == "__main__":
    pdf_folder = r"/data/emea_ar/pdf/"
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
    drilldown_documents(pdf_folder=pdf_folder,
                        extract_data_folder=extract_data_folder,
                        drilldown_folder=drilldown_folder)
    # calculate_metrics()
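A small worked sketch of the recall computed by calculate_metrics() above: every extracted value counts as a positive, and a prediction counts as found only when the drilldown produced a non-empty matching_val_area. The values below are illustrative only.

from sklearn.metrics import recall_score

# Toy matching_val_area values for four extracted data points.
matching_val_areas = [[(1, 2, 3, 4)], [], [(5, 6, 7, 8)], [(9, 10, 11, 12)]]
y_true = [1] * len(matching_val_areas)
y_pred = [1 if len(area) > 0 else 0 for area in matching_val_areas]
print(recall_score(y_true, y_pred))  # 0.75 -> 3 of 4 values were located in the PDF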
pdf_table_extraction.py (file deleted)
@@ -1,110 +0,0 @@
import pandas as pd
import os
import tqdm
import json_repair
import json
from glob import glob
import fitz
import re
import time
import traceback

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from utils.gpt_utils import chat


class PDFTableExtraction:
    """
    Iterate PDF pages
    Extract tables from PDF pages
    Save these tables as markdown files
    """
    def __init__(self,
                 pdf_file: str,
                 output_folder: str) -> None:
        self.pdf_file = pdf_file
        self.pdf_file_name = os.path.basename(pdf_file)
        self.table_extraction_prompts = self.get_table_extraction_prompts()

        self.output_folder = output_folder
        os.makedirs(output_folder, exist_ok=True)

        self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/')
        os.makedirs(self.prompts_output_folder, exist_ok=True)

        self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/')
        os.makedirs(self.json_output_folder, exist_ok=True)

        self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/')
        os.makedirs(self.table_md_output_folder, exist_ok=True)

    def get_table_extraction_prompts(self):
        instructions_file = r'./instructions/table_extraction_prompts.txt'
        with open(instructions_file, 'r', encoding='utf-8') as file:
            return file.read()

    def extract_tables(self):
        try:
            if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file):
                logger.error(f"Invalid pdf_file: {self.pdf_file}")
                return
            logger.info(f"Start processing {self.pdf_file}")
            pdf_util = PDFUtil(self.pdf_file)
            success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder)
            if success:
                logger.info(f"Successfully extracted text from {self.pdf_file}")

            for page_num, page_text in page_text_dict.items():
                try:
                    self.extract_tables_from_page(page_text, page_num)
                except Exception as e:
                    traceback.print_exc()
                    logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
        except Exception as e:
            logger.error(f"Error in extracting PDF tables: {str(e)}")

    def extract_tables_from_page(self, page_text: str, page_num: int):
        pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
        table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
        prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
        if os.path.exists(prompts_response_file):
            logger.info(f"Prompts response file already exists: {prompts_response_file}")
            return

        response, with_error = chat(table_extraction_prompts)
        if with_error:
            logger.error(f"Error in extracting tables from page")
            return

        json_response = re.search(r'```json([\s\S]*)```', response)
        if json_response is None:
            logger.info(f"Can't extract tables from page")
            return

        table_json_text = json_response.group(1)
        table_data = {"tables": []}
        try:
            table_data = json.loads(table_json_text)
        except:
            table_data = json_repair.loads(table_json_text)
        self.save_table_data(table_data, page_num)

        prompts_response = f'{table_extraction_prompts}\n\n{response}'
        with open(prompts_response_file, 'w', encoding='utf-8') as file:
            file.write(prompts_response)

    def save_table_data(self, table_data: dict, page_num: int):
        pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
        json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
        with open(json_output_file, 'w', encoding='utf-8') as file:
            file.write(json.dumps(table_data, indent=4))

        table_list = table_data.get('tables', [])
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
            table = re.sub(r'(\n)+', '\n', table)
            with open(table_md_file, 'w', encoding='utf-8') as file:
                file.write(table)
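A minimal usage sketch for the PDFTableExtraction class above; the PDF path and output folder are illustrative only, and ./instructions/table_extraction_prompts.txt must exist for get_table_extraction_prompts() to succeed.

# Hypothetical driver for the deleted helper; paths are assumptions.
extractor = PDFTableExtraction(
    pdf_file=r"./data/emea_ar/pdf/501380553.pdf",       # sample PDF also used in playground.ipynb
    output_folder=r"./data/emea_ar/output/pdf_table/",  # illustrative output folder
)
extractor.extract_tables()  # writes <name>_<page>.json and <name>_<page>_<table>.md per page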
playground.ipynb (713 lines, file deleted)
@@ -1,713 +0,0 @@
# In[27]:
from utils.biz_utils import add_slash_to_text_as_regex
import json
import re

# In[29]:
regex = r"Turnover \n"

# In[30]:
add_slash_to_text_as_regex(regex)
# Out[30]: 'Turnover\\s+\\n'

# In[46]:
text = "What was the share of investments made in transitional and enabling activities? \nTaxonomy-aligned\nactivities are expressed \nas a share of\n\u2022\t Turnover reflects the\n"

# In[32]:
re.search(regex, text)
# Out[32]: <re.Match object; span=(141, 151), match='Turnover \n'>

# In[35]:
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"

# In[47]:
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
    datapoint_keywords_config = json.load(file)

tor_regex_list = datapoint_keywords_config.get("tor", {}).get("english", [])

for tor_regex in tor_regex_list:
    regex = add_slash_to_text_as_regex(tor_regex)
    search = re.search(regex, text)
    if search:
        print(f"{regex} match {search.group()}")
    else:
        print(f"{regex} no match")
# stdout:
# TOR no match
# Turnover\*\s+\n no match
# Turnover\s+\n no match
# Turnover\s+Ratio no match
# Turnover\s+Rate no match
# Portfolio\s+Turnover no match
# Portfolio\s+turnover\s+ratio no match
# Portfolio\s+turnover\s+rate no match
# PTR no match
# Annual\s+Portfolio\s+Turnover\s+Ratio no match

# In[8]:
from utils.sql_query_util import query_investment_by_provider, query_document_fund_mapping
import pandas as pd

# In[3]:
provider_mapping = query_investment_by_provider(company_id="0C00008QVP")

# In[6]:
provider_mapping[provider_mapping["FundName"].str.contains("T. Rowe Price Funds Series II SICAV")]
# Out[6]: rows 840-848, ProviderId 0C00008QVP,
# ProviderName "T. Rowe Price (Luxembourg) Management S.à r.l.",
# FundName/ShareClassName truncated in the notebook display:
#      FundId      ISIN          SecId       CurrencyId  ShareClassStatus  FundName
# 840  FS0000DUH4  LU1053597990  F000010MEE  USD         0                 T. Rowe Price Funds Series II SICAV - Credit O...
# 841  FS0000DUH4  LU1053597727  F000010MEF  USD         0                 T. Rowe Price Funds Series II SICAV - Credit O...
# 842  FS0000DUH5  LU0993574440  F000010MEG  USD         1                 T. Rowe Price Funds Series II SICAV - Floating...
# 843  FS0000DUH5  LU1805616171  F000010PUN  CHF         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 844  FS0000DUH5  LU1076358073  F000010MEH  EUR         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 845  FS0000DUH5  LU2046740358  F0000143Y8  USD         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 846  FS0000DUH5  LU2046740432  F0000143Y9  USD         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 847  FS0000DUH5  LU0993569101  F00001564H  USD         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 848  FS0000DUH5  LU2122516821  F000014UPK  AUD         0                 T. Rowe Price Funds Series II SICAV - Floating...

# In[9]:
document_mapping = query_document_fund_mapping(doc_id="486378555")

# In[10]:
document_mapping
# Out[10]: rows 0-5, DocumentId 486378555, EffectiveDate 2022-06-30, DocumentType 4,
# Format PDF, Language 0L00000122, DocumentStatus 1, ProviderId 0C00008QVP,
# ProviderName "T. Rowe Price (Luxembourg) Management S.à r.l.", FundId FS0000DUH5,
# Domicile LUX, FundName/ShareClassName "T. Rowe Price Funds Series II SICAV - Floating..." (truncated):
#    SecId       CurrencyId  ISIN
# 0  F000010MEG  USD         LU0993574440
# 1  F000010PUN  CHF         LU1805616171
# 2  F000010MEH  EUR         LU1076358073
# 3  F0000143Y8  USD         LU2046740358
# 4  F0000143Y9  USD         LU2046740432
# 5  F000014UPK  AUD         LU2122516821

# In[11]:
list(document_mapping["ShareClassName"].unique())
# Out[11]:
# ['T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund I Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (CHF) Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (EUR) Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Q (USD) Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Qd (USD) Dis',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Sdn (AUD) Dis']

# In[1]:
import pymupdf4llm

# In[2]:
md_text = pymupdf4llm.to_markdown(r"./data/emea_ar/pdf/501380553.pdf")

# now work with the markdown text, e.g. store as a UTF8-encoded file
import pathlib
pathlib.Path(r"./data/emea_ar/output/markdown/501380553.md").write_bytes(md_text.encode())
# stdout: Processing ./data/emea_ar/pdf/501380553.pdf... (progress bar over 47 pages)
# Out[2]: 107851

# In[3]:
def get_fund_name(fund_name: str, fund_feature: str):
    if not fund_name.endswith(fund_feature):
        return fund_name
    fund_feature = fund_feature + " "
    fund_name_split = fund_name.split(fund_feature)
    if len(fund_name_split) > 1:
        last_fund = fund_name_split[-1].strip()
        if len(last_fund) == 0:
            last_fund = fund_name_split[-2].strip()
        fund_name = f"{last_fund} {fund_feature}"
    return fund_name

# In[2]:
get_fund_name("A Fund B Fund C Fund", "Fund")
# Out[2]: 'C Fund'

# In[5]:
fund_name = "JPMorgan Investment Fund - Global Income Conservative Fund"

# In[6]:
get_fund_name(fund_name, "Fund")
# Out[6]: '- Global Income Conservative Fund Fund '

# In[ ]:
# (empty cell)

# kernelspec: torch2_real (Python 3.10.11); nbformat 4.2
playground.py (137 lines, file deleted)
@@ -1,137 +0,0 @@
import os
import json
import base64
import json_repair
from utils.pdf_util import PDFUtil
from utils.logger import logger
from utils.gpt_utils import chat


def get_base64_pdf_image_list(
    pdf_file: str, pdf_page_index_list: list, output_folder: str = None
) -> dict:
    if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None
    pdf_util = PDFUtil(pdf_file)
    if pdf_page_index_list is None or len(pdf_page_index_list) == 0:
        pdf_page_index_list = list(range(pdf_util.get_page_count()))
    if output_folder is not None and len(output_folder) > 0:
        os.makedirs(output_folder, exist_ok=True)
    pdf_image_info = pdf_util.extract_images(
        pdf_page_index_list=pdf_page_index_list, output_folder=output_folder
    )
    return pdf_image_info


def encode_image(image_path: str):
    if image_path is None or len(image_path) == 0 or not os.path.exists(image_path):
        return None
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def chat_with_image(
    pdf_file: str,
    pdf_page_index_list: list,
    image_instructions_file: str,
    image_folder: str,
    gpt_folder: str,
):
    if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None
    pdf_image_info = get_base64_pdf_image_list(
        pdf_file, pdf_page_index_list, image_folder
    )

    with open(image_instructions_file, "r", encoding="utf-8") as file:
        image_instructions = file.read()
    os.makedirs(gpt_folder, exist_ok=True)
    pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
    response_list = {}
    for page_index, data in pdf_image_info.items():
        logger.info(f"Processing image in page {page_index}")
        image_file = data.get("img_file", None)
        image_base64 = data.get("img_base64", None)
        response, error = chat(prompt=image_instructions, image_base64=image_base64)
        if error:
            logger.error(f"Error in processing image in page {page_index}")
            continue
        try:
            response_json = json.loads(response)
        except:
            response_json = json_repair.loads(response)
        response_json_file = os.path.join(
            gpt_folder, f"{pdf_base_name}_{page_index}.json"
        )
        with open(response_json_file, "w", encoding="utf-8") as file:
            json.dump(response_json, file, indent=4)
        logger.info(f"Response for image in page {page_index}: {response}")
    logger.info("Done")


if __name__ == "__main__":
    # Table extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/382366116.pdf"
    # pdf_page_index_list = [29, 35, 71, 77, 83, 89, 97, 103, 112, 121, 130, 140, 195, 250, 305]
    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
    # pdf_page_index_list = [29]
    # image_instructions_file = r"./instructions/table_extraction_image_prompts_v2.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/table/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )

    # Data extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
    # pdf_page_index_list = [29]
    pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    pdf_page_index_list = [13]
    image_output_folder = r"/data/emea_ar/small_pdf_image/"
    gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/"
    image_instructions_file = r"./instructions/data_extraction_image_prompts.txt"
    chat_with_image(
        pdf_file,
        pdf_page_index_list,
        image_instructions_file,
        image_output_folder,
        gpt_output_folder,
    )

    # Text extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # image_instructions_file = r"./instructions/text_extraction_image_prompts.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )

    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )
specific_calc_metrics.py (file deleted)
@@ -1,277 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from utils.logger import logger


def calculate_complex_document_metrics(verify_file_path: str, document_list: list = []):
    data_df_1 = pd.read_excel(verify_file_path, sheet_name="data_in_doc_mapping")
    # convert doc_id column to string
    data_df_1["doc_id"] = data_df_1["doc_id"].astype(str)
    data_df_1 = data_df_1[data_df_1["raw_check"].isin([0, 1])]

    exclude_documents = ["532422548"]
    # drop rows whose doc_id is in exclude_documents
    data_df_1 = data_df_1[~data_df_1["doc_id"].isin(exclude_documents)]

    if document_list is not None and len(document_list) > 0:
        data_df_1 = data_df_1[data_df_1["doc_id"].isin(document_list)]

    data_df_2 = pd.read_excel(verify_file_path, sheet_name="total_mapping_data")
    data_df_2["doc_id"] = data_df_2["doc_id"].astype(str)
    data_df_2 = data_df_2[data_df_2["raw_check"].isin([0, 1])]

    data_df = pd.concat([data_df_1, data_df_2], ignore_index=True)

    data_df.fillna("", inplace=True)
    data_df.reset_index(drop=True, inplace=True)

    metrics_df_list = []
    doc_id_list = data_df["doc_id"].unique().tolist()
    for doc_id in tqdm(doc_id_list):
        try:
            document_data_df = data_df[data_df["doc_id"] == doc_id]
            document_metrics_df = calc_metrics(document_data_df, doc_id)
            metrics_df_list.append(document_metrics_df)
        except Exception as e:
            logger.error(f"Error when calculating metrics for document {doc_id}")
            print_exc()

    total_metrics_df = calc_metrics(data_df, doc_id=None)
    metrics_df_list.append(total_metrics_df)

    all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
    all_metrics_df.reset_index(drop=True, inplace=True)

    output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
    output_metrics_file = os.path.join(output_folder,
                                       f"complex_{verify_file_name}_metrics_all.xlsx")
    with pd.ExcelWriter(output_metrics_file) as writer:
        all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")


def calc_metrics(data_df: pd.DataFrame, doc_id: str = None):
    # tor data
    tor_data_df = data_df[data_df["datapoint"] == "tor"]
    if len(tor_data_df) > 0:
        tor_metrics = get_sub_metrics(tor_data_df, "tor", doc_id)
        logger.info(f"TOR metrics: {tor_metrics}")
    else:
        tor_metrics = None

    # ter data
    ter_data_df = data_df[data_df["datapoint"] == "ter"]
    if len(ter_data_df) > 0:
        ter_metrics = get_sub_metrics(ter_data_df, "ter", doc_id)
        logger.info(f"TER metrics: {ter_metrics}")
    else:
        ter_metrics = None

    # ogc data
    ogc_data_df = data_df[data_df["datapoint"] == "ogc"]
    if len(ogc_data_df) > 0:
        ogc_metrics = get_sub_metrics(ogc_data_df, "ogc", doc_id)
        logger.info(f"OGC metrics: {ogc_metrics}")
    else:
        ogc_metrics = None

    # performance_fee data
    performance_fee_data_df = data_df[data_df["datapoint"] == "performance_fee"]
    if len(performance_fee_data_df) > 0:
        performance_fee_metrics = get_sub_metrics(performance_fee_data_df, "performance_fee", doc_id)
        logger.info(f"Performance fee metrics: {performance_fee_metrics}")
    else:
        performance_fee_metrics = None

    metrics_candidates = [tor_metrics, ter_metrics, ogc_metrics, performance_fee_metrics]
    metrics_list = [metrics for metrics in metrics_candidates if metrics is not None]
    metrics_df = pd.DataFrame(metrics_list)
    # add average metrics
    if doc_id is not None and len(doc_id) > 0:
        avg_metrics = {
            "DocumentId": doc_id,
            "DataPoint": "average",
            "F1": metrics_df["F1"].mean(),
            "Precision": metrics_df["Precision"].mean(),
            "Recall": metrics_df["Recall"].mean(),
            "Accuracy": metrics_df["Accuracy"].mean(),
            "Support": metrics_df["Support"].sum()
        }
    else:
        avg_metrics = {
            "DocumentId": "All",
            "DataPoint": "average",
            "F1": metrics_df["F1"].mean(),
            "Precision": metrics_df["Precision"].mean(),
            "Recall": metrics_df["Recall"].mean(),
            "Accuracy": metrics_df["Accuracy"].mean(),
            "Support": metrics_df["Support"].sum()
        }

    metrics_list.append(avg_metrics)
    metrics_df = pd.DataFrame(metrics_list)
    metrics_df.reset_index(drop=True, inplace=True)
    return metrics_df


def get_sub_metrics(data_df: pd.DataFrame, data_point: str, doc_id: str = None) -> dict:
    data_df_raw_check_1 = data_df[data_df["raw_check"] == 1]
    gt_list = [1] * len(data_df_raw_check_1)
    pre_list = [1] * len(data_df_raw_check_1)

    data_df_raw_check_0 = data_df[data_df["raw_check"] == 0]
    for index, row in data_df_raw_check_0.iterrows():
        if row["raw_check_comment"] == "modify":
            gt_list.append(0)
            pre_list.append(1)

            gt_list.append(1)
            pre_list.append(0)
        elif row["raw_check_comment"] == "incorrect":
            gt_list.append(0)
            pre_list.append(1)
        elif row["raw_check_comment"] == "supplement":
            gt_list.append(1)
            pre_list.append(0)
        else:
            pass

    # calculate metrics
    accuracy = accuracy_score(gt_list, pre_list)
    precision = precision_score(gt_list, pre_list)
    recall = recall_score(gt_list, pre_list)
    f1 = f1_score(gt_list, pre_list)
    support = sum(gt_list)
    if doc_id is not None and len(doc_id) > 0:
        metrics = {
            "DocumentId": doc_id,
            "DataPoint": data_point,
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
            "Accuracy": accuracy,
            "Support": support
        }
    else:
        metrics = {
            "DocumentId": "All",
            "DataPoint": data_point,
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
            "Accuracy": accuracy,
            "Support": support
        }
    return metrics


def get_metrics_based_documents(metrics_file: str, document_list: list):
    metrics_df = pd.read_excel(metrics_file, sheet_name="metrics")
    metrics_df_list = []
    for doc_id in tqdm(document_list):
        try:
            document_metrics_df = metrics_df[metrics_df["DocumentId"] == doc_id]
            metrics_df_list.append(document_metrics_df)
        except Exception as e:
            logger.error(f"Error when calculating metrics for document {doc_id}")
            print_exc()
    metrics_document_df = pd.concat(metrics_df_list, ignore_index=True)

    stats_metrics_list = []
    tor_df = metrics_document_df[metrics_document_df["DataPoint"] == "tor"]
    if len(tor_df) > 0:
        tor_metrics = {
            "DocumentId": "All",
            "DataPoint": "tor",
            "F1": tor_df["F1"].mean(),
            "Precision": tor_df["Precision"].mean(),
            "Recall": tor_df["Recall"].mean(),
            "Accuracy": tor_df["Accuracy"].mean(),
            "Support": tor_df["Support"].sum()
        }
        stats_metrics_list.append(tor_metrics)
    ter_df = metrics_document_df[metrics_document_df["DataPoint"] == "ter"]
    if len(ter_df) > 0:
        ter_metrics = {
            "DocumentId": "All",
            "DataPoint": "ter",
            "F1": ter_df["F1"].mean(),
            "Precision": ter_df["Precision"].mean(),
            "Recall": ter_df["Recall"].mean(),
            "Accuracy": ter_df["Accuracy"].mean(),
            "Support": ter_df["Support"].sum()
        }
        stats_metrics_list.append(ter_metrics)
    ogc_df = metrics_document_df[metrics_document_df["DataPoint"] == "ogc"]
    if len(ogc_df) > 0:
        ogc_metrics = {
            "DocumentId": "All",
            "DataPoint": "ogc",
            "F1": ogc_df["F1"].mean(),
            "Precision": ogc_df["Precision"].mean(),
            "Recall": ogc_df["Recall"].mean(),
            "Accuracy": ogc_df["Accuracy"].mean(),
            "Support": ogc_df["Support"].sum()
        }
        stats_metrics_list.append(ogc_metrics)
    performance_fee_df = metrics_document_df[metrics_document_df["DataPoint"] == "performance_fee"]
    if len(performance_fee_df) > 0:
        performance_fee_metrics = {
            "DocumentId": "All",
            "DataPoint": "performance_fee",
            "F1": performance_fee_df["F1"].mean(),
            "Precision": performance_fee_df["Precision"].mean(),
            "Recall": performance_fee_df["Recall"].mean(),
            "Accuracy": performance_fee_df["Accuracy"].mean(),
            "Support": performance_fee_df["Support"].sum()
        }
        stats_metrics_list.append(performance_fee_metrics)
    average_df = metrics_document_df[metrics_document_df["DataPoint"] == "average"]
    if len(average_df) > 0:
        avg_metrics = {
            "DocumentId": "All",
            "DataPoint": "average",
            "F1": average_df["F1"].mean(),
            "Precision": average_df["Precision"].mean(),
            "Recall": average_df["Recall"].mean(),
            "Accuracy": average_df["Accuracy"].mean(),
            "Support": average_df["Support"].sum()
        }
        stats_metrics_list.append(avg_metrics)

    stats_metrics_df = pd.DataFrame(stats_metrics_list)
    metrics_df_list.append(stats_metrics_df)
    all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
    all_metrics_df.reset_index(drop=True, inplace=True)

    output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file_name = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_remain_7.xlsx"
    output_metrics_file = os.path.join(output_folder, verify_file_name)
    with pd.ExcelWriter(output_metrics_file) as writer:
        all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")

    return all_metrics_df


if __name__ == "__main__":
    file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx"
    verify_file_path = os.path.join(file_folder, verify_file)
    calculate_complex_document_metrics(verify_file_path=verify_file_path,
                                       document_list=None)
    document_list = ["492029971",
                     "510300817",
                     "512745032",
                     "514213638",
                     "527525440",
                     "534535767"]
    metrics_file = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_all.xlsx"
    metrics_file_path = os.path.join(file_folder, metrics_file)
    # get_metrics_based_documents(metrics_file=metrics_file_path,
    #                             document_list=document_list)
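A toy illustration of how get_sub_metrics() above turns reviewer labels into ground-truth/prediction pairs; the label sequence below is invented for the example.

# raw_check == 1  -> (1, 1)  correct extraction
# "modify"        -> (0, 1) and (1, 0)  wrong value plus the missed true value
# "incorrect"     -> (0, 1)  false positive
# "supplement"    -> (1, 0)  missed value
from sklearn.metrics import precision_score, recall_score

gt_list  = [1, 1, 0, 1, 0, 1]   # two correct rows, one "modify" pair, one "incorrect", one "supplement"
pre_list = [1, 1, 1, 0, 1, 0]
print(precision_score(gt_list, pre_list), recall_score(gt_list, pre_list))  # 0.5 0.5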
test_specific_biz_logic.py (file deleted)
@@ -1,70 +0,0 @@
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from utils.logger import logger
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction


def test_validate_extraction_data():
    document_id = "481482392"
    pdf_file = f"/data/emea_ar/pdf/481482392.pdf"
    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
    output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
    document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False)
    filter_pages = FilterPages(
        document_id, pdf_file, document_mapping_info_df
    )
    page_text_dict = filter_pages.page_text_dict
    datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
    datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)
    data_extraction = DataExtraction(
        doc_source="emea_ar",
        doc_id=document_id,
        pdf_file=pdf_file,
        output_data_folder=output_extract_data_child_folder,
        page_text_dict=page_text_dict,
        datapoint_page_info=datapoint_page_info,
        datapoints=datapoints,
        document_mapping_info_df=document_mapping_info_df,
        extract_way="text",
        output_image_folder=None
    )
    output_data_json_folder = os.path.join(
        r"/data/emea_ar/output/extract_data/docs/by_text/", "json/"
    )
    os.makedirs(output_data_json_folder, exist_ok=True)
    json_file = os.path.join(output_data_json_folder, f"{document_id}.json")
    data_from_gpt = None
    if os.path.exists(json_file):
        logger.info(
            f"The document: {document_id} has been parsed, loading data from {json_file}"
        )
        with open(json_file, "r", encoding="utf-8") as f:
            data_from_gpt = json.load(f)
    for extract_data in data_from_gpt:
        page_index = extract_data["page_index"]
        if page_index == 451:
            logger.info(f"Page index: {page_index}")
            raw_answer = extract_data["raw_answer"]
            raw_answer_json = json.loads(raw_answer)
            extract_data_info = data_extraction.validate_data(raw_answer_json)
            print(extract_data_info)


def get_datapoint_page_info(filter_pages) -> tuple:
    datapoint_page_info, result_details = filter_pages.start_job()
    return datapoint_page_info, result_details


def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list:
    datapoints = list(datapoint_page_info.keys())
    if "doc_id" in datapoints:
        datapoints.remove("doc_id")
    return datapoints


if __name__ == "__main__":
    test_validate_extraction_data()