update for deployment

Blade He 2025-01-16 20:34:43 -06:00
parent fb4a6402f0
commit f10ff8ee33
9 changed files with 12 additions and 1467 deletions

.gitignore
View File

@@ -7,3 +7,9 @@
 /test_metrics
 /data
 /sample_documents/japan_prospectus.txt
+/pdf_table_extraction.py
+/playground.ipynb
+/playground.py
+/specific_calc_metrics.py
+/test_specific_biz_logic.py
+/drilldown_practice.py

View File

@@ -22,7 +22,7 @@ swagger = Swagger(app, template=template)
 @app.route('/automation/api/model/emea_ar', methods=['POST'])
 @swag_from('yml/emea_ar.yml')
-def us_ar_data_extract():
+def emea_ar_data_extract():
     """
     Extract EMEA AR cost data from EMEA LUX PDF document
     input sample:
@@ -40,6 +40,7 @@ def us_ar_data_extract():
         return jsonify({"error": "doc_id is required"}), 400
     pdf_folder = r"./data/emea_ar/pdf/"
+    output_pdf_text_folder = r"./data/emea_ar/output/pdf_text/"
     output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
     output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
     drilldown_folder = r"./data/emea_ar/output/drilldown/"
@@ -62,6 +63,7 @@ def us_ar_data_extract():
     emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
                                       doc_source="emea_ar",
                                       pdf_folder=pdf_folder,
+                                      output_pdf_text_folder=output_pdf_text_folder,
                                       output_extract_data_folder=output_extract_data_folder,
                                       output_mapping_data_folder=output_mapping_data_folder,
                                       extract_way=extract_way,
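For reference, a minimal way to exercise the renamed endpoint. The host and port are assumptions (a local Flask default); only the route, the POST method, and the required doc_id field come from the diff above, and doc_id 486378555 is just an example id used elsewhere in this repository.

import requests

# Hypothetical smoke test for the emea_ar route.
resp = requests.post(
    "http://localhost:5000/automation/api/model/emea_ar",
    json={"doc_id": "486378555"},
    timeout=600,
)
print(resp.status_code)
print(resp.json())  # a missing doc_id is answered with {"error": "doc_id is required"} and HTTP 400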

View File

@@ -4,6 +4,7 @@ import json_repair
 import re
 import fitz
 import pandas as pd
+from traceback import print_exc
 from utils.gpt_utils import chat
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
@@ -294,6 +295,8 @@ class DataExtraction:
         for current_page_data in page_data_list:
             if current_page_data in next_page_data_list:
                 next_page_data_list.remove(current_page_data)
+            if len(next_page_data_list) == 0:
+                break
         next_page_extract_data["extract_data"][
             "data"
         ] = next_page_data_list
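The two added lines short-circuit the cross-page de-duplication: once every record from the next page has already been seen on the current page, there is nothing left to scan. A standalone sketch of that pattern (the helper name is illustrative; it assumes the check sits inside the loop as rendered here):

def dedupe_next_page(page_data_list: list, next_page_data_list: list) -> list:
    """Drop next-page records that already appear on the current page, stopping early."""
    next_page_data_list = list(next_page_data_list)  # copy, so the sketch has no side effects
    for current_page_data in page_data_list:
        if current_page_data in next_page_data_list:
            next_page_data_list.remove(current_page_data)
        if len(next_page_data_list) == 0:
            break  # the added early exit: the next page has nothing new left
    return next_page_data_list

# e.g. dedupe_next_page([{"isin": "LU1"}, {"isin": "LU2"}], [{"isin": "LU2"}]) -> []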

View File

@@ -1,159 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import recall_score
from utils.logger import logger
from utils.pdf_util import PDFUtil
def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str):
extract_files = glob(extract_data_folder + '*.json')
for index, json_file in enumerate(tqdm(extract_files)):
try:
# doc_id = file.split('/')[-1].split('.')[0]
json_base_name = os.path.basename(json_file)
doc_id = json_base_name.split('.')[0]
logger.info(f"Processing {doc_id}")
pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
if not os.path.exists(pdf_file):
logger.error(f"PDF file not found for {doc_id}")
continue
with open(json_file, "r", encoding="utf-8") as f:
data_from_gpt = json.load(f)
drilldown_pdf_document(doc_id=doc_id,
pdf_file=pdf_file,
drilldown_folder=drilldown_folder,
data_from_gpt=data_from_gpt)
except Exception as e:
print_exc()
logger.error(f"Error in processing {doc_id}: {e}")
def drilldown_pdf_document(doc_id:str,
pdf_file: str,
drilldown_folder: str,
data_from_gpt: list) -> list:
logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
pdf_util = PDFUtil(pdf_file)
drilldown_data_list = []
for data in data_from_gpt:
doc_id = str(data.get("doc_id", ""))
# if doc_id != "506326520":
# continue
page_index = data.get("page_index", -1)
if page_index == -1:
continue
extract_data_list = data.get("extract_data", {}).get("data", [])
dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
if len(dp_reported_name_dict.keys()) == 0:
continue
highlighted_value_list = []
for extract_data in extract_data_list:
for data_point, value in extract_data.items():
if value in highlighted_value_list:
continue
if data_point in ["ter", "ogc", "performance_fee"]:
continue
drilldown_data = {
"doc_id": doc_id,
"page_index": page_index,
"data_point": data_point,
"parent_text_block": None,
"value": value,
"annotation_attribute": {}
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(value)
for data_point, reported_name in dp_reported_name_dict.items():
if reported_name in highlighted_value_list:
continue
data_point = f"{data_point}_reported_name"
drilldown_data = {
"doc_id": doc_id,
"page_index": page_index,
"data_point": data_point,
"parent_text_block": None,
"value": reported_name,
"annotation_attribute": {}
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(reported_name)
drilldown_result = []
if len(drilldown_data_list) > 0:
drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
output_pdf_folder=drilldown_folder)
if len(drilldown_result) > 0:
logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
annotation_list = drilldown_result.get("annotation_list", [])
for annotation in annotation_list:
annotation["doc_id"] = doc_id
if drilldown_folder is not None and len(drilldown_folder) > 0:
drilldown_data_folder = os.path.join(drilldown_folder, "data/")
os.makedirs(drilldown_data_folder, exist_ok=True)
drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")
drilldown_source_df = pd.DataFrame(drilldown_data_list)
annotation_list_df = pd.DataFrame(annotation_list)
# set drilldown_result_df column order as doc_id, pdf_file, page_index,
# data_point, value, matching_val_area, normalized_bbox
annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
"data_point", "value", "matching_val_area", "normalized_bbox"]]
logger.info(f"Writing drilldown data to {drilldown_file}")
with pd.ExcelWriter(drilldown_file) as writer:
drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")
def calculate_metrics():
drilldown_folder = r"/data/emea_ar/output/drilldown/"
drilldown_data_folder = os.path.join(drilldown_folder, "data/")
drilldown_files = glob(drilldown_data_folder + '*.xlsx')
y_true_list = []
y_pred_list = []
series_list = []
for drilldown_file in drilldown_files:
drilldown_file_base_name = os.path.basename(drilldown_file)
if drilldown_file_base_name.startswith("~"):
continue
drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
for index, row in drilldown_data.iterrows():
matching_val_area = row["matching_val_area"]
# transform matching_val_area to list
if isinstance(matching_val_area, str):
matching_val_area = eval(matching_val_area)
y_true_list.append(1)
if len(matching_val_area) > 0:
y_pred_list.append(1)
else:
y_pred_list.append(0)
series_list.append(row)
recall = recall_score(y_true_list, y_pred_list)
logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
no_annotation_df = pd.DataFrame(series_list)
no_annotation_df.reset_index(drop=True, inplace=True)
metrics_folder = os.path.join(drilldown_folder, "metrics/")
os.makedirs(metrics_folder, exist_ok=True)
metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
metrics_result = {
"recall": recall,
"support": len(y_true_list)
}
metrics_df = pd.DataFrame([metrics_result])
with pd.ExcelWriter(metrics_file) as writer:
metrics_df.to_excel(writer, index=False, sheet_name="metrics")
no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")
if __name__ == "__main__":
pdf_folder = r"/data/emea_ar/pdf/"
drilldown_folder = r"/data/emea_ar/output/drilldown/"
extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
drilldown_documents(pdf_folder=pdf_folder, extract_data_folder=extract_data_folder, drilldown_folder=drilldown_folder)
# calculate_metrics()
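Since calculate_metrics labels every extracted value as a positive (y_true is all ones) and predicts a positive only when matching_val_area is non-empty, the reported recall is simply the share of extracted values that the drilldown located on the page. A small worked example with toy numbers, not taken from any document:

from sklearn.metrics import recall_score

# Five extracted values; four received at least one matching area in the PDF.
y_true = [1, 1, 1, 1, 1]
y_pred = [1, 1, 0, 1, 1]
print(recall_score(y_true, y_pred))  # 0.8 -> 4 of the 5 values were found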

View File

@@ -1,110 +0,0 @@
import pandas as pd
import os
import tqdm
import json_repair
import json
from glob import glob
import fitz
import re
import time
import traceback
from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from utils.gpt_utils import chat
class PDFTableExtraction:
"""
Iterate PDF pages
Extract tables from PDF pages
Save these tables as markdown files
"""
def __init__(self,
pdf_file: str,
output_folder: str) -> None:
self.pdf_file = pdf_file
self.pdf_file_name = os.path.basename(pdf_file)
self.table_extraction_prompts = self.get_table_extraction_prompts()
self.output_folder = output_folder
os.makedirs(output_folder, exist_ok=True)
self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/')
os.makedirs(self.prompts_output_folder, exist_ok=True)
self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/')
os.makedirs(self.json_output_folder, exist_ok=True)
self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/')
os.makedirs(self.table_md_output_folder, exist_ok=True)
def get_table_extraction_prompts(self):
instructions_file = r'./instructions/table_extraction_prompts.txt'
with open(instructions_file, 'r', encoding='utf-8') as file:
return file.read()
def extract_tables(self):
try:
if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file):
logger.error(f"Invalid pdf_file: {self.pdf_file}")
return
logger.info(f"Start processing {self.pdf_file}")
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder)
if success:
logger.info(f"Successfully extracted text from {self.pdf_file}")
for page_num, page_text in page_text_dict.items():
try:
self.extract_tables_from_page(page_text, page_num)
except Exception as e:
traceback.print_exc()
logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
except Exception as e:
logger.error(f"Error in extracting PDF tables: {str(e)}")
def extract_tables_from_page(self, page_text: str, page_num: int):
pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
if os.path.exists(prompts_response_file):
logger.info(f"Prompts response file already exists: {prompts_response_file}")
return
response, with_error = chat(table_extraction_prompts)
if with_error:
logger.error(f"Error in extracting tables from page")
return
json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response)
if json_response is None:
logger.info(f"Can't extract tables from page")
return
table_json_text = json_response.group(1)
table_data = {"tables": []}
try:
table_data = json.loads(table_json_text)
except:
table_data = json_repair.loads(table_json_text)
self.save_table_data(table_data, page_num)
prompts_response = f'{table_extraction_prompts}\n\n{response}'
with open(prompts_response_file, 'w', encoding='utf-8') as file:
file.write(prompts_response)
def save_table_data(self, table_data: dict, page_num: int):
pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
with open(json_output_file, 'w', encoding='utf-8') as file:
file.write(json.dumps(table_data, indent=4))
table_list = table_data.get('tables', [])
for table_num, table in enumerate(table_list):
table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
table = re.sub(r'(\n)+', '\n', table)
with open(table_md_file, 'w', encoding='utf-8') as file:
file.write(table)
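A minimal driver for the class above; it assumes the prompt template at ./instructions/table_extraction_prompts.txt and a configured chat helper are available, and the paths are placeholders rather than values from this commit (the PDF name is an example doc id used elsewhere in the repository).

# Hypothetical usage sketch for PDFTableExtraction.
extractor = PDFTableExtraction(
    pdf_file=r"./data/emea_ar/pdf/501380553.pdf",       # example input document
    output_folder=r"./data/emea_ar/output/pdf_table/",  # placeholder output location
)
# Writes per-page prompt/response text, table JSON, and one markdown file per table
# under pdf_table_prompts/, pdf_table_json/ and pdf_table_markdown/.
extractor.extract_tables()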

View File

@@ -1,713 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils.biz_utils import add_slash_to_text_as_regex\n",
"import json\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"regex = r\"Turnover \\n\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Turnover\\\\s+\\\\n'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_slash_to_text_as_regex(regex)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover reflects the\\n\""
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<re.Match object; span=(141, 151), match='Turnover \\n'>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re.search(regex, text)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TOR no match\n",
"Turnover\\*\\s+\\n no match\n",
"Turnover\\s+\\n no match\n",
"Turnover\\s+Ratio no match\n",
"Turnover\\s+Rate no match\n",
"Portfolio\\s+Turnover no match\n",
"Portfolio\\s+turnover\\s+ratio no match\n",
"Portfolio\\s+turnover\\s+rate no match\n",
"PTR no match\n",
"Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
]
}
],
"source": [
"with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
" datapoint_keywords_config = json.load(file)\n",
"\n",
"tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
"\n",
"for tor_regex in tor_regex_list:\n",
" regex = add_slash_to_text_as_regex(tor_regex)\n",
" search = re.search(regex, text)\n",
" if search:\n",
" print(f\"{regex} match {search.group()}\")\n",
" else:\n",
" print(f\"{regex} no match\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from utils.sql_query_util import query_investment_by_provider, query_document_fund_mapping\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"provider_mapping = query_investment_by_provider(company_id=\"0C00008QVP\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ProviderId</th>\n",
" <th>ProviderName</th>\n",
" <th>FundId</th>\n",
" <th>FundName</th>\n",
" <th>ISIN</th>\n",
" <th>SecId</th>\n",
" <th>CurrencyId</th>\n",
" <th>ShareClassName</th>\n",
" <th>ShareClassStatus</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>840</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH4</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>LU1053597990</td>\n",
" <td>F000010MEE</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>841</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH4</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>LU1053597727</td>\n",
" <td>F000010MEF</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>842</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU0993574440</td>\n",
" <td>F000010MEG</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>843</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1805616171</td>\n",
" <td>F000010PUN</td>\n",
" <td>CHF</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>844</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1076358073</td>\n",
" <td>F000010MEH</td>\n",
" <td>EUR</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>845</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740358</td>\n",
" <td>F0000143Y8</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>846</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740432</td>\n",
" <td>F0000143Y9</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>847</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU0993569101</td>\n",
" <td>F00001564H</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>848</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2122516821</td>\n",
" <td>F000014UPK</td>\n",
" <td>AUD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ProviderId ProviderName FundId \\\n",
"840 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n",
"841 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n",
"842 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"843 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"844 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"845 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"846 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"847 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"848 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"\n",
" FundName ISIN \\\n",
"840 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597990 \n",
"841 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597727 \n",
"842 T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n",
"843 T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n",
"844 T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n",
"845 T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n",
"846 T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n",
"847 T. Rowe Price Funds Series II SICAV - Floating... LU0993569101 \n",
"848 T. Rowe Price Funds Series II SICAV - Floating... LU2122516821 \n",
"\n",
" SecId CurrencyId ShareClassName \\\n",
"840 F000010MEE USD T. Rowe Price Funds Series II SICAV - Credit O... \n",
"841 F000010MEF USD T. Rowe Price Funds Series II SICAV - Credit O... \n",
"842 F000010MEG USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"843 F000010PUN CHF T. Rowe Price Funds Series II SICAV - Floating... \n",
"844 F000010MEH EUR T. Rowe Price Funds Series II SICAV - Floating... \n",
"845 F0000143Y8 USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"846 F0000143Y9 USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"847 F00001564H USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"848 F000014UPK AUD T. Rowe Price Funds Series II SICAV - Floating... \n",
"\n",
" ShareClassStatus \n",
"840 0 \n",
"841 0 \n",
"842 1 \n",
"843 0 \n",
"844 0 \n",
"845 0 \n",
"846 0 \n",
"847 0 \n",
"848 0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"provider_mapping[provider_mapping[\"FundName\"].str.contains(\"T. Rowe Price Funds Series II SICAV\")]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"document_mapping = query_document_fund_mapping(doc_id=\"486378555\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DocumentId</th>\n",
" <th>EffectiveDate</th>\n",
" <th>DocumentType</th>\n",
" <th>Format</th>\n",
" <th>Language</th>\n",
" <th>DocumentStatus</th>\n",
" <th>ProviderId</th>\n",
" <th>ProviderName</th>\n",
" <th>FundId</th>\n",
" <th>FundName</th>\n",
" <th>Domicile</th>\n",
" <th>SecId</th>\n",
" <th>CurrencyId</th>\n",
" <th>ShareClassName</th>\n",
" <th>ISIN</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000010MEG</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU0993574440</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000010PUN</td>\n",
" <td>CHF</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1805616171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000010MEH</td>\n",
" <td>EUR</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1076358073</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F0000143Y8</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740358</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F0000143Y9</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740432</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000014UPK</td>\n",
" <td>AUD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2122516821</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" DocumentId EffectiveDate DocumentType Format Language DocumentStatus \\\n",
"0 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"1 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"2 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"3 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"4 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"5 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"\n",
" ProviderId ProviderName FundId \\\n",
"0 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"1 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"2 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"3 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"4 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"5 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"\n",
" FundName Domicile SecId \\\n",
"0 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEG \n",
"1 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010PUN \n",
"2 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEH \n",
"3 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y8 \n",
"4 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y9 \n",
"5 T. Rowe Price Funds Series II SICAV - Floating... LUX F000014UPK \n",
"\n",
" CurrencyId ShareClassName ISIN \n",
"0 USD T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n",
"1 CHF T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n",
"2 EUR T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n",
"3 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n",
"4 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n",
"5 AUD T. Rowe Price Funds Series II SICAV - Floating... LU2122516821 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"document_mapping"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund I Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (CHF) Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (EUR) Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Q (USD) Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Qd (USD) Dis',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Sdn (AUD) Dis']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(document_mapping[\"ShareClassName\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pymupdf4llm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing ./data/emea_ar/pdf/501380553.pdf...\n",
"[ ] (0/47[ ] ( 1/47[= ] ( 2/4[== ] ( 3/47[=== ] ( 4/4[==== ] ( 5/47[===== ] ( 6/47[===== ] ( 7/4[====== ] ( 8/47[======= ] ( 9/4[======== ] (10/47[========= ] (11/4[========== ] (12/47[=========== ] (13/47[=========== ] (14/4[============ ] (15/47[============= ] (16/4[============== ] (17/47[=============== ] (18/4[================ ] (19/47[================= ] (20/47[================= ] (21/4[================== ] (22/47[=================== ] (23/4[==================== ] (24/47[===================== ] (25/4[====================== ] (26/4[====================== ] (27/47[======================= ] (28/4[======================== ] (29/47[========================= ] (30/4[========================== ] (31/47[=========================== ] (32/4[============================ ] (33/4[============================ ] (34/47[============================= ] (35/4[============================== ] (36/47[=============================== ] (37/4[================================ ] (38/47[================================= ] (39/4[================================== ] (40/4[================================== ] (41/47[=================================== ] (42/4[==================================== ] (43/47[===================================== ] (44/4[====================================== ] (45/47[======================================= ] (46/47[========================================] (47/47]\n"
]
},
{
"data": {
"text/plain": [
"107851"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"md_text = pymupdf4llm.to_markdown(r\"./data/emea_ar/pdf/501380553.pdf\")\n",
"\n",
"# now work with the markdown text, e.g. store as a UTF8-encoded file\n",
"import pathlib\n",
"pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_fund_name(fund_name: str, fund_feature: str):\n",
" if not fund_name.endswith(fund_feature):\n",
" return fund_name\n",
" fund_feature = fund_feature + \" \"\n",
" fund_name_split = fund_name.split(fund_feature)\n",
" if len(fund_name_split) > 1:\n",
" last_fund = fund_name_split[-1].strip()\n",
" if len(last_fund) == 0:\n",
" last_fund = fund_name_split[-2].strip()\n",
" fund_name = f\"{last_fund} {fund_feature}\"\n",
" return fund_name"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'C Fund'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"fund_name = \"JPMorgan Investment Fund - Global Income Conservative Fund\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'- Global Income Conservative Fund Fund '"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_fund_name(fund_name, \"Fund\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "torch2_real",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
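For context on the first notebook cells: add_slash_to_text_as_regex (imported from utils.biz_utils, which is not part of this commit) turns a literal keyword into a whitespace-tolerant pattern; the cell output shows 'Turnover \n' becoming 'Turnover\s+\n'. A rough, hypothetical stand-in that reproduces only that observed behaviour:

import re

def add_slash_to_text_as_regex_sketch(text: str) -> str:
    # Assumption: runs of literal spaces become \s+; everything else is left untouched.
    # The real helper in utils.biz_utils may do more (e.g. escape '*', per the config patterns).
    return re.sub(r" +", r"\\s+", text)

print(add_slash_to_text_as_regex_sketch(r"Turnover \n"))  # Turnover\s+\n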

View File

@@ -1,137 +0,0 @@
import os
import json
import base64
import json_repair
from utils.pdf_util import PDFUtil
from utils.logger import logger
from utils.gpt_utils import chat
def get_base64_pdf_image_list(
pdf_file: str, pdf_page_index_list: list, output_folder: str = None
) -> dict:
if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
logger.error("pdf_file is not provided")
return None
pdf_util = PDFUtil(pdf_file)
if pdf_page_index_list is None or len(pdf_page_index_list) == 0:
pdf_page_index_list = list(range(pdf_util.get_page_count()))
if output_folder is not None and len(output_folder) > 0:
os.makedirs(output_folder, exist_ok=True)
pdf_image_info = pdf_util.extract_images(
pdf_page_index_list=pdf_page_index_list, output_folder=output_folder
)
return pdf_image_info
def encode_image(image_path: str):
if image_path is None or len(image_path) == 0 or not os.path.exists(image_path):
return None
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def chat_with_image(
pdf_file: str,
pdf_page_index_list: list,
image_instructions_file: str,
image_folder: str,
gpt_folder: str,
):
if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
logger.error("pdf_file is not provided")
return None
pdf_image_info = get_base64_pdf_image_list(
pdf_file, pdf_page_index_list, image_folder
)
with open(image_instructions_file, "r", encoding="utf-8") as file:
image_instructions = file.read()
os.makedirs(gpt_folder, exist_ok=True)
pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
response_list = {}
for page_index, data in pdf_image_info.items():
logger.info(f"Processing image in page {page_index}")
image_file = data.get("img_file", None)
image_base64 = data.get("img_base64", None)
response, error = chat(prompt=image_instructions, image_base64=image_base64)
if error:
logger.error(f"Error in processing image in page {page_index}")
continue
try:
response_json = json.loads(response)
except:
response_json = json_repair.loads(response)
response_json_file = os.path.join(
gpt_folder, f"{pdf_base_name}_{page_index}.json"
)
with open(response_json_file, "w", encoding="utf-8") as file:
json.dump(response_json, file, indent=4)
logger.info(f"Response for image in page {page_index}: {response}")
logger.info("Done")
if __name__ == "__main__":
# Table extraction by image
# pdf_file = r"/data/emea_ar/small_pdf/382366116.pdf"
# pdf_page_index_list = [29, 35, 71, 77, 83, 89, 97, 103, 112, 121, 130, 140, 195, 250, 305]
# pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
# pdf_page_index_list = [13]
# pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
# pdf_page_index_list = [29]
# image_instructions_file = r"./instructions/table_extraction_image_prompts_v2.txt"
# image_output_folder = r"/data/emea_ar/small_pdf_image/"
# gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/table/"
# chat_with_image(
# pdf_file,
# pdf_page_index_list,
# image_instructions_file,
# image_output_folder,
# gpt_output_folder,
# )
# Data extraction by image
# pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
# pdf_page_index_list = [29]
pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
pdf_page_index_list = [13]
image_output_folder = r"/data/emea_ar/small_pdf_image/"
gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/"
image_instructions_file = r"./instructions/data_extraction_image_prompts.txt"
chat_with_image(
pdf_file,
pdf_page_index_list,
image_instructions_file,
image_output_folder,
gpt_output_folder,
)
# Text extraction by image
# pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
# pdf_page_index_list = [13]
# image_instructions_file = r"./instructions/text_extraction_image_prompts.txt"
# image_output_folder = r"/data/emea_ar/small_pdf_image/"
# gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/"
# chat_with_image(
# pdf_file,
# pdf_page_index_list,
# image_instructions_file,
# image_output_folder,
# gpt_output_folder,
# )
# pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
# pdf_page_index_list = [13]
# image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt"
# image_output_folder = r"/data/emea_ar/small_pdf_image/"
# gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/"
# chat_with_image(
# pdf_file,
# pdf_page_index_list,
# image_instructions_file,
# image_output_folder,
# gpt_output_folder,
# )
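chat_with_image only relies on PDFUtil.extract_images returning a mapping of page index to image metadata with img_file and img_base64 keys. An illustrative shape of that mapping (the values are placeholders, not real output; the file naming is assumed):

# Hypothetical pdf_image_info as consumed by chat_with_image above.
pdf_image_info = {
    13: {
        "img_file": "/data/emea_ar/small_pdf_image/389171486_13.png",  # assumed file naming
        "img_base64": "iVBORw0KGgo...",                                # truncated base64 payload
    },
}
for page_index, data in pdf_image_info.items():
    image_base64 = data.get("img_base64", None)
    # each page image is then sent to the model together with the prompt text:
    # response, error = chat(prompt=image_instructions, image_base64=image_base64)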

View File

@@ -1,277 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from utils.logger import logger
def calculate_complex_document_metrics(verify_file_path: str, document_list: list = []):
data_df_1 = pd.read_excel(verify_file_path, sheet_name="data_in_doc_mapping")
# convert doc_id column to string
data_df_1["doc_id"] = data_df_1["doc_id"].astype(str)
data_df_1 = data_df_1[data_df_1["raw_check"].isin([0, 1])]
exclude_documents = ["532422548"]
# remove data by doc_id not in exclude_documents
data_df_1 = data_df_1[~data_df_1["doc_id"].isin(exclude_documents)]
if document_list is not None and len(document_list) > 0:
data_df_1 = data_df_1[data_df_1["doc_id"].isin(document_list)]
data_df_2 = pd.read_excel(verify_file_path, sheet_name="total_mapping_data")
data_df_2["doc_id"] = data_df_2["doc_id"].astype(str)
data_df_2 = data_df_2[data_df_2["raw_check"].isin([0, 1])]
data_df = pd.concat([data_df_1, data_df_2], ignore_index=True)
data_df.fillna("", inplace=True)
data_df.reset_index(drop=True, inplace=True)
metrics_df_list = []
doc_id_list = data_df["doc_id"].unique().tolist()
for doc_id in tqdm(doc_id_list):
try:
document_data_df = data_df[data_df["doc_id"] == doc_id]
document_metrics_df = calc_metrics(document_data_df, doc_id)
metrics_df_list.append(document_metrics_df)
except Exception as e:
logger.error(f"Error when calculating metrics for document {doc_id}")
print_exc()
total_metrics_df = calc_metrics(data_df, doc_id=None)
metrics_df_list.append(total_metrics_df)
all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
all_metrics_df.reset_index(drop=True, inplace=True)
output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
output_metrics_file = os.path.join(output_folder,
f"complex_{verify_file_name}_metrics_all.xlsx")
with pd.ExcelWriter(output_metrics_file) as writer:
all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")
def calc_metrics(data_df: pd.DataFrame, doc_id: str = None):
# tor data
tor_data_df = data_df[data_df["datapoint"] == "tor"]
if len(tor_data_df) > 0:
tor_metrics = get_sub_metrics(tor_data_df, "tor", doc_id)
logger.info(f"TOR metrics: {tor_metrics}")
else:
tor_metrics = None
# ter data
ter_data_df = data_df[data_df["datapoint"] == "ter"]
if len(ter_data_df) > 0:
ter_metrics = get_sub_metrics(ter_data_df, "ter", doc_id)
logger.info(f"TER metrics: {ter_metrics}")
else:
ter_metrics = None
# ogc data
ogc_data_df = data_df[data_df["datapoint"] == "ogc"]
if len(ogc_data_df) > 0:
ogc_metrics = get_sub_metrics(ogc_data_df, "ogc", doc_id)
logger.info(f"OGC metrics: {ogc_metrics}")
else:
ogc_metrics = None
# performance_fee data
performance_fee_data_df = data_df[data_df["datapoint"] == "performance_fee"]
if len(performance_fee_data_df) > 0:
performance_fee_metrics = get_sub_metrics(performance_fee_data_df, "performance_fee", doc_id)
logger.info(f"Performance fee metrics: {performance_fee_metrics}")
else:
performance_fee_metrics = None
metrics_candidates = [tor_metrics, ter_metrics, ogc_metrics, performance_fee_metrics]
metrics_list = [metrics for metrics in metrics_candidates if metrics is not None]
metrics_df = pd.DataFrame(metrics_list)
# add average metrics
if doc_id is not None and len(doc_id) > 0:
avg_metrics = {
"DocumentId": doc_id,
"DataPoint": "average",
"F1": metrics_df["F1"].mean(),
"Precision": metrics_df["Precision"].mean(),
"Recall": metrics_df["Recall"].mean(),
"Accuracy": metrics_df["Accuracy"].mean(),
"Support": metrics_df["Support"].sum()
}
else:
avg_metrics = {
"DocumentId": "All",
"DataPoint": "average",
"F1": metrics_df["F1"].mean(),
"Precision": metrics_df["Precision"].mean(),
"Recall": metrics_df["Recall"].mean(),
"Accuracy": metrics_df["Accuracy"].mean(),
"Support": metrics_df["Support"].sum()
}
metrics_list.append(avg_metrics)
metrics_df = pd.DataFrame(metrics_list)
metrics_df.reset_index(drop=True, inplace=True)
return metrics_df
def get_sub_metrics(data_df: pd.DataFrame, data_point: str, doc_id: str = None) -> dict:
data_df_raw_check_1 = data_df[data_df["raw_check"] == 1]
gt_list = [1] * len(data_df_raw_check_1)
pre_list = [1] * len(data_df_raw_check_1)
data_df_raw_check_0 = data_df[data_df["raw_check"] == 0]
for index, row in data_df_raw_check_0.iterrows():
if row["raw_check_comment"] == "modify":
gt_list.append(0)
pre_list.append(1)
gt_list.append(1)
pre_list.append(0)
elif row["raw_check_comment"] == "incorrect":
gt_list.append(0)
pre_list.append(1)
elif row["raw_check_comment"] == "supplement":
gt_list.append(1)
pre_list.append(0)
else:
pass
# calculate metrics
accuracy = accuracy_score(gt_list, pre_list)
precision = precision_score(gt_list, pre_list)
recall = recall_score(gt_list, pre_list)
f1 = f1_score(gt_list, pre_list)
support = sum(gt_list)
if doc_id is not None and len(doc_id) > 0:
metrics = {
"DocumentId": doc_id,
"DataPoint": data_point,
"F1": f1,
"Precision": precision,
"Recall": recall,
"Accuracy": accuracy,
"Support": support
}
else:
metrics = {
"DocumentId": "All",
"DataPoint": data_point,
"F1": f1,
"Precision": precision,
"Recall": recall,
"Accuracy": accuracy,
"Support": support
}
return metrics
def get_metrics_based_documents(metrics_file: str, document_list: list):
metrics_df = pd.read_excel(metrics_file, sheet_name="metrics")
metrics_df_list = []
for doc_id in tqdm(document_list):
try:
document_metrics_df = metrics_df[metrics_df["DocumentId"] == doc_id]
metrics_df_list.append(document_metrics_df)
except Exception as e:
logger.error(f"Error when calculating metrics for document {doc_id}")
print_exc()
metrics_document_df = pd.concat(metrics_df_list, ignore_index=True)
stats_metrics_list = []
tor_df = metrics_document_df[metrics_document_df["DataPoint"] == "tor"]
if len(tor_df) > 0:
tor_metrics = {
"DocumentId": "All",
"DataPoint": "tor",
"F1": tor_df["F1"].mean(),
"Precision": tor_df["Precision"].mean(),
"Recall": tor_df["Recall"].mean(),
"Accuracy": tor_df["Accuracy"].mean(),
"Support": tor_df["Support"].sum()
}
stats_metrics_list.append(tor_metrics)
ter_df = metrics_document_df[metrics_document_df["DataPoint"] == "ter"]
if len(ter_df) > 0:
ter_metrics = {
"DocumentId": "All",
"DataPoint": "ter",
"F1": ter_df["F1"].mean(),
"Precision": ter_df["Precision"].mean(),
"Recall": ter_df["Recall"].mean(),
"Accuracy": ter_df["Accuracy"].mean(),
"Support": ter_df["Support"].sum()
}
stats_metrics_list.append(ter_metrics)
ogc_df = metrics_document_df[metrics_document_df["DataPoint"] == "ogc"]
if len(ogc_df) > 0:
ogc_metrics = {
"DocumentId": "All",
"DataPoint": "ogc",
"F1": ogc_df["F1"].mean(),
"Precision": ogc_df["Precision"].mean(),
"Recall": ogc_df["Recall"].mean(),
"Accuracy": ogc_df["Accuracy"].mean(),
"Support": ogc_df["Support"].sum()
}
stats_metrics_list.append(ogc_metrics)
performance_fee_df = metrics_document_df[metrics_document_df["DataPoint"] == "performance_fee"]
if len(performance_fee_df) > 0:
performance_fee_metrics = {
"DocumentId": "All",
"DataPoint": "performance_fee",
"F1": performance_fee_df["F1"].mean(),
"Precision": performance_fee_df["Precision"].mean(),
"Recall": performance_fee_df["Recall"].mean(),
"Accuracy": performance_fee_df["Accuracy"].mean(),
"Support": performance_fee_df["Support"].sum()
}
stats_metrics_list.append(performance_fee_metrics)
average_df = metrics_document_df[metrics_document_df["DataPoint"] == "average"]
if len(average_df) > 0:
avg_metrics = {
"DocumentId": "All",
"DataPoint": "average",
"F1": average_df["F1"].mean(),
"Precision": average_df["Precision"].mean(),
"Recall": average_df["Recall"].mean(),
"Accuracy": average_df["Accuracy"].mean(),
"Support": average_df["Support"].sum()
}
stats_metrics_list.append(avg_metrics)
stats_metrics_df = pd.DataFrame(stats_metrics_list)
metrics_df_list.append(stats_metrics_df)
all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
all_metrics_df.reset_index(drop=True, inplace=True)
output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
verify_file_name = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_remain_7.xlsx"
output_metrics_file = os.path.join(output_folder, verify_file_name)
with pd.ExcelWriter(output_metrics_file) as writer:
all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")
return all_metrics_df
if __name__ == "__main__":
file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx"
verify_file_path = os.path.join(file_folder, verify_file)
calculate_complex_document_metrics(verify_file_path=verify_file_path,
document_list=None)
document_list = ["492029971",
"510300817",
"512745032",
"514213638",
"527525440",
"534535767"]
metrics_file = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_all.xlsx"
metrics_file_path = os.path.join(file_folder, metrics_file)
# get_metrics_based_documents(metrics_file=metrics_file_path,
# document_list=document_list)
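To make the review-label bookkeeping in get_sub_metrics concrete: a verified row (raw_check == 1) counts as a true positive, an "incorrect" comment as a false positive, a "supplement" comment as a false negative, and a "modify" comment as one of each. A toy example with made-up counts:

from sklearn.metrics import f1_score, precision_score, recall_score

# 8 verified rows, plus one "modify", one "incorrect" and one "supplement" comment.
gt_list  = [1] * 8 + [0, 1] + [0] + [1]
pre_list = [1] * 8 + [1, 0] + [1] + [0]
print(precision_score(gt_list, pre_list))  # 8 / 10 = 0.8
print(recall_score(gt_list, pre_list))     # 8 / 10 = 0.8
print(f1_score(gt_list, pre_list))         # 0.8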

View File

@@ -1,70 +0,0 @@
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from utils.logger import logger
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction
def test_validate_extraction_data():
document_id = "481482392"
pdf_file = f"/data/emea_ar/pdf/481482392.pdf"
output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False)
filter_pages = FilterPages(
document_id, pdf_file, document_mapping_info_df
)
page_text_dict = filter_pages.page_text_dict
datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)
data_extraction = DataExtraction(
doc_source="emea_ar",
doc_id=document_id,
pdf_file=pdf_file,
output_data_folder=output_extract_data_child_folder,
page_text_dict=page_text_dict,
datapoint_page_info=datapoint_page_info,
datapoints=datapoints,
document_mapping_info_df=document_mapping_info_df,
extract_way="text",
output_image_folder=None
)
output_data_json_folder = os.path.join(
r"/data/emea_ar/output/extract_data/docs/by_text/", "json/"
)
os.makedirs(output_data_json_folder, exist_ok=True)
json_file = os.path.join(output_data_json_folder, f"{document_id}.json")
data_from_gpt = None
if os.path.exists(json_file):
logger.info(
f"The document: {document_id} has been parsed, loading data from {json_file}"
)
with open(json_file, "r", encoding="utf-8") as f:
data_from_gpt = json.load(f)
for extract_data in data_from_gpt:
page_index = extract_data["page_index"]
if page_index == 451:
logger.info(f"Page index: {page_index}")
raw_answer = extract_data["raw_answer"]
raw_answer_json = json.loads(raw_answer)
extract_data_info = data_extraction.validate_data(raw_answer_json)
print(extract_data_info)
def get_datapoint_page_info(filter_pages) -> tuple:
datapoint_page_info, result_details = filter_pages.start_job()
return datapoint_page_info, result_details
def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list:
datapoints = list(datapoint_page_info.keys())
if "doc_id" in datapoints:
datapoints.remove("doc_id")
return datapoints
if __name__ == "__main__":
test_validate_extraction_data()
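For readers of the test above: get_datapoints_from_datapoint_page_info simply strips the doc_id key, so every other key of datapoint_page_info is treated as a data-point name. An illustrative shape, with invented page lists (only the key names and the doc id appear in the code above):

# Hypothetical datapoint_page_info as returned by FilterPages.start_job().
datapoint_page_info = {
    "doc_id": "481482392",
    "tor": [12, 13],
    "ter": [451],
    "ogc": [451],
    "performance_fee": [],
}
datapoints = [key for key in datapoint_page_info if key != "doc_id"]
print(datapoints)  # ['tor', 'ter', 'ogc', 'performance_fee']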