diff --git a/.gitignore b/.gitignore index 38bc785..adad70e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,9 @@ /test_metrics /data /sample_documents/japan_prospectus.txt +/pdf_table_extraction.py +/playground.ipynb +/playground.py +/specific_calc_metrics.py +/test_specific_biz_logic.py +/drilldown_practice.py diff --git a/app_emea_ar.py b/app_emea_ar.py index 133561c..5a04592 100644 --- a/app_emea_ar.py +++ b/app_emea_ar.py @@ -22,7 +22,7 @@ swagger = Swagger(app, template=template) @app.route('/automation/api/model/emea_ar', methods=['POST']) @swag_from('yml/emea_ar.yml') -def us_ar_data_extract(): +def emea_ar_data_extract(): """ Extract EMEA AR cost data from EMEA LUX PDF document input sample: @@ -40,6 +40,7 @@ def us_ar_data_extract(): return jsonify({"error": "doc_id is required"}), 400 pdf_folder = r"./data/emea_ar/pdf/" + output_pdf_text_folder = r"./data/emea_ar/output/pdf_text/" output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/" output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/" drilldown_folder = r"./data/emea_ar/output/drilldown/" @@ -62,6 +63,7 @@ def us_ar_data_extract(): emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id, doc_source="emea_ar", pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, output_extract_data_folder=output_extract_data_folder, output_mapping_data_folder=output_mapping_data_folder, extract_way=extract_way, diff --git a/core/data_extraction.py b/core/data_extraction.py index 51b9cf1..9fe3b46 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -4,6 +4,7 @@ import json_repair import re import fitz import pandas as pd +from traceback import print_exc from utils.gpt_utils import chat from utils.pdf_util import PDFUtil from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider @@ -294,6 +295,8 @@ class DataExtraction: for current_page_data in page_data_list: if current_page_data in next_page_data_list: next_page_data_list.remove(current_page_data) + if len(next_page_data_list) == 0: + break next_page_extract_data["extract_data"][ "data" ] = next_page_data_list diff --git a/drilldown_practice.py b/drilldown_practice.py deleted file mode 100644 index 48742a6..0000000 --- a/drilldown_practice.py +++ /dev/null @@ -1,159 +0,0 @@ -from tqdm import tqdm -from glob import glob -import json -import pandas as pd -import os -from traceback import print_exc -from sklearn.metrics import recall_score - -from utils.logger import logger -from utils.pdf_util import PDFUtil - - -def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str): - extract_files = glob(extract_data_folder + '*.json') - - for index, json_file in enumerate(tqdm(extract_files)): - try: - # doc_id = file.split('/')[-1].split('.')[0] - json_base_name = os.path.basename(json_file) - doc_id = json_base_name.split('.')[0] - logger.info(f"Processing {doc_id}") - pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf") - if not os.path.exists(pdf_file): - logger.error(f"PDF file not found for {doc_id}") - continue - with open(json_file, "r", encoding="utf-8") as f: - data_from_gpt = json.load(f) - drilldown_pdf_document(doc_id=doc_id, - pdf_file=pdf_file, - drilldown_folder=drilldown_folder, - data_from_gpt=data_from_gpt) - - except Exception as e: - print_exc() - logger.error(f"Error in processing {doc_id}: {e}") - - -def drilldown_pdf_document(doc_id:str, - pdf_file: str, - drilldown_folder: str, - data_from_gpt: list) -> list: - logger.info(f"Drilldown PDF document for 
doc_id: {doc_id}") - pdf_util = PDFUtil(pdf_file) - drilldown_data_list = [] - for data in data_from_gpt: - doc_id = str(data.get("doc_id", "")) - # if doc_id != "506326520": - # continue - page_index = data.get("page_index", -1) - if page_index == -1: - continue - extract_data_list = data.get("extract_data", {}).get("data", []) - dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {}) - if len(dp_reported_name_dict.keys()) == 0: - continue - highlighted_value_list = [] - for extract_data in extract_data_list: - for data_point, value in extract_data.items(): - if value in highlighted_value_list: - continue - if data_point in ["ter", "ogc", "performance_fee"]: - continue - drilldown_data = { - "doc_id": doc_id, - "page_index": page_index, - "data_point": data_point, - "parent_text_block": None, - "value": value, - "annotation_attribute": {} - } - drilldown_data_list.append(drilldown_data) - highlighted_value_list.append(value) - - for data_point, reported_name in dp_reported_name_dict.items(): - if reported_name in highlighted_value_list: - continue - data_point = f"{data_point}_reported_name" - drilldown_data = { - "doc_id": doc_id, - "page_index": page_index, - "data_point": data_point, - "parent_text_block": None, - "value": reported_name, - "annotation_attribute": {} - } - drilldown_data_list.append(drilldown_data) - highlighted_value_list.append(reported_name) - drilldown_result = [] - if len(drilldown_data_list) > 0: - drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list, - output_pdf_folder=drilldown_folder) - if len(drilldown_result) > 0: - logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully") - annotation_list = drilldown_result.get("annotation_list", []) - for annotation in annotation_list: - annotation["doc_id"] = doc_id - if drilldown_folder is not None and len(drilldown_folder) > 0: - drilldown_data_folder = os.path.join(drilldown_folder, "data/") - os.makedirs(drilldown_data_folder, exist_ok=True) - drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx") - - drilldown_source_df = pd.DataFrame(drilldown_data_list) - annotation_list_df = pd.DataFrame(annotation_list) - # set drilldown_result_df column order as doc_id, pdf_file, page_index, - # data_point, value, matching_val_area, normalized_bbox - annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index", - "data_point", "value", "matching_val_area", "normalized_bbox"]] - logger.info(f"Writing drilldown data to {drilldown_file}") - with pd.ExcelWriter(drilldown_file) as writer: - drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data") - annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data") - - -def calculate_metrics(): - drilldown_folder = r"/data/emea_ar/output/drilldown/" - drilldown_data_folder = os.path.join(drilldown_folder, "data/") - drilldown_files = glob(drilldown_data_folder + '*.xlsx') - y_true_list = [] - y_pred_list = [] - series_list = [] - for drilldown_file in drilldown_files: - drilldown_file_base_name = os.path.basename(drilldown_file) - if drilldown_file_base_name.startswith("~"): - continue - drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data") - for index, row in drilldown_data.iterrows(): - matching_val_area = row["matching_val_area"] - # transform matching_val_area to list - if isinstance(matching_val_area, str): - matching_val_area = eval(matching_val_area) - y_true_list.append(1) - if len(matching_val_area) > 0: - 
y_pred_list.append(1) - else: - y_pred_list.append(0) - series_list.append(row) - recall = recall_score(y_true_list, y_pred_list) - logger.info(f"Recall: {recall}, Support: {len(y_true_list)}") - no_annotation_df = pd.DataFrame(series_list) - no_annotation_df.reset_index(drop=True, inplace=True) - metrics_folder = os.path.join(drilldown_folder, "metrics/") - os.makedirs(metrics_folder, exist_ok=True) - metrics_file = os.path.join(metrics_folder, "metrics.xlsx") - metrics_result = { - "recall": recall, - "support": len(y_true_list) - } - metrics_df = pd.DataFrame([metrics_result]) - with pd.ExcelWriter(metrics_file) as writer: - metrics_df.to_excel(writer, index=False, sheet_name="metrics") - no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation") - - -if __name__ == "__main__": - pdf_folder = r"/data/emea_ar/pdf/" - drilldown_folder = r"/data/emea_ar/output/drilldown/" - extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/' - drilldown_documents() - # calculate_metrics() \ No newline at end of file diff --git a/pdf_table_extraction.py b/pdf_table_extraction.py deleted file mode 100644 index 852c3a5..0000000 --- a/pdf_table_extraction.py +++ /dev/null @@ -1,110 +0,0 @@ -import pandas as pd -import os -import tqdm -import json_repair -import json -from glob import glob -import fitz -import re -import time -import traceback - -from utils.logger import logger -from utils.pdf_download import download_pdf_from_documents_warehouse -from utils.pdf_util import PDFUtil -from utils.gpt_utils import chat - - -class PDFTableExtraction: - """ - Iterate PDF pages - Extract tables from PDF pages - Save these tables as markdown files - """ - def __init__(self, - pdf_file: str, - output_folder: str) -> None: - self.pdf_file = pdf_file - self.pdf_file_name = os.path.basename(pdf_file) - self.table_extraction_prompts = self.get_table_extraction_prompts() - - self.output_folder = output_folder - os.makedirs(output_folder, exist_ok=True) - - self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/') - os.makedirs(self.prompts_output_folder, exist_ok=True) - - self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/') - os.makedirs(self.json_output_folder, exist_ok=True) - - self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/') - os.makedirs(self.table_md_output_folder, exist_ok=True) - - def get_table_extraction_prompts(self): - instructions_file = r'./instructions/table_extraction_prompts.txt' - with open(instructions_file, 'r', encoding='utf-8') as file: - return file.read() - - def extract_tables(self): - try: - if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file): - logger.error(f"Invalid pdf_file: {self.pdf_file}") - return - logger.info(f"Start processing {self.pdf_file}") - pdf_util = PDFUtil(self.pdf_file) - success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder) - if success: - logger.info(f"Successfully extracted text from {self.pdf_file}") - - for page_num, page_text in page_text_dict.items(): - try: - self.extract_tables_from_page(page_text, page_num) - except Exception as e: - traceback.print_exc() - logger.error(f"Error in extracting tables from page {page_num}: {str(e)}") - except Exception as e: - logger.error(f"Error in extracting PDF tables: {str(e)}") - - - def extract_tables_from_page(self, page_text: str, page_num: int): - pure_pdf_name = self.pdf_file_name.replace('.pdf', '') - table_extraction_prompts = 
self.table_extraction_prompts.replace(r'{page_text}', page_text) - prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt') - if os.path.exists(prompts_response_file): - logger.info(f"Prompts response file already exists: {prompts_response_file}") - return - - response, with_error = chat(table_extraction_prompts) - if with_error: - logger.error(f"Error in extracting tables from page") - return - - json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response) - if json_response is None: - logger.info(f"Can't extract tables from page") - return - - table_json_text = json_response.group(1) - table_data = {"tables": []} - try: - table_data = json.loads(table_json_text) - except: - table_data = json_repair.loads(table_json_text) - self.save_table_data(table_data, page_num) - - prompts_response = f'{table_extraction_prompts}\n\n{response}' - with open(prompts_response_file, 'w', encoding='utf-8') as file: - file.write(prompts_response) - - def save_table_data(self, table_data: dict, page_num: int): - pdf_pure_name = self.pdf_file_name.replace('.pdf', '') - json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json') - with open(json_output_file, 'w', encoding='utf-8') as file: - file.write(json.dumps(table_data, indent=4)) - - table_list = table_data.get('tables', []) - for table_num, table in enumerate(table_list): - table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md') - table = re.sub(r'(\n)+', '\n', table) - with open(table_md_file, 'w', encoding='utf-8') as file: - file.write(table) \ No newline at end of file diff --git a/playground.ipynb b/playground.ipynb deleted file mode 100644 index 4260150..0000000 --- a/playground.ipynb +++ /dev/null @@ -1,713 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from utils.biz_utils import add_slash_to_text_as_regex\n", - "import json\n", - "import re" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "regex = r\"Turnover \\n\"" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Turnover\\\\s+\\\\n'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "add_slash_to_text_as_regex(regex)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "text = \"What was the share of investments made in transitional and enabling activities? 
\\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover reflects the\\n\"" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re.search(regex, text)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TOR no match\n", - "Turnover\\*\\s+\\n no match\n", - "Turnover\\s+\\n no match\n", - "Turnover\\s+Ratio no match\n", - "Turnover\\s+Rate no match\n", - "Portfolio\\s+Turnover no match\n", - "Portfolio\\s+turnover\\s+ratio no match\n", - "Portfolio\\s+turnover\\s+rate no match\n", - "PTR no match\n", - "Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n" - ] - } - ], - "source": [ - "with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n", - " datapoint_keywords_config = json.load(file)\n", - "\n", - "tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n", - "\n", - "for tor_regex in tor_regex_list:\n", - " regex = add_slash_to_text_as_regex(tor_regex)\n", - " search = re.search(regex, text)\n", - " if search:\n", - " print(f\"{regex} match {search.group()}\")\n", - " else:\n", - " print(f\"{regex} no match\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from utils.sql_query_util import query_investment_by_provider, query_document_fund_mapping\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "provider_mapping = query_investment_by_provider(company_id=\"0C00008QVP\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " ProviderId ProviderName FundId \\\n", - "840 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n", - "841 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n", - "842 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "843 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "844 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "845 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "846 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "847 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "848 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "\n", - " FundName ISIN \\\n", - "840 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597990 \n", - "841 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597727 \n", - "842 T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n", - "843 T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n", - "844 T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n", - "845 T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n", - "846 T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n", - "847 T. Rowe Price Funds Series II SICAV - Floating... LU0993569101 \n", - "848 T. Rowe Price Funds Series II SICAV - Floating... LU2122516821 \n", - "\n", - " SecId CurrencyId ShareClassName \\\n", - "840 F000010MEE USD T. Rowe Price Funds Series II SICAV - Credit O... \n", - "841 F000010MEF USD T. Rowe Price Funds Series II SICAV - Credit O... \n", - "842 F000010MEG USD T. Rowe Price Funds Series II SICAV - Floating... \n", - "843 F000010PUN CHF T. Rowe Price Funds Series II SICAV - Floating... \n", - "844 F000010MEH EUR T. Rowe Price Funds Series II SICAV - Floating... \n", - "845 F0000143Y8 USD T. Rowe Price Funds Series II SICAV - Floating... \n", - "846 F0000143Y9 USD T. Rowe Price Funds Series II SICAV - Floating... \n", - "847 F00001564H USD T. Rowe Price Funds Series II SICAV - Floating... \n", - "848 F000014UPK AUD T. Rowe Price Funds Series II SICAV - Floating... \n", - "\n", - " ShareClassStatus \n", - "840 0 \n", - "841 0 \n", - "842 1 \n", - "843 0 \n", - "844 0 \n", - "845 0 \n", - "846 0 \n", - "847 0 \n", - "848 0 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "provider_mapping[provider_mapping[\"FundName\"].str.contains(\"T. Rowe Price Funds Series II SICAV\")]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "document_mapping = query_document_fund_mapping(doc_id=\"486378555\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " DocumentId EffectiveDate DocumentType Format Language DocumentStatus \\\n", - "0 486378555 2022-06-30 4 PDF 0L00000122 1 \n", - "1 486378555 2022-06-30 4 PDF 0L00000122 1 \n", - "2 486378555 2022-06-30 4 PDF 0L00000122 1 \n", - "3 486378555 2022-06-30 4 PDF 0L00000122 1 \n", - "4 486378555 2022-06-30 4 PDF 0L00000122 1 \n", - "5 486378555 2022-06-30 4 PDF 0L00000122 1 \n", - "\n", - " ProviderId ProviderName FundId \\\n", - "0 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "1 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "2 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "3 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "4 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "5 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n", - "\n", - " FundName Domicile SecId \\\n", - "0 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEG \n", - "1 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010PUN \n", - "2 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEH \n", - "3 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y8 \n", - "4 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y9 \n", - "5 T. Rowe Price Funds Series II SICAV - Floating... LUX F000014UPK \n", - "\n", - " CurrencyId ShareClassName ISIN \n", - "0 USD T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n", - "1 CHF T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n", - "2 EUR T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n", - "3 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n", - "4 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n", - "5 AUD T. Rowe Price Funds Series II SICAV - Floating... LU2122516821 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "document_mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund I Cap',\n", - " 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (CHF) Cap',\n", - " 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (EUR) Cap',\n", - " 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Q (USD) Cap',\n", - " 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Qd (USD) Dis',\n", - " 'T. 
Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Sdn (AUD) Dis']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(document_mapping[\"ShareClassName\"].unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pymupdf4llm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing ./data/emea_ar/pdf/501380553.pdf...\n", - "[ ] (0/47[ ] ( 1/47[= ] ( 2/4[== ] ( 3/47[=== ] ( 4/4[==== ] ( 5/47[===== ] ( 6/47[===== ] ( 7/4[====== ] ( 8/47[======= ] ( 9/4[======== ] (10/47[========= ] (11/4[========== ] (12/47[=========== ] (13/47[=========== ] (14/4[============ ] (15/47[============= ] (16/4[============== ] (17/47[=============== ] (18/4[================ ] (19/47[================= ] (20/47[================= ] (21/4[================== ] (22/47[=================== ] (23/4[==================== ] (24/47[===================== ] (25/4[====================== ] (26/4[====================== ] (27/47[======================= ] (28/4[======================== ] (29/47[========================= ] (30/4[========================== ] (31/47[=========================== ] (32/4[============================ ] (33/4[============================ ] (34/47[============================= ] (35/4[============================== ] (36/47[=============================== ] (37/4[================================ ] (38/47[================================= ] (39/4[================================== ] (40/4[================================== ] (41/47[=================================== ] (42/4[==================================== ] (43/47[===================================== ] (44/4[====================================== ] (45/47[======================================= ] (46/47[========================================] (47/47]\n" - ] - }, - { - "data": { - "text/plain": [ - "107851" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "md_text = pymupdf4llm.to_markdown(r\"./data/emea_ar/pdf/501380553.pdf\")\n", - "\n", - "# now work with the markdown text, e.g. 
store as a UTF8-encoded file\n", - "import pathlib\n", - "pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def get_fund_name(fund_name: str, fund_feature: str):\n", - " if not fund_name.endswith(fund_feature):\n", - " return fund_name\n", - " fund_feature = fund_feature + \" \"\n", - " fund_name_split = fund_name.split(fund_feature)\n", - " if len(fund_name_split) > 1:\n", - " last_fund = fund_name_split[-1].strip()\n", - " if len(last_fund) == 0:\n", - " last_fund = fund_name_split[-2].strip()\n", - " fund_name = f\"{last_fund} {fund_feature}\"\n", - " return fund_name" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'C Fund'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "fund_name = \"JPMorgan Investment Fund - Global Income Conservative Fund\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'- Global Income Conservative Fund Fund '" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_fund_name(fund_name, \"Fund\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "torch2_real", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/playground.py b/playground.py deleted file mode 100644 index 1ef9116..0000000 --- a/playground.py +++ /dev/null @@ -1,137 +0,0 @@ -import os -import json -import base64 -import json_repair -from utils.pdf_util import PDFUtil -from utils.logger import logger -from utils.gpt_utils import chat - - -def get_base64_pdf_image_list( - pdf_file: str, pdf_page_index_list: list, output_folder: str = None -) -> dict: - if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file): - logger.error("pdf_file is not provided") - return None - pdf_util = PDFUtil(pdf_file) - if pdf_page_index_list is None or len(pdf_page_index_list) == 0: - pdf_page_index_list = list(range(pdf_util.get_page_count())) - if output_folder is not None and len(output_folder) > 0: - os.makedirs(output_folder, exist_ok=True) - pdf_image_info = pdf_util.extract_images( - pdf_page_index_list=pdf_page_index_list, output_folder=output_folder - ) - return pdf_image_info - - -def encode_image(image_path: str): - if image_path is None or len(image_path) == 0 or not os.path.exists(image_path): - return None - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode("utf-8") - - -def chat_with_image( - pdf_file: str, - pdf_page_index_list: list, - image_instructions_file: str, - image_folder: str, - gpt_folder: str, -): - if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file): - logger.error("pdf_file is not provided") - 
return None - pdf_image_info = get_base64_pdf_image_list( - pdf_file, pdf_page_index_list, image_folder - ) - - with open(image_instructions_file, "r", encoding="utf-8") as file: - image_instructions = file.read() - os.makedirs(gpt_folder, exist_ok=True) - pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "") - response_list = {} - for page_index, data in pdf_image_info.items(): - logger.info(f"Processing image in page {page_index}") - image_file = data.get("img_file", None) - image_base64 = data.get("img_base64", None) - response, error = chat(prompt=image_instructions, image_base64=image_base64) - if error: - logger.error(f"Error in processing image in page {page_index}") - continue - try: - response_json = json.loads(response) - except: - response_json = json_repair.loads(response) - response_json_file = os.path.join( - gpt_folder, f"{pdf_base_name}_{page_index}.json" - ) - with open(response_json_file, "w", encoding="utf-8") as file: - json.dump(response_json, file, indent=4) - logger.info(f"Response for image in page {page_index}: {response}") - logger.info("Done") - - -if __name__ == "__main__": - # Table extraction by image - # pdf_file = r"/data/emea_ar/small_pdf/382366116.pdf" - # pdf_page_index_list = [29, 35, 71, 77, 83, 89, 97, 103, 112, 121, 130, 140, 195, 250, 305] - # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" - # pdf_page_index_list = [13] - # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf" - # pdf_page_index_list = [29] - # image_instructions_file = r"./instructions/table_extraction_image_prompts_v2.txt" - # image_output_folder = r"/data/emea_ar/small_pdf_image/" - # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/table/" - # chat_with_image( - # pdf_file, - # pdf_page_index_list, - # image_instructions_file, - # image_output_folder, - # gpt_output_folder, - # ) - - # Data extraction by image - # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf" - # pdf_page_index_list = [29] - pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" - pdf_page_index_list = [13] - image_output_folder = r"/data/emea_ar/small_pdf_image/" - gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/" - image_instructions_file = r"./instructions/data_extraction_image_prompts.txt" - chat_with_image( - pdf_file, - pdf_page_index_list, - image_instructions_file, - image_output_folder, - gpt_output_folder, - ) - - - # Text extraction by image - # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" - # pdf_page_index_list = [13] - # image_instructions_file = r"./instructions/text_extraction_image_prompts.txt" - # image_output_folder = r"/data/emea_ar/small_pdf_image/" - # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/" - # chat_with_image( - # pdf_file, - # pdf_page_index_list, - # image_instructions_file, - # image_output_folder, - # gpt_output_folder, - # ) - - # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf" - # pdf_page_index_list = [13] - # image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt" - # image_output_folder = r"/data/emea_ar/small_pdf_image/" - # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/" - # chat_with_image( - # pdf_file, - # pdf_page_index_list, - # image_instructions_file, - # image_output_folder, - # gpt_output_folder, - # ) - - diff --git a/specific_calc_metrics.py b/specific_calc_metrics.py deleted file mode 100644 index 8254fd8..0000000 --- a/specific_calc_metrics.py +++ /dev/null @@ -1,277 +0,0 @@ -from tqdm import tqdm -from glob 
import glob -import json -import pandas as pd -import os -from traceback import print_exc -from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score - -from utils.logger import logger - - -def calculate_complex_document_metrics(verify_file_path: str, document_list: list = []): - data_df_1 = pd.read_excel(verify_file_path, sheet_name="data_in_doc_mapping") - # convert doc_id column to string - data_df_1["doc_id"] = data_df_1["doc_id"].astype(str) - data_df_1 = data_df_1[data_df_1["raw_check"].isin([0, 1])] - - exclude_documents = ["532422548"] - # remove data by doc_id not in exclude_documents - data_df_1 = data_df_1[~data_df_1["doc_id"].isin(exclude_documents)] - - if document_list is not None and len(document_list) > 0: - data_df_1 = data_df_1[data_df_1["doc_id"].isin(document_list)] - - data_df_2 = pd.read_excel(verify_file_path, sheet_name="total_mapping_data") - data_df_2["doc_id"] = data_df_2["doc_id"].astype(str) - data_df_2 = data_df_2[data_df_2["raw_check"].isin([0, 1])] - - data_df = pd.concat([data_df_1, data_df_2], ignore_index=True) - - data_df.fillna("", inplace=True) - data_df.reset_index(drop=True, inplace=True) - - metrics_df_list = [] - doc_id_list = data_df["doc_id"].unique().tolist() - for doc_id in tqdm(doc_id_list): - try: - document_data_df = data_df[data_df["doc_id"] == doc_id] - document_metrics_df = calc_metrics(document_data_df, doc_id) - metrics_df_list.append(document_metrics_df) - except Exception as e: - logger.error(f"Error when calculating metrics for document {doc_id}") - print_exc() - - total_metrics_df = calc_metrics(data_df, doc_id=None) - metrics_df_list.append(total_metrics_df) - - all_metrics_df = pd.concat(metrics_df_list, ignore_index=True) - all_metrics_df.reset_index(drop=True, inplace=True) - - output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/" - verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") - output_metrics_file = os.path.join(output_folder, - f"complex_{verify_file_name}_metrics_all.xlsx") - with pd.ExcelWriter(output_metrics_file) as writer: - all_metrics_df.to_excel(writer, index=False, sheet_name="metrics") - - -def calc_metrics(data_df: pd.DataFrame, doc_id: str = None): - # tor data - tor_data_df = data_df[data_df["datapoint"] == "tor"] - if len(tor_data_df) > 0: - tor_metrics = get_sub_metrics(tor_data_df, "tor", doc_id) - logger.info(f"TOR metrics: {tor_metrics}") - else: - tor_metrics = None - - # ter data - ter_data_df = data_df[data_df["datapoint"] == "ter"] - if len(ter_data_df) > 0: - ter_metrics = get_sub_metrics(ter_data_df, "ter", doc_id) - logger.info(f"TER metrics: {ter_metrics}") - else: - ter_metrics = None - - # ogc data - ogc_data_df = data_df[data_df["datapoint"] == "ogc"] - if len(ogc_data_df) > 0: - ogc_metrics = get_sub_metrics(ogc_data_df, "ogc", doc_id) - logger.info(f"OGC metrics: {ogc_metrics}") - else: - ogc_metrics = None - - # performance_fee data - performance_fee_data_df = data_df[data_df["datapoint"] == "performance_fee"] - if len(performance_fee_data_df) > 0: - performance_fee_metrics = get_sub_metrics(performance_fee_data_df, "performance_fee", doc_id) - logger.info(f"Performance fee metrics: {performance_fee_metrics}") - else: - performance_fee_metrics = None - - metrics_candidates = [tor_metrics, ter_metrics, ogc_metrics, performance_fee_metrics] - metrics_list = [metrics for metrics in metrics_candidates if metrics is not None] - metrics_df = pd.DataFrame(metrics_list) - # add average metrics - if doc_id is not None and 
len(doc_id) > 0: - avg_metrics = { - "DocumentId": doc_id, - "DataPoint": "average", - "F1": metrics_df["F1"].mean(), - "Precision": metrics_df["Precision"].mean(), - "Recall": metrics_df["Recall"].mean(), - "Accuracy": metrics_df["Accuracy"].mean(), - "Support": metrics_df["Support"].sum() - } - else: - avg_metrics = { - "DocumentId": "All", - "DataPoint": "average", - "F1": metrics_df["F1"].mean(), - "Precision": metrics_df["Precision"].mean(), - "Recall": metrics_df["Recall"].mean(), - "Accuracy": metrics_df["Accuracy"].mean(), - "Support": metrics_df["Support"].sum() - } - - metrics_list.append(avg_metrics) - metrics_df = pd.DataFrame(metrics_list) - metrics_df.reset_index(drop=True, inplace=True) - return metrics_df - - -def get_sub_metrics(data_df: pd.DataFrame, data_point: str, doc_id: str = None) -> dict: - data_df_raw_check_1 = data_df[data_df["raw_check"] == 1] - gt_list = [1] * len(data_df_raw_check_1) - pre_list = [1] * len(data_df_raw_check_1) - - data_df_raw_check_0 = data_df[data_df["raw_check"] == 0] - for index, row in data_df_raw_check_0.iterrows(): - if row["raw_check_comment"] == "modify": - gt_list.append(0) - pre_list.append(1) - - gt_list.append(1) - pre_list.append(0) - elif row["raw_check_comment"] == "incorrect": - gt_list.append(0) - pre_list.append(1) - elif row["raw_check_comment"] == "supplement": - gt_list.append(1) - pre_list.append(0) - else: - pass - - # calculate metrics - accuracy = accuracy_score(gt_list, pre_list) - precision = precision_score(gt_list, pre_list) - recall = recall_score(gt_list, pre_list) - f1 = f1_score(gt_list, pre_list) - support = sum(gt_list) - if doc_id is not None and len(doc_id) > 0: - metrics = { - "DocumentId": doc_id, - "DataPoint": data_point, - "F1": f1, - "Precision": precision, - "Recall": recall, - "Accuracy": accuracy, - "Support": support - } - else: - metrics = { - "DocumentId": "All", - "DataPoint": data_point, - "F1": f1, - "Precision": precision, - "Recall": recall, - "Accuracy": accuracy, - "Support": support - } - return metrics - - -def get_metrics_based_documents(metrics_file: str, document_list: list): - metrics_df = pd.read_excel(metrics_file, sheet_name="metrics") - metrics_df_list = [] - for doc_id in tqdm(document_list): - try: - document_metrics_df = metrics_df[metrics_df["DocumentId"] == doc_id] - metrics_df_list.append(document_metrics_df) - except Exception as e: - logger.error(f"Error when calculating metrics for document {doc_id}") - print_exc() - metrics_document_df = pd.concat(metrics_df_list, ignore_index=True) - - stats_metrics_list = [] - tor_df = metrics_document_df[metrics_document_df["DataPoint"] == "tor"] - if len(tor_df) > 0: - tor_metrics = { - "DocumentId": "All", - "DataPoint": "tor", - "F1": tor_df["F1"].mean(), - "Precision": tor_df["Precision"].mean(), - "Recall": tor_df["Recall"].mean(), - "Accuracy": tor_df["Accuracy"].mean(), - "Support": tor_df["Support"].sum() - } - stats_metrics_list.append(tor_metrics) - ter_df = metrics_document_df[metrics_document_df["DataPoint"] == "ter"] - if len(ter_df) > 0: - ter_metrics = { - "DocumentId": "All", - "DataPoint": "ter", - "F1": ter_df["F1"].mean(), - "Precision": ter_df["Precision"].mean(), - "Recall": ter_df["Recall"].mean(), - "Accuracy": ter_df["Accuracy"].mean(), - "Support": ter_df["Support"].sum() - } - stats_metrics_list.append(ter_metrics) - ogc_df = metrics_document_df[metrics_document_df["DataPoint"] == "ogc"] - if len(ogc_df) > 0: - ogc_metrics = { - "DocumentId": "All", - "DataPoint": "ogc", - "F1": ogc_df["F1"].mean(), - 
"Precision": ogc_df["Precision"].mean(), - "Recall": ogc_df["Recall"].mean(), - "Accuracy": ogc_df["Accuracy"].mean(), - "Support": ogc_df["Support"].sum() - } - stats_metrics_list.append(ogc_metrics) - performance_fee_df = metrics_document_df[metrics_document_df["DataPoint"] == "performance_fee"] - if len(performance_fee_df) > 0: - performance_fee_metrics = { - "DocumentId": "All", - "DataPoint": "performance_fee", - "F1": performance_fee_df["F1"].mean(), - "Precision": performance_fee_df["Precision"].mean(), - "Recall": performance_fee_df["Recall"].mean(), - "Accuracy": performance_fee_df["Accuracy"].mean(), - "Support": performance_fee_df["Support"].sum() - } - stats_metrics_list.append(performance_fee_metrics) - average_df = metrics_document_df[metrics_document_df["DataPoint"] == "average"] - if len(average_df) > 0: - avg_metrics = { - "DocumentId": "All", - "DataPoint": "average", - "F1": average_df["F1"].mean(), - "Precision": average_df["Precision"].mean(), - "Recall": average_df["Recall"].mean(), - "Accuracy": average_df["Accuracy"].mean(), - "Support": average_df["Support"].sum() - } - stats_metrics_list.append(avg_metrics) - - stats_metrics_df = pd.DataFrame(stats_metrics_list) - metrics_df_list.append(stats_metrics_df) - all_metrics_df = pd.concat(metrics_df_list, ignore_index=True) - all_metrics_df.reset_index(drop=True, inplace=True) - - output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/" - verify_file_name = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_remain_7.xlsx" - output_metrics_file = os.path.join(output_folder, verify_file_name) - with pd.ExcelWriter(output_metrics_file) as writer: - all_metrics_df.to_excel(writer, index=False, sheet_name="metrics") - - return all_metrics_df - - -if __name__ == "__main__": - file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/" - verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx" - verify_file_path = os.path.join(file_folder, verify_file) - calculate_complex_document_metrics(verify_file_path=verify_file_path, - document_list=None) - document_list = ["492029971", - "510300817", - "512745032", - "514213638", - "527525440", - "534535767"] - metrics_file = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_all.xlsx" - metrics_file_path = os.path.join(file_folder, metrics_file) - # get_metrics_based_documents(metrics_file=metrics_file_path, - # document_list=document_list) \ No newline at end of file diff --git a/test_specific_biz_logic.py b/test_specific_biz_logic.py deleted file mode 100644 index f1006f3..0000000 --- a/test_specific_biz_logic.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import json -import pandas as pd -from glob import glob -from tqdm import tqdm -from utils.logger import logger -from utils.sql_query_util import query_document_fund_mapping -from core.page_filter import FilterPages -from core.data_extraction import DataExtraction - - -def test_validate_extraction_data(): - document_id = "481482392" - pdf_file = f"/data/emea_ar/pdf/481482392.pdf" - output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/" - output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/" - document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False) - filter_pages = FilterPages( - document_id, pdf_file, document_mapping_info_df - ) - page_text_dict = filter_pages.page_text_dict - datapoint_page_info, result_details = get_datapoint_page_info(filter_pages) - datapoints = 
get_datapoints_from_datapoint_page_info(datapoint_page_info) - data_extraction = DataExtraction( - doc_source="emea_ar", - doc_id=document_id, - pdf_file=pdf_file, - output_data_folder=output_extract_data_child_folder, - page_text_dict=page_text_dict, - datapoint_page_info=datapoint_page_info, - datapoints=datapoints, - document_mapping_info_df=document_mapping_info_df, - extract_way="text", - output_image_folder=None - ) - output_data_json_folder = os.path.join( - r"/data/emea_ar/output/extract_data/docs/by_text/", "json/" - ) - os.makedirs(output_data_json_folder, exist_ok=True) - json_file = os.path.join(output_data_json_folder, f"{document_id}.json") - data_from_gpt = None - if os.path.exists(json_file): - logger.info( - f"The document: {document_id} has been parsed, loading data from {json_file}" - ) - with open(json_file, "r", encoding="utf-8") as f: - data_from_gpt = json.load(f) - for extract_data in data_from_gpt: - page_index = extract_data["page_index"] - if page_index == 451: - logger.info(f"Page index: {page_index}") - raw_answer = extract_data["raw_answer"] - raw_answer_json = json.loads(raw_answer) - extract_data_info = data_extraction.validate_data(raw_answer_json) - print(extract_data_info) - -def get_datapoint_page_info(filter_pages) -> tuple: - datapoint_page_info, result_details = filter_pages.start_job() - return datapoint_page_info, result_details - - -def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list: - datapoints = list(datapoint_page_info.keys()) - if "doc_id" in datapoints: - datapoints.remove("doc_id") - return datapoints - - -if __name__ == "__main__": - test_validate_extraction_data() \ No newline at end of file
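
Note on exercising the renamed route: the sketch below shows one way the /automation/api/model/emea_ar endpoint (renamed to emea_ar_data_extract in app_emea_ar.py above) could be smoke-tested. Only the route, the POST method, the JSON doc_id field, and the 400 "doc_id is required" error come from this diff; the host, port, timeout, and the sample document id (borrowed from the deleted playground notebook) are assumptions for illustration, not part of the change.

# Hypothetical smoke test for the renamed EMEA AR extraction endpoint.
# Assumptions: the Flask app from app_emea_ar.py is running locally on port 5000;
# the doc_id below is only an example taken from the deleted playground notebook.
import requests

BASE_URL = "http://localhost:5000"  # assumed local host/port

def extract_emea_ar(doc_id: str) -> dict:
    # POST the document id to the route defined in app_emea_ar.py.
    # A missing doc_id is rejected with 400 and {"error": "doc_id is required"}.
    response = requests.post(
        f"{BASE_URL}/automation/api/model/emea_ar",
        json={"doc_id": doc_id},
        timeout=300,
    )
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    print(extract_emea_ar("486378555"))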