update for deployment

parent fb4a6402f0
commit f10ff8ee33
.gitignore
@@ -7,3 +7,9 @@
/test_metrics
/data
/sample_documents/japan_prospectus.txt
/pdf_table_extraction.py
/playground.ipynb
/playground.py
/specific_calc_metrics.py
/test_specific_biz_logic.py
/drilldown_practice.py
@@ -22,7 +22,7 @@ swagger = Swagger(app, template=template)
@app.route('/automation/api/model/emea_ar', methods=['POST'])
@swag_from('yml/emea_ar.yml')
-def us_ar_data_extract():
+def emea_ar_data_extract():
    """
    Extract EMEA AR cost data from EMEA LUX PDF document
    input sample:
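A minimal request sketch for the renamed endpoint; the host/port and the sample payload are assumptions, and only doc_id is confirmed as required by the handler below (it returns 400 when doc_id is missing).

# Hypothetical client call; localhost:5000 and the sample doc_id are assumptions.
import requests

resp = requests.post(
    "http://localhost:5000/automation/api/model/emea_ar",
    json={"doc_id": "486378555"},  # a document id that appears elsewhere in this commit
)
print(resp.status_code, resp.json())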
@@ -40,6 +40,7 @@ def us_ar_data_extract():
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/emea_ar/pdf/"
    output_pdf_text_folder = r"./data/emea_ar/output/pdf_text/"
    output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
+   drilldown_folder = r"./data/emea_ar/output/drilldown/"
@@ -62,6 +63,7 @@ def us_ar_data_extract():
    emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
                                      doc_source="emea_ar",
                                      pdf_folder=pdf_folder,
                                      output_pdf_text_folder=output_pdf_text_folder,
                                      output_extract_data_folder=output_extract_data_folder,
                                      output_mapping_data_folder=output_mapping_data_folder,
                                      extract_way=extract_way,
core/data_extraction.py
@@ -4,6 +4,7 @@ import json_repair
import re
import fitz
import pandas as pd
from traceback import print_exc
from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider

@@ -294,6 +295,8 @@ class DataExtraction:
        for current_page_data in page_data_list:
            if current_page_data in next_page_data_list:
                next_page_data_list.remove(current_page_data)
+               if len(next_page_data_list) == 0:
+                   break
        next_page_extract_data["extract_data"][
            "data"
        ] = next_page_data_list
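A minimal sketch of the de-duplication behaviour added above, using toy page data; the field names and values are illustrative only.

# Rows already captured on the current page are dropped from the next page's
# candidate list, and the added early-exit stops the loop once that list is empty.
page_data_list = [{"fund": "A", "tor": "12%"}, {"fund": "B", "tor": "8%"}]
next_page_data_list = [{"fund": "A", "tor": "12%"}]

for current_page_data in page_data_list:
    if current_page_data in next_page_data_list:
        next_page_data_list.remove(current_page_data)
        if len(next_page_data_list) == 0:
            break

print(next_page_data_list)  # [] -> nothing left to keep for the next page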
drilldown_practice.py (file deleted)
@@ -1,159 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import recall_score

from utils.logger import logger
from utils.pdf_util import PDFUtil


def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str):
    extract_files = glob(extract_data_folder + '*.json')

    for index, json_file in enumerate(tqdm(extract_files)):
        try:
            # doc_id = file.split('/')[-1].split('.')[0]
            json_base_name = os.path.basename(json_file)
            doc_id = json_base_name.split('.')[0]
            logger.info(f"Processing {doc_id}")
            pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
            if not os.path.exists(pdf_file):
                logger.error(f"PDF file not found for {doc_id}")
                continue
            with open(json_file, "r", encoding="utf-8") as f:
                data_from_gpt = json.load(f)
            drilldown_pdf_document(doc_id=doc_id,
                                   pdf_file=pdf_file,
                                   drilldown_folder=drilldown_folder,
                                   data_from_gpt=data_from_gpt)

        except Exception as e:
            print_exc()
            logger.error(f"Error in processing {doc_id}: {e}")


def drilldown_pdf_document(doc_id: str,
                           pdf_file: str,
                           drilldown_folder: str,
                           data_from_gpt: list) -> list:
    logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
    pdf_util = PDFUtil(pdf_file)
    drilldown_data_list = []
    for data in data_from_gpt:
        doc_id = str(data.get("doc_id", ""))
        # if doc_id != "506326520":
        #     continue
        page_index = data.get("page_index", -1)
        if page_index == -1:
            continue
        extract_data_list = data.get("extract_data", {}).get("data", [])
        dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
        if len(dp_reported_name_dict.keys()) == 0:
            continue
        highlighted_value_list = []
        for extract_data in extract_data_list:
            for data_point, value in extract_data.items():
                if value in highlighted_value_list:
                    continue
                if data_point in ["ter", "ogc", "performance_fee"]:
                    continue
                drilldown_data = {
                    "doc_id": doc_id,
                    "page_index": page_index,
                    "data_point": data_point,
                    "parent_text_block": None,
                    "value": value,
                    "annotation_attribute": {}
                }
                drilldown_data_list.append(drilldown_data)
                highlighted_value_list.append(value)

        for data_point, reported_name in dp_reported_name_dict.items():
            if reported_name in highlighted_value_list:
                continue
            data_point = f"{data_point}_reported_name"
            drilldown_data = {
                "doc_id": doc_id,
                "page_index": page_index,
                "data_point": data_point,
                "parent_text_block": None,
                "value": reported_name,
                "annotation_attribute": {}
            }
            drilldown_data_list.append(drilldown_data)
            highlighted_value_list.append(reported_name)

    drilldown_result = []
    if len(drilldown_data_list) > 0:
        drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                    output_pdf_folder=drilldown_folder)
    if len(drilldown_result) > 0:
        logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
        annotation_list = drilldown_result.get("annotation_list", [])
        for annotation in annotation_list:
            annotation["doc_id"] = doc_id
        if drilldown_folder is not None and len(drilldown_folder) > 0:
            drilldown_data_folder = os.path.join(drilldown_folder, "data/")
            os.makedirs(drilldown_data_folder, exist_ok=True)
            drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")

            drilldown_source_df = pd.DataFrame(drilldown_data_list)
            annotation_list_df = pd.DataFrame(annotation_list)
            # set drilldown_result_df column order as doc_id, pdf_file, page_index,
            # data_point, value, matching_val_area, normalized_bbox
            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
                                                     "data_point", "value", "matching_val_area", "normalized_bbox"]]
            logger.info(f"Writing drilldown data to {drilldown_file}")
            with pd.ExcelWriter(drilldown_file) as writer:
                drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
                annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")


def calculate_metrics():
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    drilldown_data_folder = os.path.join(drilldown_folder, "data/")
    drilldown_files = glob(drilldown_data_folder + '*.xlsx')
    y_true_list = []
    y_pred_list = []
    series_list = []
    for drilldown_file in drilldown_files:
        drilldown_file_base_name = os.path.basename(drilldown_file)
        if drilldown_file_base_name.startswith("~"):
            continue
        drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
        for index, row in drilldown_data.iterrows():
            matching_val_area = row["matching_val_area"]
            # transform matching_val_area to list
            if isinstance(matching_val_area, str):
                matching_val_area = eval(matching_val_area)
            y_true_list.append(1)
            if len(matching_val_area) > 0:
                y_pred_list.append(1)
            else:
                y_pred_list.append(0)
            series_list.append(row)
    recall = recall_score(y_true_list, y_pred_list)
    logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
    no_annotation_df = pd.DataFrame(series_list)
    no_annotation_df.reset_index(drop=True, inplace=True)
    metrics_folder = os.path.join(drilldown_folder, "metrics/")
    os.makedirs(metrics_folder, exist_ok=True)
    metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
    metrics_result = {
        "recall": recall,
        "support": len(y_true_list)
    }
    metrics_df = pd.DataFrame([metrics_result])
    with pd.ExcelWriter(metrics_file) as writer:
        metrics_df.to_excel(writer, index=False, sheet_name="metrics")
        no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")


if __name__ == "__main__":
    pdf_folder = r"/data/emea_ar/pdf/"
    drilldown_folder = r"/data/emea_ar/output/drilldown/"
    extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
    drilldown_documents(pdf_folder=pdf_folder,
                        extract_data_folder=extract_data_folder,
                        drilldown_folder=drilldown_folder)
    # calculate_metrics()
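A small worked sketch of the recall computed by calculate_metrics() above: every extracted value counts as a positive, and a prediction counts as found only when the drilldown produced a non-empty matching_val_area. The values below are illustrative only.

from sklearn.metrics import recall_score

# Toy matching_val_area values for four extracted data points.
matching_val_areas = [[(1, 2, 3, 4)], [], [(5, 6, 7, 8)], [(9, 10, 11, 12)]]
y_true = [1] * len(matching_val_areas)
y_pred = [1 if len(area) > 0 else 0 for area in matching_val_areas]
print(recall_score(y_true, y_pred))  # 0.75 -> 3 of 4 values were located in the PDF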
pdf_table_extraction.py (file deleted)
@@ -1,110 +0,0 @@
import pandas as pd
import os
import tqdm
import json_repair
import json
from glob import glob
import fitz
import re
import time
import traceback

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from utils.gpt_utils import chat


class PDFTableExtraction:
    """
    Iterate PDF pages
    Extract tables from PDF pages
    Save these tables as markdown files
    """
    def __init__(self,
                 pdf_file: str,
                 output_folder: str) -> None:
        self.pdf_file = pdf_file
        self.pdf_file_name = os.path.basename(pdf_file)
        self.table_extraction_prompts = self.get_table_extraction_prompts()

        self.output_folder = output_folder
        os.makedirs(output_folder, exist_ok=True)

        self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/')
        os.makedirs(self.prompts_output_folder, exist_ok=True)

        self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/')
        os.makedirs(self.json_output_folder, exist_ok=True)

        self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/')
        os.makedirs(self.table_md_output_folder, exist_ok=True)

    def get_table_extraction_prompts(self):
        instructions_file = r'./instructions/table_extraction_prompts.txt'
        with open(instructions_file, 'r', encoding='utf-8') as file:
            return file.read()

    def extract_tables(self):
        try:
            if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file):
                logger.error(f"Invalid pdf_file: {self.pdf_file}")
                return
            logger.info(f"Start processing {self.pdf_file}")
            pdf_util = PDFUtil(self.pdf_file)
            success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder)
            if success:
                logger.info(f"Successfully extracted text from {self.pdf_file}")

            for page_num, page_text in page_text_dict.items():
                try:
                    self.extract_tables_from_page(page_text, page_num)
                except Exception as e:
                    traceback.print_exc()
                    logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
        except Exception as e:
            logger.error(f"Error in extracting PDF tables: {str(e)}")

    def extract_tables_from_page(self, page_text: str, page_num: int):
        pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
        table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
        prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
        if os.path.exists(prompts_response_file):
            logger.info(f"Prompts response file already exists: {prompts_response_file}")
            return

        response, with_error = chat(table_extraction_prompts)
        if with_error:
            logger.error(f"Error in extracting tables from page")
            return

        json_response = re.search(r'```json([\s\S]*)```', response)
        if json_response is None:
            logger.info(f"Can't extract tables from page")
            return

        table_json_text = json_response.group(1)
        table_data = {"tables": []}
        try:
            table_data = json.loads(table_json_text)
        except:
            table_data = json_repair.loads(table_json_text)
        self.save_table_data(table_data, page_num)

        prompts_response = f'{table_extraction_prompts}\n\n{response}'
        with open(prompts_response_file, 'w', encoding='utf-8') as file:
            file.write(prompts_response)

    def save_table_data(self, table_data: dict, page_num: int):
        pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
        json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
        with open(json_output_file, 'w', encoding='utf-8') as file:
            file.write(json.dumps(table_data, indent=4))

        table_list = table_data.get('tables', [])
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
            table = re.sub(r'(\n)+', '\n', table)
            with open(table_md_file, 'w', encoding='utf-8') as file:
                file.write(table)
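A minimal usage sketch for the PDFTableExtraction class above; the PDF path and output folder are illustrative only, and ./instructions/table_extraction_prompts.txt must exist for get_table_extraction_prompts() to succeed.

# Hypothetical driver for the deleted helper; paths are assumptions.
extractor = PDFTableExtraction(
    pdf_file=r"./data/emea_ar/pdf/501380553.pdf",       # sample PDF also used in playground.ipynb
    output_folder=r"./data/emea_ar/output/pdf_table/",  # illustrative output folder
)
extractor.extract_tables()  # writes <name>_<page>.json and <name>_<page>_<table>.md per page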
playground.ipynb (713 lines, file deleted)
@@ -1,713 +0,0 @@
# In[27]:
from utils.biz_utils import add_slash_to_text_as_regex
import json
import re

# In[29]:
regex = r"Turnover \n"

# In[30]:
add_slash_to_text_as_regex(regex)
# Out[30]: 'Turnover\\s+\\n'

# In[46]:
text = "What was the share of investments made in transitional and enabling activities? \nTaxonomy-aligned\nactivities are expressed \nas a share of\n\u2022\t Turnover reflects the\n"

# In[32]:
re.search(regex, text)
# Out[32]: <re.Match object; span=(141, 151), match='Turnover \n'>

# In[35]:
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"

# In[47]:
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
    datapoint_keywords_config = json.load(file)

tor_regex_list = datapoint_keywords_config.get("tor", {}).get("english", [])

for tor_regex in tor_regex_list:
    regex = add_slash_to_text_as_regex(tor_regex)
    search = re.search(regex, text)
    if search:
        print(f"{regex} match {search.group()}")
    else:
        print(f"{regex} no match")
# stdout:
# TOR no match
# Turnover\*\s+\n no match
# Turnover\s+\n no match
# Turnover\s+Ratio no match
# Turnover\s+Rate no match
# Portfolio\s+Turnover no match
# Portfolio\s+turnover\s+ratio no match
# Portfolio\s+turnover\s+rate no match
# PTR no match
# Annual\s+Portfolio\s+Turnover\s+Ratio no match

# In[8]:
from utils.sql_query_util import query_investment_by_provider, query_document_fund_mapping
import pandas as pd

# In[3]:
provider_mapping = query_investment_by_provider(company_id="0C00008QVP")

# In[6]:
provider_mapping[provider_mapping["FundName"].str.contains("T. Rowe Price Funds Series II SICAV")]
# Out[6]: rows 840-848, ProviderId 0C00008QVP,
# ProviderName "T. Rowe Price (Luxembourg) Management S.à r.l.",
# FundName/ShareClassName truncated in the notebook display:
#      FundId      ISIN          SecId       CurrencyId  ShareClassStatus  FundName
# 840  FS0000DUH4  LU1053597990  F000010MEE  USD         0                 T. Rowe Price Funds Series II SICAV - Credit O...
# 841  FS0000DUH4  LU1053597727  F000010MEF  USD         0                 T. Rowe Price Funds Series II SICAV - Credit O...
# 842  FS0000DUH5  LU0993574440  F000010MEG  USD         1                 T. Rowe Price Funds Series II SICAV - Floating...
# 843  FS0000DUH5  LU1805616171  F000010PUN  CHF         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 844  FS0000DUH5  LU1076358073  F000010MEH  EUR         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 845  FS0000DUH5  LU2046740358  F0000143Y8  USD         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 846  FS0000DUH5  LU2046740432  F0000143Y9  USD         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 847  FS0000DUH5  LU0993569101  F00001564H  USD         0                 T. Rowe Price Funds Series II SICAV - Floating...
# 848  FS0000DUH5  LU2122516821  F000014UPK  AUD         0                 T. Rowe Price Funds Series II SICAV - Floating...

# In[9]:
document_mapping = query_document_fund_mapping(doc_id="486378555")

# In[10]:
document_mapping
# Out[10]: rows 0-5, DocumentId 486378555, EffectiveDate 2022-06-30, DocumentType 4,
# Format PDF, Language 0L00000122, DocumentStatus 1, ProviderId 0C00008QVP,
# ProviderName "T. Rowe Price (Luxembourg) Management S.à r.l.", FundId FS0000DUH5,
# Domicile LUX, FundName/ShareClassName "T. Rowe Price Funds Series II SICAV - Floating..." (truncated):
#    SecId       CurrencyId  ISIN
# 0  F000010MEG  USD         LU0993574440
# 1  F000010PUN  CHF         LU1805616171
# 2  F000010MEH  EUR         LU1076358073
# 3  F0000143Y8  USD         LU2046740358
# 4  F0000143Y9  USD         LU2046740432
# 5  F000014UPK  AUD         LU2122516821

# In[11]:
list(document_mapping["ShareClassName"].unique())
# Out[11]:
# ['T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund I Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (CHF) Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (EUR) Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Q (USD) Cap',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Qd (USD) Dis',
#  'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Sdn (AUD) Dis']

# In[1]:
import pymupdf4llm

# In[2]:
md_text = pymupdf4llm.to_markdown(r"./data/emea_ar/pdf/501380553.pdf")

# now work with the markdown text, e.g. store as a UTF8-encoded file
import pathlib
pathlib.Path(r"./data/emea_ar/output/markdown/501380553.md").write_bytes(md_text.encode())
# stdout: Processing ./data/emea_ar/pdf/501380553.pdf... (progress bar over 47 pages)
# Out[2]: 107851

# In[3]:
def get_fund_name(fund_name: str, fund_feature: str):
    if not fund_name.endswith(fund_feature):
        return fund_name
    fund_feature = fund_feature + " "
    fund_name_split = fund_name.split(fund_feature)
    if len(fund_name_split) > 1:
        last_fund = fund_name_split[-1].strip()
        if len(last_fund) == 0:
            last_fund = fund_name_split[-2].strip()
        fund_name = f"{last_fund} {fund_feature}"
    return fund_name

# In[2]:
get_fund_name("A Fund B Fund C Fund", "Fund")
# Out[2]: 'C Fund'

# In[5]:
fund_name = "JPMorgan Investment Fund - Global Income Conservative Fund"

# In[6]:
get_fund_name(fund_name, "Fund")
# Out[6]: '- Global Income Conservative Fund Fund '

# In[ ]:
# (empty cell)

# kernelspec: torch2_real (Python 3.10.11); nbformat 4.2
playground.py (137 lines, file deleted)
@@ -1,137 +0,0 @@
import os
import json
import base64
import json_repair
from utils.pdf_util import PDFUtil
from utils.logger import logger
from utils.gpt_utils import chat


def get_base64_pdf_image_list(
    pdf_file: str, pdf_page_index_list: list, output_folder: str = None
) -> dict:
    if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None
    pdf_util = PDFUtil(pdf_file)
    if pdf_page_index_list is None or len(pdf_page_index_list) == 0:
        pdf_page_index_list = list(range(pdf_util.get_page_count()))
    if output_folder is not None and len(output_folder) > 0:
        os.makedirs(output_folder, exist_ok=True)
    pdf_image_info = pdf_util.extract_images(
        pdf_page_index_list=pdf_page_index_list, output_folder=output_folder
    )
    return pdf_image_info


def encode_image(image_path: str):
    if image_path is None or len(image_path) == 0 or not os.path.exists(image_path):
        return None
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def chat_with_image(
    pdf_file: str,
    pdf_page_index_list: list,
    image_instructions_file: str,
    image_folder: str,
    gpt_folder: str,
):
    if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
        logger.error("pdf_file is not provided")
        return None
    pdf_image_info = get_base64_pdf_image_list(
        pdf_file, pdf_page_index_list, image_folder
    )

    with open(image_instructions_file, "r", encoding="utf-8") as file:
        image_instructions = file.read()
    os.makedirs(gpt_folder, exist_ok=True)
    pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
    response_list = {}
    for page_index, data in pdf_image_info.items():
        logger.info(f"Processing image in page {page_index}")
        image_file = data.get("img_file", None)
        image_base64 = data.get("img_base64", None)
        response, error = chat(prompt=image_instructions, image_base64=image_base64)
        if error:
            logger.error(f"Error in processing image in page {page_index}")
            continue
        try:
            response_json = json.loads(response)
        except:
            response_json = json_repair.loads(response)
        response_json_file = os.path.join(
            gpt_folder, f"{pdf_base_name}_{page_index}.json"
        )
        with open(response_json_file, "w", encoding="utf-8") as file:
            json.dump(response_json, file, indent=4)
        logger.info(f"Response for image in page {page_index}: {response}")
    logger.info("Done")


if __name__ == "__main__":
    # Table extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/382366116.pdf"
    # pdf_page_index_list = [29, 35, 71, 77, 83, 89, 97, 103, 112, 121, 130, 140, 195, 250, 305]
    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
    # pdf_page_index_list = [29]
    # image_instructions_file = r"./instructions/table_extraction_image_prompts_v2.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/table/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )

    # Data extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
    # pdf_page_index_list = [29]
    pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    pdf_page_index_list = [13]
    image_output_folder = r"/data/emea_ar/small_pdf_image/"
    gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/"
    image_instructions_file = r"./instructions/data_extraction_image_prompts.txt"
    chat_with_image(
        pdf_file,
        pdf_page_index_list,
        image_instructions_file,
        image_output_folder,
        gpt_output_folder,
    )

    # Text extraction by image
    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # image_instructions_file = r"./instructions/text_extraction_image_prompts.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )

    # pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
    # pdf_page_index_list = [13]
    # image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt"
    # image_output_folder = r"/data/emea_ar/small_pdf_image/"
    # gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/"
    # chat_with_image(
    #     pdf_file,
    #     pdf_page_index_list,
    #     image_instructions_file,
    #     image_output_folder,
    #     gpt_output_folder,
    # )
specific_calc_metrics.py (file deleted)
@@ -1,277 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from utils.logger import logger


def calculate_complex_document_metrics(verify_file_path: str, document_list: list = []):
    data_df_1 = pd.read_excel(verify_file_path, sheet_name="data_in_doc_mapping")
    # convert doc_id column to string
    data_df_1["doc_id"] = data_df_1["doc_id"].astype(str)
    data_df_1 = data_df_1[data_df_1["raw_check"].isin([0, 1])]

    exclude_documents = ["532422548"]
    # drop rows whose doc_id is in exclude_documents
    data_df_1 = data_df_1[~data_df_1["doc_id"].isin(exclude_documents)]

    if document_list is not None and len(document_list) > 0:
        data_df_1 = data_df_1[data_df_1["doc_id"].isin(document_list)]

    data_df_2 = pd.read_excel(verify_file_path, sheet_name="total_mapping_data")
    data_df_2["doc_id"] = data_df_2["doc_id"].astype(str)
    data_df_2 = data_df_2[data_df_2["raw_check"].isin([0, 1])]

    data_df = pd.concat([data_df_1, data_df_2], ignore_index=True)

    data_df.fillna("", inplace=True)
    data_df.reset_index(drop=True, inplace=True)

    metrics_df_list = []
    doc_id_list = data_df["doc_id"].unique().tolist()
    for doc_id in tqdm(doc_id_list):
        try:
            document_data_df = data_df[data_df["doc_id"] == doc_id]
            document_metrics_df = calc_metrics(document_data_df, doc_id)
            metrics_df_list.append(document_metrics_df)
        except Exception as e:
            logger.error(f"Error when calculating metrics for document {doc_id}")
            print_exc()

    total_metrics_df = calc_metrics(data_df, doc_id=None)
    metrics_df_list.append(total_metrics_df)

    all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
    all_metrics_df.reset_index(drop=True, inplace=True)

    output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
    output_metrics_file = os.path.join(output_folder,
                                       f"complex_{verify_file_name}_metrics_all.xlsx")
    with pd.ExcelWriter(output_metrics_file) as writer:
        all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")


def calc_metrics(data_df: pd.DataFrame, doc_id: str = None):
    # tor data
    tor_data_df = data_df[data_df["datapoint"] == "tor"]
    if len(tor_data_df) > 0:
        tor_metrics = get_sub_metrics(tor_data_df, "tor", doc_id)
        logger.info(f"TOR metrics: {tor_metrics}")
    else:
        tor_metrics = None

    # ter data
    ter_data_df = data_df[data_df["datapoint"] == "ter"]
    if len(ter_data_df) > 0:
        ter_metrics = get_sub_metrics(ter_data_df, "ter", doc_id)
        logger.info(f"TER metrics: {ter_metrics}")
    else:
        ter_metrics = None

    # ogc data
    ogc_data_df = data_df[data_df["datapoint"] == "ogc"]
    if len(ogc_data_df) > 0:
        ogc_metrics = get_sub_metrics(ogc_data_df, "ogc", doc_id)
        logger.info(f"OGC metrics: {ogc_metrics}")
    else:
        ogc_metrics = None

    # performance_fee data
    performance_fee_data_df = data_df[data_df["datapoint"] == "performance_fee"]
    if len(performance_fee_data_df) > 0:
        performance_fee_metrics = get_sub_metrics(performance_fee_data_df, "performance_fee", doc_id)
        logger.info(f"Performance fee metrics: {performance_fee_metrics}")
    else:
        performance_fee_metrics = None

    metrics_candidates = [tor_metrics, ter_metrics, ogc_metrics, performance_fee_metrics]
    metrics_list = [metrics for metrics in metrics_candidates if metrics is not None]
    metrics_df = pd.DataFrame(metrics_list)
    # add average metrics
    if doc_id is not None and len(doc_id) > 0:
        avg_metrics = {
            "DocumentId": doc_id,
            "DataPoint": "average",
            "F1": metrics_df["F1"].mean(),
            "Precision": metrics_df["Precision"].mean(),
            "Recall": metrics_df["Recall"].mean(),
            "Accuracy": metrics_df["Accuracy"].mean(),
            "Support": metrics_df["Support"].sum()
        }
    else:
        avg_metrics = {
            "DocumentId": "All",
            "DataPoint": "average",
            "F1": metrics_df["F1"].mean(),
            "Precision": metrics_df["Precision"].mean(),
            "Recall": metrics_df["Recall"].mean(),
            "Accuracy": metrics_df["Accuracy"].mean(),
            "Support": metrics_df["Support"].sum()
        }

    metrics_list.append(avg_metrics)
    metrics_df = pd.DataFrame(metrics_list)
    metrics_df.reset_index(drop=True, inplace=True)
    return metrics_df


def get_sub_metrics(data_df: pd.DataFrame, data_point: str, doc_id: str = None) -> dict:
    data_df_raw_check_1 = data_df[data_df["raw_check"] == 1]
    gt_list = [1] * len(data_df_raw_check_1)
    pre_list = [1] * len(data_df_raw_check_1)

    data_df_raw_check_0 = data_df[data_df["raw_check"] == 0]
    for index, row in data_df_raw_check_0.iterrows():
        if row["raw_check_comment"] == "modify":
            gt_list.append(0)
            pre_list.append(1)

            gt_list.append(1)
            pre_list.append(0)
        elif row["raw_check_comment"] == "incorrect":
            gt_list.append(0)
            pre_list.append(1)
        elif row["raw_check_comment"] == "supplement":
            gt_list.append(1)
            pre_list.append(0)
        else:
            pass

    # calculate metrics
    accuracy = accuracy_score(gt_list, pre_list)
    precision = precision_score(gt_list, pre_list)
    recall = recall_score(gt_list, pre_list)
    f1 = f1_score(gt_list, pre_list)
    support = sum(gt_list)
    if doc_id is not None and len(doc_id) > 0:
        metrics = {
            "DocumentId": doc_id,
            "DataPoint": data_point,
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
            "Accuracy": accuracy,
            "Support": support
        }
    else:
        metrics = {
            "DocumentId": "All",
            "DataPoint": data_point,
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
            "Accuracy": accuracy,
            "Support": support
        }
    return metrics


def get_metrics_based_documents(metrics_file: str, document_list: list):
    metrics_df = pd.read_excel(metrics_file, sheet_name="metrics")
    metrics_df_list = []
    for doc_id in tqdm(document_list):
        try:
            document_metrics_df = metrics_df[metrics_df["DocumentId"] == doc_id]
            metrics_df_list.append(document_metrics_df)
        except Exception as e:
            logger.error(f"Error when calculating metrics for document {doc_id}")
            print_exc()
    metrics_document_df = pd.concat(metrics_df_list, ignore_index=True)

    stats_metrics_list = []
    tor_df = metrics_document_df[metrics_document_df["DataPoint"] == "tor"]
    if len(tor_df) > 0:
        tor_metrics = {
            "DocumentId": "All",
            "DataPoint": "tor",
            "F1": tor_df["F1"].mean(),
            "Precision": tor_df["Precision"].mean(),
            "Recall": tor_df["Recall"].mean(),
            "Accuracy": tor_df["Accuracy"].mean(),
            "Support": tor_df["Support"].sum()
        }
        stats_metrics_list.append(tor_metrics)
    ter_df = metrics_document_df[metrics_document_df["DataPoint"] == "ter"]
    if len(ter_df) > 0:
        ter_metrics = {
            "DocumentId": "All",
            "DataPoint": "ter",
            "F1": ter_df["F1"].mean(),
            "Precision": ter_df["Precision"].mean(),
            "Recall": ter_df["Recall"].mean(),
            "Accuracy": ter_df["Accuracy"].mean(),
            "Support": ter_df["Support"].sum()
        }
        stats_metrics_list.append(ter_metrics)
    ogc_df = metrics_document_df[metrics_document_df["DataPoint"] == "ogc"]
    if len(ogc_df) > 0:
        ogc_metrics = {
            "DocumentId": "All",
            "DataPoint": "ogc",
            "F1": ogc_df["F1"].mean(),
            "Precision": ogc_df["Precision"].mean(),
            "Recall": ogc_df["Recall"].mean(),
            "Accuracy": ogc_df["Accuracy"].mean(),
            "Support": ogc_df["Support"].sum()
        }
        stats_metrics_list.append(ogc_metrics)
    performance_fee_df = metrics_document_df[metrics_document_df["DataPoint"] == "performance_fee"]
    if len(performance_fee_df) > 0:
        performance_fee_metrics = {
            "DocumentId": "All",
            "DataPoint": "performance_fee",
            "F1": performance_fee_df["F1"].mean(),
            "Precision": performance_fee_df["Precision"].mean(),
            "Recall": performance_fee_df["Recall"].mean(),
            "Accuracy": performance_fee_df["Accuracy"].mean(),
            "Support": performance_fee_df["Support"].sum()
        }
        stats_metrics_list.append(performance_fee_metrics)
    average_df = metrics_document_df[metrics_document_df["DataPoint"] == "average"]
    if len(average_df) > 0:
        avg_metrics = {
            "DocumentId": "All",
            "DataPoint": "average",
            "F1": average_df["F1"].mean(),
            "Precision": average_df["Precision"].mean(),
            "Recall": average_df["Recall"].mean(),
            "Accuracy": average_df["Accuracy"].mean(),
            "Support": average_df["Support"].sum()
        }
        stats_metrics_list.append(avg_metrics)

    stats_metrics_df = pd.DataFrame(stats_metrics_list)
    metrics_df_list.append(stats_metrics_df)
    all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
    all_metrics_df.reset_index(drop=True, inplace=True)

    output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file_name = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_remain_7.xlsx"
    output_metrics_file = os.path.join(output_folder, verify_file_name)
    with pd.ExcelWriter(output_metrics_file) as writer:
        all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")

    return all_metrics_df


if __name__ == "__main__":
    file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
    verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx"
    verify_file_path = os.path.join(file_folder, verify_file)
    calculate_complex_document_metrics(verify_file_path=verify_file_path,
                                       document_list=None)
    document_list = ["492029971",
                     "510300817",
                     "512745032",
                     "514213638",
                     "527525440",
                     "534535767"]
    metrics_file = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_all.xlsx"
    metrics_file_path = os.path.join(file_folder, metrics_file)
    # get_metrics_based_documents(metrics_file=metrics_file_path,
    #                             document_list=document_list)
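A toy illustration of how get_sub_metrics() above turns reviewer labels into ground-truth/prediction pairs; the label sequence below is invented for the example.

# raw_check == 1  -> (1, 1)  correct extraction
# "modify"        -> (0, 1) and (1, 0)  wrong value plus the missed true value
# "incorrect"     -> (0, 1)  false positive
# "supplement"    -> (1, 0)  missed value
from sklearn.metrics import precision_score, recall_score

gt_list  = [1, 1, 0, 1, 0, 1]   # two correct rows, one "modify" pair, one "incorrect", one "supplement"
pre_list = [1, 1, 1, 0, 1, 0]
print(precision_score(gt_list, pre_list), recall_score(gt_list, pre_list))  # 0.5 0.5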
test_specific_biz_logic.py (file deleted)
@@ -1,70 +0,0 @@
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from utils.logger import logger
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction


def test_validate_extraction_data():
    document_id = "481482392"
    pdf_file = f"/data/emea_ar/pdf/481482392.pdf"
    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
    output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
    document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False)
    filter_pages = FilterPages(
        document_id, pdf_file, document_mapping_info_df
    )
    page_text_dict = filter_pages.page_text_dict
    datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
    datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)
    data_extraction = DataExtraction(
        doc_source="emea_ar",
        doc_id=document_id,
        pdf_file=pdf_file,
        output_data_folder=output_extract_data_child_folder,
        page_text_dict=page_text_dict,
        datapoint_page_info=datapoint_page_info,
        datapoints=datapoints,
        document_mapping_info_df=document_mapping_info_df,
        extract_way="text",
        output_image_folder=None
    )
    output_data_json_folder = os.path.join(
        r"/data/emea_ar/output/extract_data/docs/by_text/", "json/"
    )
    os.makedirs(output_data_json_folder, exist_ok=True)
    json_file = os.path.join(output_data_json_folder, f"{document_id}.json")
    data_from_gpt = None
    if os.path.exists(json_file):
        logger.info(
            f"The document: {document_id} has been parsed, loading data from {json_file}"
        )
        with open(json_file, "r", encoding="utf-8") as f:
            data_from_gpt = json.load(f)
    for extract_data in data_from_gpt:
        page_index = extract_data["page_index"]
        if page_index == 451:
            logger.info(f"Page index: {page_index}")
            raw_answer = extract_data["raw_answer"]
            raw_answer_json = json.loads(raw_answer)
            extract_data_info = data_extraction.validate_data(raw_answer_json)
            print(extract_data_info)


def get_datapoint_page_info(filter_pages) -> tuple:
    datapoint_page_info, result_details = filter_pages.start_job()
    return datapoint_page_info, result_details


def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list:
    datapoints = list(datapoint_page_info.keys())
    if "doc_id" in datapoints:
        datapoints.remove("doc_id")
    return datapoints


if __name__ == "__main__":
    test_validate_extraction_data()