update for deployment

Blade He 2025-01-16 20:34:43 -06:00
parent fb4a6402f0
commit f10ff8ee33
9 changed files with 12 additions and 1467 deletions

.gitignore
View File

@@ -7,3 +7,9 @@
 /test_metrics
 /data
 /sample_documents/japan_prospectus.txt
+/pdf_table_extraction.py
+/playground.ipynb
+/playground.py
+/specific_calc_metrics.py
+/test_specific_biz_logic.py
+/drilldown_practice.py

View File

@@ -22,7 +22,7 @@ swagger = Swagger(app, template=template)
 @app.route('/automation/api/model/emea_ar', methods=['POST'])
 @swag_from('yml/emea_ar.yml')
-def us_ar_data_extract():
+def emea_ar_data_extract():
     """
     Extract EMEA AR cost data from EMEA LUX PDF document
     input sample:
@@ -40,6 +40,7 @@ def us_ar_data_extract():
         return jsonify({"error": "doc_id is required"}), 400
     pdf_folder = r"./data/emea_ar/pdf/"
+    output_pdf_text_folder = r"./data/emea_ar/output/pdf_text/"
     output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
     output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
     drilldown_folder = r"./data/emea_ar/output/drilldown/"
@@ -62,6 +63,7 @@ def us_ar_data_extract():
     emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
                                       doc_source="emea_ar",
                                       pdf_folder=pdf_folder,
+                                      output_pdf_text_folder=output_pdf_text_folder,
                                       output_extract_data_folder=output_extract_data_folder,
                                       output_mapping_data_folder=output_mapping_data_folder,
                                       extract_way=extract_way,
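For reference, a minimal way to exercise the renamed endpoint. The host and port are assumptions (a local Flask default); only the route, the POST method, and the required doc_id field come from the diff above, and doc_id 486378555 is just an example id used elsewhere in this repository.

import requests

# Hypothetical smoke test for the emea_ar route.
resp = requests.post(
    "http://localhost:5000/automation/api/model/emea_ar",
    json={"doc_id": "486378555"},
    timeout=600,
)
print(resp.status_code)
print(resp.json())  # a missing doc_id is answered with {"error": "doc_id is required"} and HTTP 400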

View File

@@ -4,6 +4,7 @@ import json_repair
 import re
 import fitz
 import pandas as pd
+from traceback import print_exc
 from utils.gpt_utils import chat
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
@@ -294,6 +295,8 @@ class DataExtraction:
         for current_page_data in page_data_list:
             if current_page_data in next_page_data_list:
                 next_page_data_list.remove(current_page_data)
+            if len(next_page_data_list) == 0:
+                break
         next_page_extract_data["extract_data"][
             "data"
         ] = next_page_data_list
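The two added lines short-circuit the cross-page de-duplication: once every record from the next page has already been seen on the current page, there is nothing left to scan. A standalone sketch of that pattern (the helper name is illustrative; it assumes the check sits inside the loop as rendered here):

def dedupe_next_page(page_data_list: list, next_page_data_list: list) -> list:
    """Drop next-page records that already appear on the current page, stopping early."""
    next_page_data_list = list(next_page_data_list)  # copy, so the sketch has no side effects
    for current_page_data in page_data_list:
        if current_page_data in next_page_data_list:
            next_page_data_list.remove(current_page_data)
        if len(next_page_data_list) == 0:
            break  # the added early exit: the next page has nothing new left
    return next_page_data_list

# e.g. dedupe_next_page([{"isin": "LU1"}, {"isin": "LU2"}], [{"isin": "LU2"}]) -> []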

View File

@@ -1,159 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import recall_score
from utils.logger import logger
from utils.pdf_util import PDFUtil
def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str):
extract_files = glob(extract_data_folder + '*.json')
for index, json_file in enumerate(tqdm(extract_files)):
try:
# doc_id = file.split('/')[-1].split('.')[0]
json_base_name = os.path.basename(json_file)
doc_id = json_base_name.split('.')[0]
logger.info(f"Processing {doc_id}")
pdf_file = os.path.join(pdf_folder, f"{doc_id}.pdf")
if not os.path.exists(pdf_file):
logger.error(f"PDF file not found for {doc_id}")
continue
with open(json_file, "r", encoding="utf-8") as f:
data_from_gpt = json.load(f)
drilldown_pdf_document(doc_id=doc_id,
pdf_file=pdf_file,
drilldown_folder=drilldown_folder,
data_from_gpt=data_from_gpt)
except Exception as e:
print_exc()
logger.error(f"Error in processing {doc_id}: {e}")
def drilldown_pdf_document(doc_id:str,
pdf_file: str,
drilldown_folder: str,
data_from_gpt: list) -> list:
logger.info(f"Drilldown PDF document for doc_id: {doc_id}")
pdf_util = PDFUtil(pdf_file)
drilldown_data_list = []
for data in data_from_gpt:
doc_id = str(data.get("doc_id", ""))
# if doc_id != "506326520":
# continue
page_index = data.get("page_index", -1)
if page_index == -1:
continue
extract_data_list = data.get("extract_data", {}).get("data", [])
dp_reported_name_dict = data.get("extract_data", {}).get("dp_reported_name", {})
if len(dp_reported_name_dict.keys()) == 0:
continue
highlighted_value_list = []
for extract_data in extract_data_list:
for data_point, value in extract_data.items():
if value in highlighted_value_list:
continue
if data_point in ["ter", "ogc", "performance_fee"]:
continue
drilldown_data = {
"doc_id": doc_id,
"page_index": page_index,
"data_point": data_point,
"parent_text_block": None,
"value": value,
"annotation_attribute": {}
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(value)
for data_point, reported_name in dp_reported_name_dict.items():
if reported_name in highlighted_value_list:
continue
data_point = f"{data_point}_reported_name"
drilldown_data = {
"doc_id": doc_id,
"page_index": page_index,
"data_point": data_point,
"parent_text_block": None,
"value": reported_name,
"annotation_attribute": {}
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(reported_name)
drilldown_result = []
if len(drilldown_data_list) > 0:
drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
output_pdf_folder=drilldown_folder)
if len(drilldown_result) > 0:
logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
annotation_list = drilldown_result.get("annotation_list", [])
for annotation in annotation_list:
annotation["doc_id"] = doc_id
if drilldown_folder is not None and len(drilldown_folder) > 0:
drilldown_data_folder = os.path.join(drilldown_folder, "data/")
os.makedirs(drilldown_data_folder, exist_ok=True)
drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")
drilldown_source_df = pd.DataFrame(drilldown_data_list)
annotation_list_df = pd.DataFrame(annotation_list)
# set drilldown_result_df column order as doc_id, pdf_file, page_index,
# data_point, value, matching_val_area, normalized_bbox
annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
"data_point", "value", "matching_val_area", "normalized_bbox"]]
logger.info(f"Writing drilldown data to {drilldown_file}")
with pd.ExcelWriter(drilldown_file) as writer:
drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")
def calculate_metrics():
drilldown_folder = r"/data/emea_ar/output/drilldown/"
drilldown_data_folder = os.path.join(drilldown_folder, "data/")
drilldown_files = glob(drilldown_data_folder + '*.xlsx')
y_true_list = []
y_pred_list = []
series_list = []
for drilldown_file in drilldown_files:
drilldown_file_base_name = os.path.basename(drilldown_file)
if drilldown_file_base_name.startswith("~"):
continue
drilldown_data = pd.read_excel(drilldown_file, sheet_name="drilldown_data")
for index, row in drilldown_data.iterrows():
matching_val_area = row["matching_val_area"]
# transform matching_val_area to list
if isinstance(matching_val_area, str):
matching_val_area = eval(matching_val_area)
y_true_list.append(1)
if len(matching_val_area) > 0:
y_pred_list.append(1)
else:
y_pred_list.append(0)
series_list.append(row)
recall = recall_score(y_true_list, y_pred_list)
logger.info(f"Recall: {recall}, Support: {len(y_true_list)}")
no_annotation_df = pd.DataFrame(series_list)
no_annotation_df.reset_index(drop=True, inplace=True)
metrics_folder = os.path.join(drilldown_folder, "metrics/")
os.makedirs(metrics_folder, exist_ok=True)
metrics_file = os.path.join(metrics_folder, "metrics.xlsx")
metrics_result = {
"recall": recall,
"support": len(y_true_list)
}
metrics_df = pd.DataFrame([metrics_result])
with pd.ExcelWriter(metrics_file) as writer:
metrics_df.to_excel(writer, index=False, sheet_name="metrics")
no_annotation_df.to_excel(writer, index=False, sheet_name="no_annotation")
if __name__ == "__main__":
pdf_folder = r"/data/emea_ar/pdf/"
drilldown_folder = r"/data/emea_ar/output/drilldown/"
extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
drilldown_documents(pdf_folder=pdf_folder, extract_data_folder=extract_data_folder, drilldown_folder=drilldown_folder)
# calculate_metrics()
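Since calculate_metrics labels every extracted value as a positive (y_true is all ones) and predicts a positive only when matching_val_area is non-empty, the reported recall is simply the share of extracted values that the drilldown located on the page. A small worked example with toy numbers, not taken from any document:

from sklearn.metrics import recall_score

# Five extracted values; four received at least one matching area in the PDF.
y_true = [1, 1, 1, 1, 1]
y_pred = [1, 1, 0, 1, 1]
print(recall_score(y_true, y_pred))  # 0.8 -> 4 of the 5 values were found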

View File

@@ -1,110 +0,0 @@
import pandas as pd
import os
import tqdm
import json_repair
import json
from glob import glob
import fitz
import re
import time
import traceback
from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from utils.gpt_utils import chat
class PDFTableExtraction:
"""
Iterate PDF pages
Extract tables from PDF pages
Save these tables as markdown files
"""
def __init__(self,
pdf_file: str,
output_folder: str) -> None:
self.pdf_file = pdf_file
self.pdf_file_name = os.path.basename(pdf_file)
self.table_extraction_prompts = self.get_table_extraction_prompts()
self.output_folder = output_folder
os.makedirs(output_folder, exist_ok=True)
self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/')
os.makedirs(self.prompts_output_folder, exist_ok=True)
self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/')
os.makedirs(self.json_output_folder, exist_ok=True)
self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/')
os.makedirs(self.table_md_output_folder, exist_ok=True)
def get_table_extraction_prompts(self):
instructions_file = r'./instructions/table_extraction_prompts.txt'
with open(instructions_file, 'r', encoding='utf-8') as file:
return file.read()
def extract_tables(self):
try:
if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file):
logger.error(f"Invalid pdf_file: {self.pdf_file}")
return
logger.info(f"Start processing {self.pdf_file}")
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder)
if success:
logger.info(f"Successfully extracted text from {self.pdf_file}")
for page_num, page_text in page_text_dict.items():
try:
self.extract_tables_from_page(page_text, page_num)
except Exception as e:
traceback.print_exc()
logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
except Exception as e:
logger.error(f"Error in extracting PDF tables: {str(e)}")
def extract_tables_from_page(self, page_text: str, page_num: int):
pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
if os.path.exists(prompts_response_file):
logger.info(f"Prompts response file already exists: {prompts_response_file}")
return
response, with_error = chat(table_extraction_prompts)
if with_error:
logger.error(f"Error in extracting tables from page")
return
json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response)
if json_response is None:
logger.info(f"Can't extract tables from page")
return
table_json_text = json_response.group(1)
table_data = {"tables": []}
try:
table_data = json.loads(table_json_text)
except:
table_data = json_repair.loads(table_json_text)
self.save_table_data(table_data, page_num)
prompts_response = f'{table_extraction_prompts}\n\n{response}'
with open(prompts_response_file, 'w', encoding='utf-8') as file:
file.write(prompts_response)
def save_table_data(self, table_data: dict, page_num: int):
pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
with open(json_output_file, 'w', encoding='utf-8') as file:
file.write(json.dumps(table_data, indent=4))
table_list = table_data.get('tables', [])
for table_num, table in enumerate(table_list):
table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
table = re.sub(r'(\n)+', '\n', table)
with open(table_md_file, 'w', encoding='utf-8') as file:
file.write(table)
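A minimal driver for the class above; it assumes the prompt template at ./instructions/table_extraction_prompts.txt and a configured chat helper are available, and the paths are placeholders rather than values from this commit (the PDF name is an example doc id used elsewhere in the repository).

# Hypothetical usage sketch for PDFTableExtraction.
extractor = PDFTableExtraction(
    pdf_file=r"./data/emea_ar/pdf/501380553.pdf",       # example input document
    output_folder=r"./data/emea_ar/output/pdf_table/",  # placeholder output location
)
# Writes per-page prompt/response text, table JSON, and one markdown file per table
# under pdf_table_prompts/, pdf_table_json/ and pdf_table_markdown/.
extractor.extract_tables()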

View File

@@ -1,713 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils.biz_utils import add_slash_to_text_as_regex\n",
"import json\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"regex = r\"Turnover \\n\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Turnover\\\\s+\\\\n'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_slash_to_text_as_regex(regex)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"text = \"What was the share of investments made in transitional and enabling activities? \\nTaxonomy-aligned\\nactivities are expressed \\nas a share of\\n\\u2022\\t Turnover reflects the\\n\""
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<re.Match object; span=(141, 151), match='Turnover \\n'>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re.search(regex, text)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"datapoint_keywords_config_file = r\"./configuration/datapoint_keyword.json\""
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TOR no match\n",
"Turnover\\*\\s+\\n no match\n",
"Turnover\\s+\\n no match\n",
"Turnover\\s+Ratio no match\n",
"Turnover\\s+Rate no match\n",
"Portfolio\\s+Turnover no match\n",
"Portfolio\\s+turnover\\s+ratio no match\n",
"Portfolio\\s+turnover\\s+rate no match\n",
"PTR no match\n",
"Annual\\s+Portfolio\\s+Turnover\\s+Ratio no match\n"
]
}
],
"source": [
"with open(datapoint_keywords_config_file, \"r\", encoding=\"utf-8\") as file:\n",
" datapoint_keywords_config = json.load(file)\n",
"\n",
"tor_regex_list = datapoint_keywords_config.get(\"tor\", {}).get(\"english\", [])\n",
"\n",
"for tor_regex in tor_regex_list:\n",
" regex = add_slash_to_text_as_regex(tor_regex)\n",
" search = re.search(regex, text)\n",
" if search:\n",
" print(f\"{regex} match {search.group()}\")\n",
" else:\n",
" print(f\"{regex} no match\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from utils.sql_query_util import query_investment_by_provider, query_document_fund_mapping\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"provider_mapping = query_investment_by_provider(company_id=\"0C00008QVP\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ProviderId</th>\n",
" <th>ProviderName</th>\n",
" <th>FundId</th>\n",
" <th>FundName</th>\n",
" <th>ISIN</th>\n",
" <th>SecId</th>\n",
" <th>CurrencyId</th>\n",
" <th>ShareClassName</th>\n",
" <th>ShareClassStatus</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>840</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH4</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>LU1053597990</td>\n",
" <td>F000010MEE</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>841</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH4</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>LU1053597727</td>\n",
" <td>F000010MEF</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Credit O...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>842</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU0993574440</td>\n",
" <td>F000010MEG</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>843</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1805616171</td>\n",
" <td>F000010PUN</td>\n",
" <td>CHF</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>844</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1076358073</td>\n",
" <td>F000010MEH</td>\n",
" <td>EUR</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>845</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740358</td>\n",
" <td>F0000143Y8</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>846</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740432</td>\n",
" <td>F0000143Y9</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>847</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU0993569101</td>\n",
" <td>F00001564H</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>848</th>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2122516821</td>\n",
" <td>F000014UPK</td>\n",
" <td>AUD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ProviderId ProviderName FundId \\\n",
"840 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n",
"841 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH4 \n",
"842 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"843 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"844 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"845 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"846 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"847 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"848 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"\n",
" FundName ISIN \\\n",
"840 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597990 \n",
"841 T. Rowe Price Funds Series II SICAV - Credit O... LU1053597727 \n",
"842 T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n",
"843 T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n",
"844 T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n",
"845 T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n",
"846 T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n",
"847 T. Rowe Price Funds Series II SICAV - Floating... LU0993569101 \n",
"848 T. Rowe Price Funds Series II SICAV - Floating... LU2122516821 \n",
"\n",
" SecId CurrencyId ShareClassName \\\n",
"840 F000010MEE USD T. Rowe Price Funds Series II SICAV - Credit O... \n",
"841 F000010MEF USD T. Rowe Price Funds Series II SICAV - Credit O... \n",
"842 F000010MEG USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"843 F000010PUN CHF T. Rowe Price Funds Series II SICAV - Floating... \n",
"844 F000010MEH EUR T. Rowe Price Funds Series II SICAV - Floating... \n",
"845 F0000143Y8 USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"846 F0000143Y9 USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"847 F00001564H USD T. Rowe Price Funds Series II SICAV - Floating... \n",
"848 F000014UPK AUD T. Rowe Price Funds Series II SICAV - Floating... \n",
"\n",
" ShareClassStatus \n",
"840 0 \n",
"841 0 \n",
"842 1 \n",
"843 0 \n",
"844 0 \n",
"845 0 \n",
"846 0 \n",
"847 0 \n",
"848 0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"provider_mapping[provider_mapping[\"FundName\"].str.contains(\"T. Rowe Price Funds Series II SICAV\")]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"document_mapping = query_document_fund_mapping(doc_id=\"486378555\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DocumentId</th>\n",
" <th>EffectiveDate</th>\n",
" <th>DocumentType</th>\n",
" <th>Format</th>\n",
" <th>Language</th>\n",
" <th>DocumentStatus</th>\n",
" <th>ProviderId</th>\n",
" <th>ProviderName</th>\n",
" <th>FundId</th>\n",
" <th>FundName</th>\n",
" <th>Domicile</th>\n",
" <th>SecId</th>\n",
" <th>CurrencyId</th>\n",
" <th>ShareClassName</th>\n",
" <th>ISIN</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000010MEG</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU0993574440</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000010PUN</td>\n",
" <td>CHF</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1805616171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000010MEH</td>\n",
" <td>EUR</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU1076358073</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F0000143Y8</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740358</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F0000143Y9</td>\n",
" <td>USD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2046740432</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>486378555</td>\n",
" <td>2022-06-30</td>\n",
" <td>4</td>\n",
" <td>PDF</td>\n",
" <td>0L00000122</td>\n",
" <td>1</td>\n",
" <td>0C00008QVP</td>\n",
" <td>T. Rowe Price (Luxembourg) Management S.à r.l.</td>\n",
" <td>FS0000DUH5</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LUX</td>\n",
" <td>F000014UPK</td>\n",
" <td>AUD</td>\n",
" <td>T. Rowe Price Funds Series II SICAV - Floating...</td>\n",
" <td>LU2122516821</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" DocumentId EffectiveDate DocumentType Format Language DocumentStatus \\\n",
"0 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"1 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"2 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"3 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"4 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"5 486378555 2022-06-30 4 PDF 0L00000122 1 \n",
"\n",
" ProviderId ProviderName FundId \\\n",
"0 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"1 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"2 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"3 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"4 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"5 0C00008QVP T. Rowe Price (Luxembourg) Management S.à r.l. FS0000DUH5 \n",
"\n",
" FundName Domicile SecId \\\n",
"0 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEG \n",
"1 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010PUN \n",
"2 T. Rowe Price Funds Series II SICAV - Floating... LUX F000010MEH \n",
"3 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y8 \n",
"4 T. Rowe Price Funds Series II SICAV - Floating... LUX F0000143Y9 \n",
"5 T. Rowe Price Funds Series II SICAV - Floating... LUX F000014UPK \n",
"\n",
" CurrencyId ShareClassName ISIN \n",
"0 USD T. Rowe Price Funds Series II SICAV - Floating... LU0993574440 \n",
"1 CHF T. Rowe Price Funds Series II SICAV - Floating... LU1805616171 \n",
"2 EUR T. Rowe Price Funds Series II SICAV - Floating... LU1076358073 \n",
"3 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740358 \n",
"4 USD T. Rowe Price Funds Series II SICAV - Floating... LU2046740432 \n",
"5 AUD T. Rowe Price Funds Series II SICAV - Floating... LU2122516821 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"document_mapping"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund I Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (CHF) Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Ih (EUR) Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Q (USD) Cap',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Qd (USD) Dis',\n",
" 'T. Rowe Price Funds Series II SICAV - Floating Rate Loan Fund Sdn (AUD) Dis']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(document_mapping[\"ShareClassName\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pymupdf4llm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing ./data/emea_ar/pdf/501380553.pdf...\n",
"[ ] (0/47[ ] ( 1/47[= ] ( 2/4[== ] ( 3/47[=== ] ( 4/4[==== ] ( 5/47[===== ] ( 6/47[===== ] ( 7/4[====== ] ( 8/47[======= ] ( 9/4[======== ] (10/47[========= ] (11/4[========== ] (12/47[=========== ] (13/47[=========== ] (14/4[============ ] (15/47[============= ] (16/4[============== ] (17/47[=============== ] (18/4[================ ] (19/47[================= ] (20/47[================= ] (21/4[================== ] (22/47[=================== ] (23/4[==================== ] (24/47[===================== ] (25/4[====================== ] (26/4[====================== ] (27/47[======================= ] (28/4[======================== ] (29/47[========================= ] (30/4[========================== ] (31/47[=========================== ] (32/4[============================ ] (33/4[============================ ] (34/47[============================= ] (35/4[============================== ] (36/47[=============================== ] (37/4[================================ ] (38/47[================================= ] (39/4[================================== ] (40/4[================================== ] (41/47[=================================== ] (42/4[==================================== ] (43/47[===================================== ] (44/4[====================================== ] (45/47[======================================= ] (46/47[========================================] (47/47]\n"
]
},
{
"data": {
"text/plain": [
"107851"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"md_text = pymupdf4llm.to_markdown(r\"./data/emea_ar/pdf/501380553.pdf\")\n",
"\n",
"# now work with the markdown text, e.g. store as a UTF8-encoded file\n",
"import pathlib\n",
"pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_fund_name(fund_name: str, fund_feature: str):\n",
" if not fund_name.endswith(fund_feature):\n",
" return fund_name\n",
" fund_feature = fund_feature + \" \"\n",
" fund_name_split = fund_name.split(fund_feature)\n",
" if len(fund_name_split) > 1:\n",
" last_fund = fund_name_split[-1].strip()\n",
" if len(last_fund) == 0:\n",
" last_fund = fund_name_split[-2].strip()\n",
" fund_name = f\"{last_fund} {fund_feature}\"\n",
" return fund_name"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'C Fund'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"fund_name = \"JPMorgan Investment Fund - Global Income Conservative Fund\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'- Global Income Conservative Fund Fund '"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_fund_name(fund_name, \"Fund\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "torch2_real",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
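For context on the first notebook cells: add_slash_to_text_as_regex (imported from utils.biz_utils, which is not part of this commit) turns a literal keyword into a whitespace-tolerant pattern; the cell output shows 'Turnover \n' becoming 'Turnover\s+\n'. A rough, hypothetical stand-in that reproduces only that observed behaviour:

import re

def add_slash_to_text_as_regex_sketch(text: str) -> str:
    # Assumption: runs of literal spaces become \s+; everything else is left untouched.
    # The real helper in utils.biz_utils may do more (e.g. escape '*', per the config patterns).
    return re.sub(r" +", r"\\s+", text)

print(add_slash_to_text_as_regex_sketch(r"Turnover \n"))  # Turnover\s+\n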

View File

@@ -1,137 +0,0 @@
import os
import json
import base64
import json_repair
from utils.pdf_util import PDFUtil
from utils.logger import logger
from utils.gpt_utils import chat
def get_base64_pdf_image_list(
pdf_file: str, pdf_page_index_list: list, output_folder: str = None
) -> dict:
if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
logger.error("pdf_file is not provided")
return None
pdf_util = PDFUtil(pdf_file)
if pdf_page_index_list is None or len(pdf_page_index_list) == 0:
pdf_page_index_list = list(range(pdf_util.get_page_count()))
if output_folder is not None and len(output_folder) > 0:
os.makedirs(output_folder, exist_ok=True)
pdf_image_info = pdf_util.extract_images(
pdf_page_index_list=pdf_page_index_list, output_folder=output_folder
)
return pdf_image_info
def encode_image(image_path: str):
if image_path is None or len(image_path) == 0 or not os.path.exists(image_path):
return None
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def chat_with_image(
pdf_file: str,
pdf_page_index_list: list,
image_instructions_file: str,
image_folder: str,
gpt_folder: str,
):
if pdf_file is None or pdf_file == "" or not os.path.exists(pdf_file):
logger.error("pdf_file is not provided")
return None
pdf_image_info = get_base64_pdf_image_list(
pdf_file, pdf_page_index_list, image_folder
)
with open(image_instructions_file, "r", encoding="utf-8") as file:
image_instructions = file.read()
os.makedirs(gpt_folder, exist_ok=True)
pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
response_list = {}
for page_index, data in pdf_image_info.items():
logger.info(f"Processing image in page {page_index}")
image_file = data.get("img_file", None)
image_base64 = data.get("img_base64", None)
response, error = chat(prompt=image_instructions, image_base64=image_base64)
if error:
logger.error(f"Error in processing image in page {page_index}")
continue
try:
response_json = json.loads(response)
except:
response_json = json_repair.loads(response)
response_json_file = os.path.join(
gpt_folder, f"{pdf_base_name}_{page_index}.json"
)
with open(response_json_file, "w", encoding="utf-8") as file:
json.dump(response_json, file, indent=4)
logger.info(f"Response for image in page {page_index}: {response}")
logger.info("Done")
if __name__ == "__main__":
# Table extraction by image
# pdf_file = r"/data/emea_ar/small_pdf/382366116.pdf"
# pdf_page_index_list = [29, 35, 71, 77, 83, 89, 97, 103, 112, 121, 130, 140, 195, 250, 305]
# pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
# pdf_page_index_list = [13]
# pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
# pdf_page_index_list = [29]
# image_instructions_file = r"./instructions/table_extraction_image_prompts_v2.txt"
# image_output_folder = r"/data/emea_ar/small_pdf_image/"
# gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/table/"
# chat_with_image(
# pdf_file,
# pdf_page_index_list,
# image_instructions_file,
# image_output_folder,
# gpt_output_folder,
# )
# Data extraction by image
# pdf_file = r"/data/emea_ar/small_pdf/402181770.pdf"
# pdf_page_index_list = [29]
pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
pdf_page_index_list = [13]
image_output_folder = r"/data/emea_ar/small_pdf_image/"
gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/data/"
image_instructions_file = r"./instructions/data_extraction_image_prompts.txt"
chat_with_image(
pdf_file,
pdf_page_index_list,
image_instructions_file,
image_output_folder,
gpt_output_folder,
)
# Text extraction by image
# pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
# pdf_page_index_list = [13]
# image_instructions_file = r"./instructions/text_extraction_image_prompts.txt"
# image_output_folder = r"/data/emea_ar/small_pdf_image/"
# gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/text/"
# chat_with_image(
# pdf_file,
# pdf_page_index_list,
# image_instructions_file,
# image_output_folder,
# gpt_output_folder,
# )
# pdf_file = r"/data/emea_ar/small_pdf/389171486.pdf"
# pdf_page_index_list = [13]
# image_instructions_file = r"./instructions/table_extraction_image_optimize_prompts.txt"
# image_output_folder = r"/data/emea_ar/small_pdf_image/"
# gpt_output_folder = r"/data/emea_ar/output/gpt_image_response/optimized_instructions/"
# chat_with_image(
# pdf_file,
# pdf_page_index_list,
# image_instructions_file,
# image_output_folder,
# gpt_output_folder,
# )
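chat_with_image only relies on PDFUtil.extract_images returning a mapping of page index to image metadata with img_file and img_base64 keys. An illustrative shape of that mapping (the values are placeholders, not real output; the file naming is assumed):

# Hypothetical pdf_image_info as consumed by chat_with_image above.
pdf_image_info = {
    13: {
        "img_file": "/data/emea_ar/small_pdf_image/389171486_13.png",  # assumed file naming
        "img_base64": "iVBORw0KGgo...",                                # truncated base64 payload
    },
}
for page_index, data in pdf_image_info.items():
    image_base64 = data.get("img_base64", None)
    # each page image is then sent to the model together with the prompt text:
    # response, error = chat(prompt=image_instructions, image_base64=image_base64)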

View File

@@ -1,277 +0,0 @@
from tqdm import tqdm
from glob import glob
import json
import pandas as pd
import os
from traceback import print_exc
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from utils.logger import logger
def calculate_complex_document_metrics(verify_file_path: str, document_list: list = []):
data_df_1 = pd.read_excel(verify_file_path, sheet_name="data_in_doc_mapping")
# convert doc_id column to string
data_df_1["doc_id"] = data_df_1["doc_id"].astype(str)
data_df_1 = data_df_1[data_df_1["raw_check"].isin([0, 1])]
exclude_documents = ["532422548"]
# remove data by doc_id not in exclude_documents
data_df_1 = data_df_1[~data_df_1["doc_id"].isin(exclude_documents)]
if document_list is not None and len(document_list) > 0:
data_df_1 = data_df_1[data_df_1["doc_id"].isin(document_list)]
data_df_2 = pd.read_excel(verify_file_path, sheet_name="total_mapping_data")
data_df_2["doc_id"] = data_df_2["doc_id"].astype(str)
data_df_2 = data_df_2[data_df_2["raw_check"].isin([0, 1])]
data_df = pd.concat([data_df_1, data_df_2], ignore_index=True)
data_df.fillna("", inplace=True)
data_df.reset_index(drop=True, inplace=True)
metrics_df_list = []
doc_id_list = data_df["doc_id"].unique().tolist()
for doc_id in tqdm(doc_id_list):
try:
document_data_df = data_df[data_df["doc_id"] == doc_id]
document_metrics_df = calc_metrics(document_data_df, doc_id)
metrics_df_list.append(document_metrics_df)
except Exception as e:
logger.error(f"Error when calculating metrics for document {doc_id}")
print_exc()
total_metrics_df = calc_metrics(data_df, doc_id=None)
metrics_df_list.append(total_metrics_df)
all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
all_metrics_df.reset_index(drop=True, inplace=True)
output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
output_metrics_file = os.path.join(output_folder,
f"complex_{verify_file_name}_metrics_all.xlsx")
with pd.ExcelWriter(output_metrics_file) as writer:
all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")
def calc_metrics(data_df: pd.DataFrame, doc_id: str = None):
# tor data
tor_data_df = data_df[data_df["datapoint"] == "tor"]
if len(tor_data_df) > 0:
tor_metrics = get_sub_metrics(tor_data_df, "tor", doc_id)
logger.info(f"TOR metrics: {tor_metrics}")
else:
tor_metrics = None
# ter data
ter_data_df = data_df[data_df["datapoint"] == "ter"]
if len(ter_data_df) > 0:
ter_metrics = get_sub_metrics(ter_data_df, "ter", doc_id)
logger.info(f"TER metrics: {ter_metrics}")
else:
ter_metrics = None
# ogc data
ogc_data_df = data_df[data_df["datapoint"] == "ogc"]
if len(ogc_data_df) > 0:
ogc_metrics = get_sub_metrics(ogc_data_df, "ogc", doc_id)
logger.info(f"OGC metrics: {ogc_metrics}")
else:
ogc_metrics = None
# performance_fee data
performance_fee_data_df = data_df[data_df["datapoint"] == "performance_fee"]
if len(performance_fee_data_df) > 0:
performance_fee_metrics = get_sub_metrics(performance_fee_data_df, "performance_fee", doc_id)
logger.info(f"Performance fee metrics: {performance_fee_metrics}")
else:
performance_fee_metrics = None
metrics_candidates = [tor_metrics, ter_metrics, ogc_metrics, performance_fee_metrics]
metrics_list = [metrics for metrics in metrics_candidates if metrics is not None]
metrics_df = pd.DataFrame(metrics_list)
# add average metrics
if doc_id is not None and len(doc_id) > 0:
avg_metrics = {
"DocumentId": doc_id,
"DataPoint": "average",
"F1": metrics_df["F1"].mean(),
"Precision": metrics_df["Precision"].mean(),
"Recall": metrics_df["Recall"].mean(),
"Accuracy": metrics_df["Accuracy"].mean(),
"Support": metrics_df["Support"].sum()
}
else:
avg_metrics = {
"DocumentId": "All",
"DataPoint": "average",
"F1": metrics_df["F1"].mean(),
"Precision": metrics_df["Precision"].mean(),
"Recall": metrics_df["Recall"].mean(),
"Accuracy": metrics_df["Accuracy"].mean(),
"Support": metrics_df["Support"].sum()
}
metrics_list.append(avg_metrics)
metrics_df = pd.DataFrame(metrics_list)
metrics_df.reset_index(drop=True, inplace=True)
return metrics_df
def get_sub_metrics(data_df: pd.DataFrame, data_point: str, doc_id: str = None) -> dict:
data_df_raw_check_1 = data_df[data_df["raw_check"] == 1]
gt_list = [1] * len(data_df_raw_check_1)
pre_list = [1] * len(data_df_raw_check_1)
data_df_raw_check_0 = data_df[data_df["raw_check"] == 0]
for index, row in data_df_raw_check_0.iterrows():
if row["raw_check_comment"] == "modify":
gt_list.append(0)
pre_list.append(1)
gt_list.append(1)
pre_list.append(0)
elif row["raw_check_comment"] == "incorrect":
gt_list.append(0)
pre_list.append(1)
elif row["raw_check_comment"] == "supplement":
gt_list.append(1)
pre_list.append(0)
else:
pass
# calculate metrics
accuracy = accuracy_score(gt_list, pre_list)
precision = precision_score(gt_list, pre_list)
recall = recall_score(gt_list, pre_list)
f1 = f1_score(gt_list, pre_list)
support = sum(gt_list)
if doc_id is not None and len(doc_id) > 0:
metrics = {
"DocumentId": doc_id,
"DataPoint": data_point,
"F1": f1,
"Precision": precision,
"Recall": recall,
"Accuracy": accuracy,
"Support": support
}
else:
metrics = {
"DocumentId": "All",
"DataPoint": data_point,
"F1": f1,
"Precision": precision,
"Recall": recall,
"Accuracy": accuracy,
"Support": support
}
return metrics
def get_metrics_based_documents(metrics_file: str, document_list: list):
metrics_df = pd.read_excel(metrics_file, sheet_name="metrics")
metrics_df_list = []
for doc_id in tqdm(document_list):
try:
document_metrics_df = metrics_df[metrics_df["DocumentId"] == doc_id]
metrics_df_list.append(document_metrics_df)
except Exception as e:
logger.error(f"Error when calculating metrics for document {doc_id}")
print_exc()
metrics_document_df = pd.concat(metrics_df_list, ignore_index=True)
stats_metrics_list = []
tor_df = metrics_document_df[metrics_document_df["DataPoint"] == "tor"]
if len(tor_df) > 0:
tor_metrics = {
"DocumentId": "All",
"DataPoint": "tor",
"F1": tor_df["F1"].mean(),
"Precision": tor_df["Precision"].mean(),
"Recall": tor_df["Recall"].mean(),
"Accuracy": tor_df["Accuracy"].mean(),
"Support": tor_df["Support"].sum()
}
stats_metrics_list.append(tor_metrics)
ter_df = metrics_document_df[metrics_document_df["DataPoint"] == "ter"]
if len(ter_df) > 0:
ter_metrics = {
"DocumentId": "All",
"DataPoint": "ter",
"F1": ter_df["F1"].mean(),
"Precision": ter_df["Precision"].mean(),
"Recall": ter_df["Recall"].mean(),
"Accuracy": ter_df["Accuracy"].mean(),
"Support": ter_df["Support"].sum()
}
stats_metrics_list.append(ter_metrics)
ogc_df = metrics_document_df[metrics_document_df["DataPoint"] == "ogc"]
if len(ogc_df) > 0:
ogc_metrics = {
"DocumentId": "All",
"DataPoint": "ogc",
"F1": ogc_df["F1"].mean(),
"Precision": ogc_df["Precision"].mean(),
"Recall": ogc_df["Recall"].mean(),
"Accuracy": ogc_df["Accuracy"].mean(),
"Support": ogc_df["Support"].sum()
}
stats_metrics_list.append(ogc_metrics)
performance_fee_df = metrics_document_df[metrics_document_df["DataPoint"] == "performance_fee"]
if len(performance_fee_df) > 0:
performance_fee_metrics = {
"DocumentId": "All",
"DataPoint": "performance_fee",
"F1": performance_fee_df["F1"].mean(),
"Precision": performance_fee_df["Precision"].mean(),
"Recall": performance_fee_df["Recall"].mean(),
"Accuracy": performance_fee_df["Accuracy"].mean(),
"Support": performance_fee_df["Support"].sum()
}
stats_metrics_list.append(performance_fee_metrics)
average_df = metrics_document_df[metrics_document_df["DataPoint"] == "average"]
if len(average_df) > 0:
avg_metrics = {
"DocumentId": "All",
"DataPoint": "average",
"F1": average_df["F1"].mean(),
"Precision": average_df["Precision"].mean(),
"Recall": average_df["Recall"].mean(),
"Accuracy": average_df["Accuracy"].mean(),
"Support": average_df["Support"].sum()
}
stats_metrics_list.append(avg_metrics)
stats_metrics_df = pd.DataFrame(stats_metrics_list)
metrics_df_list.append(stats_metrics_df)
all_metrics_df = pd.concat(metrics_df_list, ignore_index=True)
all_metrics_df.reset_index(drop=True, inplace=True)
output_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
verify_file_name = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_remain_7.xlsx"
output_metrics_file = os.path.join(output_folder, verify_file_name)
with pd.ExcelWriter(output_metrics_file) as writer:
all_metrics_df.to_excel(writer, index=False, sheet_name="metrics")
return all_metrics_df
if __name__ == "__main__":
file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx"
verify_file_path = os.path.join(file_folder, verify_file)
calculate_complex_document_metrics(verify_file_path=verify_file_path,
document_list=None)
document_list = ["492029971",
"510300817",
"512745032",
"514213638",
"527525440",
"534535767"]
metrics_file = "complex_mapping_data_info_31_documents_by_text_second_round_metrics_all.xlsx"
metrics_file_path = os.path.join(file_folder, metrics_file)
# get_metrics_based_documents(metrics_file=metrics_file_path,
# document_list=document_list)
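To make the review-label bookkeeping in get_sub_metrics concrete: a verified row (raw_check == 1) counts as a true positive, an "incorrect" comment as a false positive, a "supplement" comment as a false negative, and a "modify" comment as one of each. A toy example with made-up counts:

from sklearn.metrics import f1_score, precision_score, recall_score

# 8 verified rows, plus one "modify", one "incorrect" and one "supplement" comment.
gt_list  = [1] * 8 + [0, 1] + [0] + [1]
pre_list = [1] * 8 + [1, 0] + [1] + [0]
print(precision_score(gt_list, pre_list))  # 8 / 10 = 0.8
print(recall_score(gt_list, pre_list))     # 8 / 10 = 0.8
print(f1_score(gt_list, pre_list))         # 0.8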

View File

@@ -1,70 +0,0 @@
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from utils.logger import logger
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction
def test_validate_extraction_data():
document_id = "481482392"
pdf_file = f"/data/emea_ar/pdf/481482392.pdf"
output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False)
filter_pages = FilterPages(
document_id, pdf_file, document_mapping_info_df
)
page_text_dict = filter_pages.page_text_dict
datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)
data_extraction = DataExtraction(
doc_source="emea_ar",
doc_id=document_id,
pdf_file=pdf_file,
output_data_folder=output_extract_data_child_folder,
page_text_dict=page_text_dict,
datapoint_page_info=datapoint_page_info,
datapoints=datapoints,
document_mapping_info_df=document_mapping_info_df,
extract_way="text",
output_image_folder=None
)
output_data_json_folder = os.path.join(
r"/data/emea_ar/output/extract_data/docs/by_text/", "json/"
)
os.makedirs(output_data_json_folder, exist_ok=True)
json_file = os.path.join(output_data_json_folder, f"{document_id}.json")
data_from_gpt = None
if os.path.exists(json_file):
logger.info(
f"The document: {document_id} has been parsed, loading data from {json_file}"
)
with open(json_file, "r", encoding="utf-8") as f:
data_from_gpt = json.load(f)
for extract_data in data_from_gpt:
page_index = extract_data["page_index"]
if page_index == 451:
logger.info(f"Page index: {page_index}")
raw_answer = extract_data["raw_answer"]
raw_answer_json = json.loads(raw_answer)
extract_data_info = data_extraction.validate_data(raw_answer_json)
print(extract_data_info)
def get_datapoint_page_info(filter_pages) -> tuple:
datapoint_page_info, result_details = filter_pages.start_job()
return datapoint_page_info, result_details
def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list:
datapoints = list(datapoint_page_info.keys())
if "doc_id" in datapoints:
datapoints.remove("doc_id")
return datapoints
if __name__ == "__main__":
test_validate_extraction_data()
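For readers of the test above: get_datapoints_from_datapoint_page_info simply strips the doc_id key, so every other key of datapoint_page_info is treated as a data-point name. An illustrative shape, with invented page lists (only the key names and the doc id appear in the code above):

# Hypothetical datapoint_page_info as returned by FilterPages.start_job().
datapoint_page_info = {
    "doc_id": "481482392",
    "tor": [12, 13],
    "ter": [451],
    "ogc": [451],
    "performance_fee": [],
}
datapoints = [key for key in datapoint_page_info if key != "doc_id"]
print(datapoints)  # ['tor', 'ter', 'ogc', 'performance_fee']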