From f91e0cf1a807f40c1d90261533ffa2670a76d9cf Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 19 Aug 2024 17:59:32 -0500 Subject: [PATCH] auto-fix json data format --- pdf_table_extraction.py | 27 +++++++++++++++++++-------- prepare_data.py | 25 ++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/pdf_table_extraction.py b/pdf_table_extraction.py index ad462c5..852c3a5 100644 --- a/pdf_table_extraction.py +++ b/pdf_table_extraction.py @@ -1,11 +1,13 @@ import pandas as pd import os import tqdm +import json_repair import json from glob import glob import fitz import re import time +import traceback from utils.logger import logger from utils.pdf_download import download_pdf_from_documents_warehouse @@ -58,23 +60,24 @@ class PDFTableExtraction: try: self.extract_tables_from_page(page_text, page_num) except Exception as e: + traceback.print_exc() logger.error(f"Error in extracting tables from page {page_num}: {str(e)}") except Exception as e: logger.error(f"Error in extracting PDF tables: {str(e)}") def extract_tables_from_page(self, page_text: str, page_num: int): + pure_pdf_name = self.pdf_file_name.replace('.pdf', '') table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text) + prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt') + if os.path.exists(prompts_response_file): + logger.info(f"Prompts response file already exists: {prompts_response_file}") + return + response, with_error = chat(table_extraction_prompts) if with_error: logger.error(f"Error in extracting tables from page") return - prompts_response = f'{table_extraction_prompts}\n\n{response}' - - pure_pdf_name = self.pdf_file_name.replace('.pdf', '') - prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt') - with open(prompts_response_file, 'w', encoding='utf-8') as file: - file.write(prompts_response) json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response) if json_response is None: @@ -82,9 +85,16 @@ class PDFTableExtraction: return table_json_text = json_response.group(1) - table_data = json.loads(table_json_text) - prompts_response + table_data = {"tables": []} + try: + table_data = json.loads(table_json_text) + except: + table_data = json_repair.loads(table_json_text) self.save_table_data(table_data, page_num) + + prompts_response = f'{table_extraction_prompts}\n\n{response}' + with open(prompts_response_file, 'w', encoding='utf-8') as file: + file.write(prompts_response) def save_table_data(self, table_data: dict, page_num: int): pdf_pure_name = self.pdf_file_name.replace('.pdf', '') @@ -95,5 +105,6 @@ class PDFTableExtraction: table_list = table_data.get('tables', []) for table_num, table in enumerate(table_list): table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md') + table = re.sub(r'(\n)+', '\n', table) with open(table_md_file, 'w', encoding='utf-8') as file: file.write(table) \ No newline at end of file diff --git a/prepare_data.py b/prepare_data.py index c9ad9da..120d301 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -6,6 +6,8 @@ from glob import glob import fitz import re import time +import traceback +import json_repair from utils.logger import logger from utils.pdf_download import download_pdf_from_documents_warehouse @@ -75,6 +77,26 @@ def extract_pdf_table(pdf_folder: str, output_folder: str): pdf_table_extraction.extract_tables() +def analyze_json_error(): + text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt" + with open(text_file, 'r', encoding='utf-8') as file: + text = file.read() + json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', text) + if json_response: + json_text = json_response.group(1) + json_data = {"tables": []} + try: + json_data = json.loads(json_text) + except: + json_data = json_repair.loads(json_text) + table_list = json_data.get('tables', []) + for table_num, table in enumerate(table_list): + table_md_file = os.path.join("/temp/", f'temp_{table_num}.md') + table = re.sub(r'(\n)+', '\n', table) + with open(table_md_file, 'w', encoding='utf-8') as file: + file.write(table) + + if __name__ == '__main__': doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx" pdf_folder = r"/data/emea_ar/pdf/" @@ -82,4 +104,5 @@ if __name__ == '__main__': # get_unique_docids_from_doc_provider_data(doc_provider_file_path) # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder) # output_pdf_page_text(pdf_folder, output_folder) - extract_pdf_table(pdf_folder, output_folder) \ No newline at end of file + extract_pdf_table(pdf_folder, output_folder) + # analyze_json_error() \ No newline at end of file