auto-fix json data format

This commit is contained in:
Blade He 2024-08-19 17:59:32 -05:00
parent fa46b45ad5
commit f91e0cf1a8
2 changed files with 43 additions and 9 deletions

View File

@ -1,11 +1,13 @@
import pandas as pd import pandas as pd
import os import os
import tqdm import tqdm
import json_repair
import json import json
from glob import glob from glob import glob
import fitz import fitz
import re import re
import time import time
import traceback
from utils.logger import logger from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse from utils.pdf_download import download_pdf_from_documents_warehouse
@ -58,23 +60,24 @@ class PDFTableExtraction:
try: try:
self.extract_tables_from_page(page_text, page_num) self.extract_tables_from_page(page_text, page_num)
except Exception as e: except Exception as e:
traceback.print_exc()
logger.error(f"Error in extracting tables from page {page_num}: {str(e)}") logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
except Exception as e: except Exception as e:
logger.error(f"Error in extracting PDF tables: {str(e)}") logger.error(f"Error in extracting PDF tables: {str(e)}")
def extract_tables_from_page(self, page_text: str, page_num: int): def extract_tables_from_page(self, page_text: str, page_num: int):
pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text) table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
if os.path.exists(prompts_response_file):
logger.info(f"Prompts response file already exists: {prompts_response_file}")
return
response, with_error = chat(table_extraction_prompts) response, with_error = chat(table_extraction_prompts)
if with_error: if with_error:
logger.error(f"Error in extracting tables from page") logger.error(f"Error in extracting tables from page")
return return
prompts_response = f'{table_extraction_prompts}\n\n{response}'
pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
with open(prompts_response_file, 'w', encoding='utf-8') as file:
file.write(prompts_response)
json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response) json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response)
if json_response is None: if json_response is None:
@ -82,10 +85,17 @@ class PDFTableExtraction:
return return
table_json_text = json_response.group(1) table_json_text = json_response.group(1)
table_data = json.loads(table_json_text) table_data = {"tables": []}
prompts_response try:
table_data = json.loads(table_json_text)
except:
table_data = json_repair.loads(table_json_text)
self.save_table_data(table_data, page_num) self.save_table_data(table_data, page_num)
prompts_response = f'{table_extraction_prompts}\n\n{response}'
with open(prompts_response_file, 'w', encoding='utf-8') as file:
file.write(prompts_response)
def save_table_data(self, table_data: dict, page_num: int): def save_table_data(self, table_data: dict, page_num: int):
pdf_pure_name = self.pdf_file_name.replace('.pdf', '') pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json') json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
@ -95,5 +105,6 @@ class PDFTableExtraction:
table_list = table_data.get('tables', []) table_list = table_data.get('tables', [])
for table_num, table in enumerate(table_list): for table_num, table in enumerate(table_list):
table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md') table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
table = re.sub(r'(\n)+', '\n', table)
with open(table_md_file, 'w', encoding='utf-8') as file: with open(table_md_file, 'w', encoding='utf-8') as file:
file.write(table) file.write(table)

View File

@ -6,6 +6,8 @@ from glob import glob
import fitz import fitz
import re import re
import time import time
import traceback
import json_repair
from utils.logger import logger from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse from utils.pdf_download import download_pdf_from_documents_warehouse
@ -75,6 +77,26 @@ def extract_pdf_table(pdf_folder: str, output_folder: str):
pdf_table_extraction.extract_tables() pdf_table_extraction.extract_tables()
def analyze_json_error(
    text_file: str = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt",
    output_folder: str = "/temp/",
) -> None:
    """Re-parse a saved prompt/response file and dump its tables as markdown.

    Debug helper: reads ``text_file``, takes the text between the
    ```` ```json ```` fence and the last closing fence, parses it as JSON
    (falling back to ``json_repair`` when strict parsing fails) and writes
    every entry of the ``"tables"`` list to ``temp_<index>.md`` inside
    ``output_folder``.

    Args:
        text_file: Path of the saved prompt/response text to inspect.
        output_folder: Directory that receives the ``temp_<n>.md`` files;
            created if it does not exist.

    Returns:
        None. Nothing is written when the text has no fenced JSON block.
    """
    with open(text_file, 'r', encoding='utf-8') as file:
        text = file.read()

    json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', text)
    if json_response is None:
        return

    json_text = json_response.group(1)
    try:
        json_data = json.loads(json_text)
    except json.JSONDecodeError:
        # Only malformed JSON should trigger the repair path; a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        json_data = json_repair.loads(json_text)

    # The parsed value is not guaranteed to be an object (it may be a list,
    # a string, ...); fall back to "no tables" instead of failing on .get().
    if not isinstance(json_data, dict):
        json_data = {"tables": []}

    table_list = json_data.get('tables', [])
    os.makedirs(output_folder, exist_ok=True)
    for table_num, table in enumerate(table_list):
        table_md_file = os.path.join(output_folder, f'temp_{table_num}.md')
        # Collapse runs of blank lines so the markdown table stays contiguous.
        table = re.sub(r'(\n)+', '\n', table)
        with open(table_md_file, 'w', encoding='utf-8') as md_file:
            md_file.write(table)
if __name__ == '__main__': if __name__ == '__main__':
doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx" doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
pdf_folder = r"/data/emea_ar/pdf/" pdf_folder = r"/data/emea_ar/pdf/"
@ -83,3 +105,4 @@ if __name__ == '__main__':
# download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder) # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
# output_pdf_page_text(pdf_folder, output_folder) # output_pdf_page_text(pdf_folder, output_folder)
extract_pdf_table(pdf_folder, output_folder) extract_pdf_table(pdf_folder, output_folder)
# analyze_json_error()