auto-fix json data format

parent fa46b45ad5
commit f91e0cf1a8
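
Summary of the change: the JSON block returned by the model is now parsed with json.loads first and, when strict parsing fails, re-parsed with json_repair; the prompts-response file also acts as a cache so already-processed pages are skipped. A minimal standalone sketch of the fallback pattern is below (the helper name parse_table_json is illustrative only and not part of the repo; the commit applies the same logic inline in extract_tables_from_page and in the new analyze_json_error helper):

import json
import json_repair

def parse_table_json(json_text: str) -> dict:
    # Default shape the extraction code falls back to when nothing can be parsed.
    table_data = {"tables": []}
    try:
        # Strict parsing first: works when the model returned valid JSON.
        table_data = json.loads(json_text)
    except Exception:
        # json_repair tolerates common LLM formatting mistakes (trailing commas,
        # single quotes, unescaped characters) and returns best-effort data.
        table_data = json_repair.loads(json_text)
    return table_data
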
@@ -1,11 +1,13 @@
 import pandas as pd
 import os
 import tqdm
+import json_repair
 import json
 from glob import glob
 import fitz
 import re
 import time
+import traceback
 
 from utils.logger import logger
 from utils.pdf_download import download_pdf_from_documents_warehouse
@@ -58,23 +60,24 @@ class PDFTableExtraction:
             try:
                 self.extract_tables_from_page(page_text, page_num)
             except Exception as e:
+                traceback.print_exc()
                 logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
         except Exception as e:
             logger.error(f"Error in extracting PDF tables: {str(e)}")
 
 
     def extract_tables_from_page(self, page_text: str, page_num: int):
+        pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
         table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
+        prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
+        if os.path.exists(prompts_response_file):
+            logger.info(f"Prompts response file already exists: {prompts_response_file}")
+            return
+
         response, with_error = chat(table_extraction_prompts)
         if with_error:
             logger.error(f"Error in extracting tables from page")
             return
-        prompts_response = f'{table_extraction_prompts}\n\n{response}'
-
-        pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
-        prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
-        with open(prompts_response_file, 'w', encoding='utf-8') as file:
-            file.write(prompts_response)
 
         json_response = re.search(r'```json([\s\S]*)```', response)
         if json_response is None:
@@ -82,10 +85,17 @@ class PDFTableExtraction:
             return
 
         table_json_text = json_response.group(1)
-        table_data = json.loads(table_json_text)
-        prompts_response
+        table_data = {"tables": []}
+        try:
+            table_data = json.loads(table_json_text)
+        except:
+            table_data = json_repair.loads(table_json_text)
         self.save_table_data(table_data, page_num)
 
+        prompts_response = f'{table_extraction_prompts}\n\n{response}'
+        with open(prompts_response_file, 'w', encoding='utf-8') as file:
+            file.write(prompts_response)
+
     def save_table_data(self, table_data: dict, page_num: int):
         pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
         json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
@@ -95,5 +105,6 @@ class PDFTableExtraction:
         table_list = table_data.get('tables', [])
         for table_num, table in enumerate(table_list):
             table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
+            table = re.sub(r'(\n)+', '\n', table)
             with open(table_md_file, 'w', encoding='utf-8') as file:
                 file.write(table)
@@ -6,6 +6,8 @@ from glob import glob
 import fitz
 import re
 import time
+import traceback
+import json_repair
 
 from utils.logger import logger
 from utils.pdf_download import download_pdf_from_documents_warehouse
@@ -75,6 +77,26 @@ def extract_pdf_table(pdf_folder: str, output_folder: str):
         pdf_table_extraction.extract_tables()
 
 
+def analyze_json_error():
+    text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt"
+    with open(text_file, 'r', encoding='utf-8') as file:
+        text = file.read()
+    json_response = re.search(r'```json([\s\S]*)```', text)
+    if json_response:
+        json_text = json_response.group(1)
+        json_data = {"tables": []}
+        try:
+            json_data = json.loads(json_text)
+        except:
+            json_data = json_repair.loads(json_text)
+        table_list = json_data.get('tables', [])
+        for table_num, table in enumerate(table_list):
+            table_md_file = os.path.join("/temp/", f'temp_{table_num}.md')
+            table = re.sub(r'(\n)+', '\n', table)
+            with open(table_md_file, 'w', encoding='utf-8') as file:
+                file.write(table)
+
+
 if __name__ == '__main__':
     doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
     pdf_folder = r"/data/emea_ar/pdf/"
@@ -83,3 +105,4 @@ if __name__ == '__main__':
     # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
     # output_pdf_page_text(pdf_folder, output_folder)
     extract_pdf_table(pdf_folder, output_folder)
+    # analyze_json_error()