support outputting tables in markdown format from pdf documents
commit fa46b45ad5 (parent 424c30853c)
instructions/table_extraction_prompts.txt
@@ -0,0 +1,21 @@
Context:
{page_text}

Instructions:
Please read the context carefully.
Answer the questions below:
1. Find the table or tables in the context.
2. Output the table contents in markdown format, for example:
|name|age|hobby|
|---|---|---|
|Annie|18|music|
The contents must match the context exactly.
3. Output the results as JSON; the "tables" member holds tables in legal markdown format, for example:
{
    "tables": ["
|name|age|hobby|
|---|---|---|
|Annie|18|music|
"]
}
4. Only output the JSON with the tables, wrapped in a ```json code fence.

Answer:
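For orientation, a minimal sketch of how a template like this is consumed downstream: fill the placeholder with one page's text, call the model, and parse the fenced JSON back out. The `call_model` helper and sample inputs are hypothetical stand-ins for the project's `utils.gpt_utils.chat`:

import json
import re


def extract_tables_from_text(template: str, page_text: str, call_model) -> list:
    # Fill the {page_text} placeholder with one page's raw text.
    prompt = template.replace('{page_text}', page_text)
    response = call_model(prompt)  # hypothetical: returns the model reply as a string
    # The downstream parser expects the JSON wrapped in a ```json ... ``` fence.
    match = re.search(r'```json([\s\S]*?)```', response)
    if match is None:
        return []
    return json.loads(match.group(1)).get('tables', [])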
@@ -0,0 +1,5 @@
def main():
    print("Hello World!")


if __name__ == "__main__":
    main()
pdf_table_extraction.py
@@ -0,0 +1,99 @@
import os
import json
import re

from utils.logger import logger
from utils.pdf_util import PDFUtil
from utils.gpt_utils import chat

class PDFTableExtraction:
    """
    Iterates over PDF pages, extracts tables from each page,
    and saves the tables as markdown files.
    """

    def __init__(self,
                 pdf_file: str,
                 output_folder: str) -> None:
        self.pdf_file = pdf_file
        self.pdf_file_name = os.path.basename(pdf_file)
        self.table_extraction_prompts = self.get_table_extraction_prompts()

        self.output_folder = output_folder
        os.makedirs(output_folder, exist_ok=True)

        # Raw prompt + response transcripts, one file per page.
        self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/')
        os.makedirs(self.prompts_output_folder, exist_ok=True)

        # Parsed JSON results, one file per page.
        self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/')
        os.makedirs(self.json_output_folder, exist_ok=True)

        # Individual markdown tables, one file per table.
        self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/')
        os.makedirs(self.table_md_output_folder, exist_ok=True)

    def get_table_extraction_prompts(self):
        instructions_file = r'./instructions/table_extraction_prompts.txt'
        with open(instructions_file, 'r', encoding='utf-8') as file:
            return file.read()

    def extract_tables(self):
        try:
            if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file):
                logger.error(f"Invalid pdf_file: {self.pdf_file}")
                return
            logger.info(f"Start processing {self.pdf_file}")
            pdf_util = PDFUtil(self.pdf_file)
            success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder)
            if not success:
                logger.error(f"Failed to extract text from {self.pdf_file}")
                return
            logger.info(f"Successfully extracted text from {self.pdf_file}")

            for page_num, page_text in page_text_dict.items():
                try:
                    self.extract_tables_from_page(page_text, page_num)
                except Exception as e:
                    logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
        except Exception as e:
            logger.error(f"Error in extracting PDF tables: {str(e)}")

    def extract_tables_from_page(self, page_text: str, page_num: int):
        table_extraction_prompts = self.table_extraction_prompts.replace('{page_text}', page_text)
        response, with_error = chat(table_extraction_prompts)
        if with_error:
            logger.error(f"Error in extracting tables from page {page_num}")
            return
        prompts_response = f'{table_extraction_prompts}\n\n{response}'

        pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
        prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
        with open(prompts_response_file, 'w', encoding='utf-8') as file:
            file.write(prompts_response)

        # Pull the JSON payload out of the ```json ... ``` fence in the reply.
        json_response = re.search(r'```json([\s\S]*?)```', response)
        if json_response is None:
            logger.info(f"Can't extract tables from page {page_num}")
            return

        table_json_text = json_response.group(1)
        table_data = json.loads(table_json_text)
        self.save_table_data(table_data, page_num)

    def save_table_data(self, table_data: dict, page_num: int):
        pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
        json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
        with open(json_output_file, 'w', encoding='utf-8') as file:
            file.write(json.dumps(table_data, indent=4))

        table_list = table_data.get('tables', [])
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join(self.table_md_output_folder, f'{pdf_pure_name}_{page_num}_{table_num}.md')
            with open(table_md_file, 'w', encoding='utf-8') as file:
                file.write(table)
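A minimal usage sketch for the class above (the paths are hypothetical):

from pdf_table_extraction import PDFTableExtraction

extractor = PDFTableExtraction(pdf_file='/data/sample/report.pdf',
                               output_folder='/data/sample/output/')
extractor.extract_tables()
# Markdown tables land in pdf_table_markdown/, parsed JSON in pdf_table_json/,
# and prompt/response transcripts in pdf_table_prompts/ under the output folder.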
@@ -0,0 +1,85 @@
import os
import time

import pandas as pd
import tqdm
from glob import glob

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from pdf_table_extraction import PDFTableExtraction

def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
    doc_provider_data = pd.read_excel(doc_provider_file_path)
    # group by docid and count the rows per docid, producing a frame
    # with two columns: DocumentId and provider_count
    doc_provider_count = doc_provider_data.groupby('DocumentId').size().reset_index(name='provider_count')
    # sort by provider_count in descending order
    doc_provider_count = doc_provider_count.sort_values(by='provider_count', ascending=False)

    # rewrite the workbook with both the original details and the counts
    with pd.ExcelWriter(doc_provider_file_path) as writer:
        doc_provider_data.to_excel(writer, sheet_name='doc_provider_details', index=False)
        doc_provider_count.to_excel(writer, sheet_name='doc_provider_count', index=False)
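As a quick illustration of what the groupby above produces (the DocumentId values are hypothetical):

import pandas as pd

df = pd.DataFrame({'DocumentId': ['A', 'A', 'B']})
counts = df.groupby('DocumentId').size().reset_index(name='provider_count')
print(counts)
#   DocumentId  provider_count
# 0          A               2
# 1          B               1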
def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str):
    document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name)
    # get all unique docids as a list
    doc_id_list = [str(doc_id) for doc_id
                   in document_data['DocumentId'].unique().tolist()]
    # download pdfs, pausing between requests
    logger.info(f"Start downloading {len(doc_id_list)} pdfs")
    os.makedirs(pdf_path, exist_ok=True)
    for doc_id in tqdm.tqdm(doc_id_list):
        logger.info(f"Downloading pdf for docid: {doc_id}")
        download_pdf_from_documents_warehouse(pdf_directory=pdf_path, doc_id=doc_id)
        time.sleep(1)
def output_pdf_page_text(pdf_folder: str, output_folder: str):
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return

    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_util = PDFUtil(pdf_file)
        success, text, page_text_dict = pdf_util.extract_text(output_folder=output_folder)
        if success:
            logger.info(f"Successfully extracted text from {pdf_file}")
def extract_pdf_table(pdf_folder: str, output_folder: str):
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)

    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_table_extraction = PDFTableExtraction(pdf_file, output_folder)
        pdf_table_extraction.extract_tables()
if __name__ == '__main__':
    doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
    pdf_folder = r"/data/emea_ar/pdf/"
    output_folder = r"/data/emea_ar/output/"
    # Earlier pipeline stages (uncomment to run):
    # get_unique_docids_from_doc_provider_data(doc_provider_file_path)
    # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
    # output_pdf_page_text(pdf_folder, output_folder)
    extract_pdf_table(pdf_folder, output_folder)
utils/gpt_utils.py
@@ -8,31 +8,8 @@ import dotenv
# loads .env file with your OPENAI_API_KEY
dotenv.load_dotenv()


def set_environment_variables(engine=os.getenv("Engine_0613_16k")):
    if engine.startswith('gpt4') or engine.startswith('gpt-4'):
        openai.api_base = os.getenv("OPENAI_API_BASE_DC")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT4")
    elif engine.startswith('modc-stg-gpt4'):
        openai.api_base = os.getenv("OPENAI_API_BASE_GPT4_MODC")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT4_MODC")
    elif engine.upper() == 'ENGINE_GPT4_TURBO':
        openai.api_base = os.getenv("OPENAI_API_BASE_GPT4_TURBO")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT4_TURBO")
    elif engine.startswith('modc-stg-gpt35turbo16k'):
        openai.api_base = os.getenv("OPENAI_API_BASE_GPT3_MODC")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT3_MODC")
    else:
        openai.api_base = os.getenv("OPENAI_API_BASE")
        openai.api_key = os.getenv("OPENAI_API_KEY")
    openai.Engine = engine
    openai.api_type = os.getenv("OPENAI_API_TYPE")
    openai.api_version = os.getenv("OPENAI_API_VERSION")


# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer = tiktoken.get_encoding("cl100k_base")


def get_embedding(text, engine=os.getenv("EMBEDDING_ENGINE")):
    count = 0
    error = ''
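The cl100k_base encoding above is typically used to budget prompt sizes for the chat models; a small standalone sketch (assuming tiktoken is installed):

import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")
prompt = "Context:\n|name|age|hobby|\n|Annie|18|music|"
# Number of tokens this prompt will consume.
print(len(tokenizer.encode(prompt)))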
utils/pdf_util.py
@@ -63,7 +63,7 @@ class PDFUtil:
         pdf_doc.close()
         return True, output

-    def extract_text(self, output_file: str = None) -> Tuple[bool, str, dict]:
+    def extract_text(self, output_folder: str = None) -> Tuple[bool, str, dict]:
         """
         Extracts text from PDF
         """
@@ -92,9 +92,18 @@ class PDFUtil:
# "######################################################################"
|
||||
# )
|
||||
# Save to file
|
||||
if output_file:
|
||||
with open(output_file, "w", encoding="utf-8") as file:
|
||||
if output_folder:
|
||||
txt_output_folder = os.path.join(output_folder, 'pdf_text/')
|
||||
os.makedirs(txt_output_folder, exist_ok=True)
|
||||
txt_file = os.path.join(txt_output_folder, self.simple_pdf_file.replace(".pdf", ".txt"))
|
||||
with open(txt_file, "w", encoding="utf-8") as file:
|
||||
file.write(text.strip())
|
||||
|
||||
json_output_folder = os.path.join(output_folder, 'pdf_json/')
|
||||
os.makedirs(json_output_folder, exist_ok=True)
|
||||
json_file = os.path.join(json_output_folder, self.simple_pdf_file.replace(".pdf", ".json"))
|
||||
with open(json_file, "w", encoding="utf-8") as file:
|
||||
json.dump(page_text_dict, file, indent=4)
|
||||
pdf_doc.close()
|
||||
return True, text, page_text_dict
|
||||
except Exception as e:
|
||||
|
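With the new signature, callers hand extract_text an output folder and it fans the artifacts out per PDF; a usage sketch (paths hypothetical):

from utils.pdf_util import PDFUtil

pdf_util = PDFUtil('/data/sample/report.pdf')
success, text, page_text_dict = pdf_util.extract_text(output_folder='/data/sample/output/')
# On success this writes pdf_text/report.txt (full text) and
# pdf_json/report.json (page number -> page text) under the output folder.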