From fa46b45ad511ec36dbc5fd03e2d88eb9fd11040c Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 19 Aug 2024 15:49:45 -0500 Subject: [PATCH] support output tables as markdown format from pdf documents --- instructions/table_extraction_prompts.txt | 21 +++++ main.py | 5 ++ pdf_table_extraction.py | 99 +++++++++++++++++++++++ prepare_data.py | 85 +++++++++++++++++++ utils/gpt_utils.py | 23 ------ utils/pdf_util.py | 15 +++- 6 files changed, 222 insertions(+), 26 deletions(-) create mode 100644 instructions/table_extraction_prompts.txt create mode 100644 main.py create mode 100644 pdf_table_extraction.py create mode 100644 prepare_data.py diff --git a/instructions/table_extraction_prompts.txt b/instructions/table_extraction_prompts.txt new file mode 100644 index 0000000..2ea34f2 --- /dev/null +++ b/instructions/table_extraction_prompts.txt @@ -0,0 +1,21 @@ +Context: +{page_text} + +Instructions: +Please read the contex carefully. +Answer below questions: +1. Please find the table or tables in the context. +2. Output the table contents as markdown format, it's like: +|name|age|hobby| +|Annie|18|music| +The contents should be exactly precise as the context. +3. Please output the results as JSON format, the result member is with legal markdown table format, the example is: +{ +"tables": [" +|name|age|hobby| +|Annie|18|music| +"] +} +4. 
class PDFTableExtraction:
    """
    Extract tables from each page of a PDF and save them as markdown files.

    Per-page pipeline:
      1. Fill the table-extraction prompt template with the page text.
      2. Ask the chat model to return the tables as JSON-wrapped markdown.
      3. Persist the prompt/response pair, the parsed JSON, and one ``.md``
         file per extracted table under the output folder.
    """

    def __init__(self,
                 pdf_file: str,
                 output_folder: str) -> None:
        # Source PDF path and its bare file name (used to build output names).
        self.pdf_file = pdf_file
        self.pdf_file_name = os.path.basename(pdf_file)
        self.table_extraction_prompts = self.get_table_extraction_prompts()

        self.output_folder = output_folder
        os.makedirs(output_folder, exist_ok=True)

        # Raw prompt + model response text, kept for debugging/auditing.
        self.prompts_output_folder = os.path.join(output_folder, 'pdf_table_prompts/')
        os.makedirs(self.prompts_output_folder, exist_ok=True)

        # Parsed JSON answers, one file per page.
        self.json_output_folder = os.path.join(output_folder, 'pdf_table_json/')
        os.makedirs(self.json_output_folder, exist_ok=True)

        # Final markdown tables, one file per table per page.
        self.table_md_output_folder = os.path.join(output_folder, 'pdf_table_markdown/')
        os.makedirs(self.table_md_output_folder, exist_ok=True)

    def get_table_extraction_prompts(self) -> str:
        """Load the prompt template; it contains a literal ``{page_text}`` placeholder."""
        instructions_file = r'./instructions/table_extraction_prompts.txt'
        with open(instructions_file, 'r', encoding='utf-8') as file:
            return file.read()

    def extract_tables(self):
        """Extract text from the PDF, then extract tables page by page.

        Errors are logged, never raised: a bad page must not abort the
        document, and a bad document must not abort a batch run.
        """
        try:
            if self.pdf_file is None or len(self.pdf_file) == 0 or not os.path.exists(self.pdf_file):
                logger.error(f"Invalid pdf_file: {self.pdf_file}")
                return

            logger.info(f"Start processing {self.pdf_file}")
            pdf_util = PDFUtil(self.pdf_file)
            success, text, page_text_dict = pdf_util.extract_text(output_folder=self.output_folder)
            if success:
                logger.info(f"Successfully extracted text from {self.pdf_file}")

                for page_num, page_text in page_text_dict.items():
                    try:
                        self.extract_tables_from_page(page_text, page_num)
                    except Exception as e:
                        logger.error(f"Error in extracting tables from page {page_num}: {str(e)}")
        except Exception as e:
            logger.error(f"Error in extracting PDF tables: {str(e)}")

    def extract_tables_from_page(self, page_text: str, page_num: int):
        """Ask the model for the tables on one page and persist all artifacts."""
        # str.replace (not str.format) because the template itself contains
        # literal JSON braces that format() would choke on.
        table_extraction_prompts = self.table_extraction_prompts.replace(r'{page_text}', page_text)
        response, with_error = chat(table_extraction_prompts)
        if with_error:
            # BUGFIX: include the page number so failures are traceable.
            logger.error(f"Error in extracting tables from page {page_num}")
            return
        prompts_response = f'{table_extraction_prompts}\n\n{response}'

        pure_pdf_name = self.pdf_file_name.replace('.pdf', '')
        prompts_response_file = os.path.join(self.prompts_output_folder, f'{pure_pdf_name}_{page_num}.txt')
        with open(prompts_response_file, 'w', encoding='utf-8') as file:
            file.write(prompts_response)

        # The model is expected to wrap its JSON answer in a ```json ... ``` fence.
        json_response = re.search(r'\`\`\`json([\s\S]*)\`\`\`', response)
        if json_response is None:
            # BUGFIX: include the page number so failures are traceable.
            logger.info(f"Can't extract tables from page {page_num}")
            return

        table_json_text = json_response.group(1)
        table_data = json.loads(table_json_text)
        # BUGFIX: removed a stray no-op `prompts_response` expression
        # statement that sat here (dead code left over from an edit).
        self.save_table_data(table_data, page_num)

    def save_table_data(self, table_data: dict, page_num: int):
        """Write the page's JSON payload and one markdown file per table.

        ``table_data`` is the parsed model answer; tables are read from its
        optional ``"tables"`` list.
        """
        pdf_pure_name = self.pdf_file_name.replace('.pdf', '')
        json_output_file = os.path.join(self.json_output_folder, f'{pdf_pure_name}_{page_num}.json')
        with open(json_output_file, 'w', encoding='utf-8') as file:
            file.write(json.dumps(table_data, indent=4))

        table_list = table_data.get('tables', [])
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join(self.table_md_output_folder,
                                         f'{pdf_pure_name}_{page_num}_{table_num}.md')
            with open(table_md_file, 'w', encoding='utf-8') as file:
                file.write(table)
+ time.sleep(1) + + +def output_pdf_page_text(pdf_folder: str, output_folder: str): + if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder): + logger.error(f"Invalid pdf_folder: {pdf_folder}") + return + if output_folder is None or len(output_folder) == 0: + logger.error(f"Invalid output_folder: {output_folder}") + return + + os.makedirs(output_folder, exist_ok=True) + pdf_files = glob(os.path.join(pdf_folder, '*.pdf')) + logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}") + for pdf_file in pdf_files: + logger.info(f"Start processing {pdf_file}") + pdf_util = PDFUtil(pdf_file) + success, text, page_text_dict = pdf_util.extract_text(output_folder=output_folder) + if success: + logger.info(f"Successfully extracted text from {pdf_file}") + + +def extract_pdf_table(pdf_folder: str, output_folder: str): + if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder): + logger.error(f"Invalid pdf_folder: {pdf_folder}") + return + if output_folder is None or len(output_folder) == 0: + logger.error(f"Invalid output_folder: {output_folder}") + return + os.makedirs(output_folder, exist_ok=True) + + pdf_files = glob(os.path.join(pdf_folder, '*.pdf')) + logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}") + for pdf_file in pdf_files: + logger.info(f"Start processing {pdf_file}") + pdf_table_extraction = PDFTableExtraction(pdf_file, output_folder) + pdf_table_extraction.extract_tables() + + +if __name__ == '__main__': + doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx" + pdf_folder = r"/data/emea_ar/pdf/" + output_folder = r"/data/emea_ar/output/" + # get_unique_docids_from_doc_provider_data(doc_provider_file_path) + # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder) + # output_pdf_page_text(pdf_folder, output_folder) + extract_pdf_table(pdf_folder, output_folder) \ No newline at end of file diff --git a/utils/gpt_utils.py 
b/utils/gpt_utils.py index 463e11b..e248e92 100644 --- a/utils/gpt_utils.py +++ b/utils/gpt_utils.py @@ -8,31 +8,8 @@ import dotenv # loads .env file with your OPENAI_API_KEY dotenv.load_dotenv() - -def set_environment_variables(engine=os.getenv("Engine_0613_16k")): - if engine.startswith('gpt4') or engine.startswith('gpt-4'): - openai.api_base = os.getenv("OPENAI_API_BASE_DC") - openai.api_key = os.getenv("OPENAI_API_KEY_GPT4") - elif engine.startswith('modc-stg-gpt4'): - openai.api_base = os.getenv("OPENAI_API_BASE_GPT4_MODC") - openai.api_key = os.getenv("OPENAI_API_KEY_GPT4_MODC") - elif engine.upper() == 'ENGINE_GPT4_TURBO': - openai.api_base = os.getenv("OPENAI_API_BASE_GPT4_TURBO") - openai.api_key = os.getenv("OPENAI_API_KEY_GPT4_TURBO") - elif engine.startswith('modc-stg-gpt35turbo16k'): - openai.api_base = os.getenv("OPENAI_API_BASE_GPT3_MODC") - openai.api_key = os.getenv("OPENAI_API_KEY_GPT3_MODC") - else: - openai.api_base = os.getenv("OPENAI_API_BASE") - openai.api_key = os.getenv("OPENAI_API_KEY") - openai.Engine = engine - openai.api_type = os.getenv("OPENAI_API_TYPE") - openai.api_version = os.getenv("OPENAI_API_VERSION") - # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") tokenizer = tiktoken.get_encoding("cl100k_base") - - def get_embedding(text, engine=os.getenv("EMBEDDING_ENGINE")): count = 0 error = '' diff --git a/utils/pdf_util.py b/utils/pdf_util.py index ab11998..78b8ecf 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -63,7 +63,7 @@ class PDFUtil: pdf_doc.close() return True, output - def extract_text(self, output_file: str = None) -> Tuple[bool, str, dict]: + def extract_text(self, output_folder: str = None) -> Tuple[bool, str, dict]: """ Extracts text from PDF """ @@ -92,9 +92,18 @@ class PDFUtil: # "######################################################################" # ) # Save to file - if output_file: - with open(output_file, "w", encoding="utf-8") as file: + if output_folder: + txt_output_folder = 
os.path.join(output_folder, 'pdf_text/') + os.makedirs(txt_output_folder, exist_ok=True) + txt_file = os.path.join(txt_output_folder, self.simple_pdf_file.replace(".pdf", ".txt")) + with open(txt_file, "w", encoding="utf-8") as file: file.write(text.strip()) + + json_output_folder = os.path.join(output_folder, 'pdf_json/') + os.makedirs(json_output_folder, exist_ok=True) + json_file = os.path.join(json_output_folder, self.simple_pdf_file.replace(".pdf", ".json")) + with open(json_file, "w", encoding="utf-8") as file: + json.dump(page_text_dict, file, indent=4) pdf_doc.close() return True, text, page_text_dict except Exception as e: