import json
import os
import re
import time
from glob import glob

import fitz
import pandas as pd
import tqdm

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from pdf_table_extraction import PDFTableExtraction


def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str) -> None:
    """Count providers per document and write the summary back into the workbook.

    Reads the Excel file, groups rows by ``DocumentId``, and rewrites the same
    file with two sheets: the original details and a per-document provider
    count sorted descending.

    NOTE(review): this overwrites the input workbook in place — any sheets
    other than the two written here are lost.
    """
    doc_provider_data = pd.read_excel(doc_provider_file_path)
    # One row per DocumentId with the number of provider rows it had.
    doc_provider_count = (
        doc_provider_data.groupby('DocumentId')
        .size()
        .reset_index(name='provider_count')
        .sort_values(by='provider_count', ascending=False)
    )
    with pd.ExcelWriter(doc_provider_file_path) as writer:
        doc_provider_data.to_excel(writer, sheet_name='doc_provider_details', index=False)
        doc_provider_count.to_excel(writer, sheet_name='doc_provider_count', index=False)


def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str) -> None:
    """Download one PDF per unique ``DocumentId`` found in the given sheet.

    Args:
        doc_provider_file_path: Excel workbook containing the document ids.
        sheet_name: Sheet to read the ``DocumentId`` column from.
        pdf_path: Directory the PDFs are downloaded into (created if missing).
    """
    document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name)
    doc_id_list = [str(doc_id) for doc_id in document_data['DocumentId'].unique().tolist()]

    logger.info(f"Start downloading {len(doc_id_list)} pdfs")
    os.makedirs(pdf_path, exist_ok=True)
    for doc_id in tqdm.tqdm(doc_id_list):
        logger.info(f"Downloading pdf for docid: {doc_id}")
        download_pdf_from_documents_warehouse(pdf_directory=pdf_path, doc_id=doc_id)
        # Throttle requests to the documents warehouse.
        time.sleep(1)


def _validate_folders(pdf_folder: str, output_folder: str) -> bool:
    """Validate the input/output folder arguments shared by the extraction steps.

    Returns True when ``pdf_folder`` exists and ``output_folder`` is non-empty
    (creating it if needed); logs and returns False otherwise.
    """
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return False
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return False
    os.makedirs(output_folder, exist_ok=True)
    return True


def output_pdf_page_text(pdf_folder: str, output_folder: str) -> None:
    """Extract per-page text from every PDF in ``pdf_folder`` via ``PDFUtil``.

    Results are written into ``output_folder`` by ``PDFUtil.extract_text``;
    failures are logged instead of raised so one bad PDF does not stop the run.
    """
    if not _validate_folders(pdf_folder, output_folder):
        return
    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_util = PDFUtil(pdf_file)
        # extract_text returns (success, full_text, per-page dict); only the
        # success flag is needed here — the text is persisted by PDFUtil itself.
        success, _text, _page_text_dict = pdf_util.extract_text(output_folder=output_folder)
        if success:
            logger.info(f"Successfully extracted text from {pdf_file}")
        else:
            # Previously the failure branch was silent; surface it for triage.
            logger.error(f"Failed to extract text from {pdf_file}")


def extract_pdf_table(pdf_folder: str, output_folder: str) -> None:
    """Run table extraction on every PDF in ``pdf_folder``.

    Delegates the actual work to ``PDFTableExtraction``; outputs land in
    ``output_folder``.
    """
    if not _validate_folders(pdf_folder, output_folder):
        return
    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_table_extraction = PDFTableExtraction(pdf_file, output_folder)
        pdf_table_extraction.extract_tables()


if __name__ == '__main__':
    doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
    pdf_folder = r"/data/emea_ar/pdf/"
    output_folder = r"/data/emea_ar/output/"
    # Earlier pipeline stages, kept for manual re-runs:
    # get_unique_docids_from_doc_provider_data(doc_provider_file_path)
    # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
    # output_pdf_page_text(pdf_folder, output_folder)
    extract_pdf_table(pdf_folder, output_folder)