dc-ml-emea-ar/prepare_data.py

85 lines
3.7 KiB
Python
Raw Normal View History

import pandas as pd
import os
import tqdm
import json
from glob import glob
import fitz
import re
import time
from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from pdf_table_extraction import PDFTableExtraction
def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
    """Summarize provider counts per document and write both sheets back.

    Reads the provider/document spreadsheet at *doc_provider_file_path*,
    counts how many provider rows exist for each DocumentId, and rewrites
    the same workbook with two sheets: the original details
    ('doc_provider_details') and the per-document counts
    ('doc_provider_count'), sorted with the highest count first.
    """
    details = pd.read_excel(doc_provider_file_path)
    # One row per DocumentId, carrying the number of provider rows it had,
    # ordered so the most-covered documents come first.
    counts = (
        details.groupby('DocumentId')
        .size()
        .reset_index(name='provider_count')
        .sort_values(by='provider_count', ascending=False)
    )
    # Rewrite the workbook in place with both the raw details and the summary.
    with pd.ExcelWriter(doc_provider_file_path) as writer:
        details.to_excel(writer, sheet_name='doc_provider_details', index=False)
        counts.to_excel(writer, sheet_name='doc_provider_count', index=False)
def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str):
    """Download every unique document listed in one sheet of the workbook.

    Reads DocumentId values from *sheet_name* of *doc_provider_file_path*,
    de-duplicates them, and fetches each PDF from the documents warehouse
    into *pdf_path*, pausing one second between requests.
    """
    sheet = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name)
    # Unique document ids as strings, preserving first-seen order.
    doc_id_list = [str(doc_id) for doc_id in sheet['DocumentId'].unique().tolist()]
    logger.info(f"Start downloading {len(doc_id_list)} pdfs")
    os.makedirs(pdf_path, exist_ok=True)
    for doc_id in tqdm.tqdm(doc_id_list):
        logger.info(f"Downloading pdf for docid: {doc_id}")
        download_pdf_from_documents_warehouse(pdf_directory=pdf_path, doc_id=doc_id)
        # Throttle: one request per second against the warehouse service.
        time.sleep(1)
def output_pdf_page_text(pdf_folder: str, output_folder: str):
    """Extract plain text from every PDF in *pdf_folder* into *output_folder*.

    Logs an error and returns early when either path argument is missing
    or when the input folder does not exist on disk.
    """
    # Guard clauses: input folder must be a non-empty, existing path.
    if not pdf_folder or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    # Output folder must at least be a non-empty string; it is created below.
    if not output_folder:
        logger.error(f"Invalid output_folder: {output_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        success, text, page_text_dict = PDFUtil(pdf_file).extract_text(output_folder=output_folder)
        if success:
            logger.info(f"Successfully extracted text from {pdf_file}")
def extract_pdf_table(pdf_folder: str, output_folder: str):
    """Run table extraction over every PDF in *pdf_folder*.

    Results are written by PDFTableExtraction into *output_folder*.
    Logs an error and returns early on a missing/empty input folder or an
    empty output path.
    """
    # Guard clauses: input folder must be a non-empty, existing path.
    if not pdf_folder or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    # Output folder must at least be a non-empty string; it is created below.
    if not output_folder:
        logger.error(f"Invalid output_folder: {output_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        PDFTableExtraction(pdf_file, output_folder).extract_tables()
if __name__ == '__main__':
    # Inputs/outputs for the EMEA annual-report data-preparation pipeline.
    doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
    pdf_folder = r"/data/emea_ar/pdf/"
    output_folder = r"/data/emea_ar/output/"
    # Earlier pipeline stages, kept commented for manual re-runs:
    # get_unique_docids_from_doc_provider_data(doc_provider_file_path)
    # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
    # output_pdf_page_text(pdf_folder, output_folder)
    extract_pdf_table(pdf_folder, output_folder)