"""Pipeline utilities for EMEA AR documents: count providers per DocumentId,
download the PDFs, dump per-page text, extract tables, and debug malformed
JSON returned by the table-extraction prompts."""

import json
import os
import re
import time
import traceback
from glob import glob

import fitz  # PyMuPDF
import json_repair
import pandas as pd
import tqdm

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from pdf_table_extraction import PDFTableExtraction


def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
    """Count providers per DocumentId and write both sheets back to the workbook."""
    doc_provider_data = pd.read_excel(doc_provider_file_path)
    # Group by DocumentId and count the rows per document, yielding two
    # columns: DocumentId and provider_count.
    doc_provider_count = doc_provider_data.groupby('DocumentId').size().reset_index(name='provider_count')
    # Sort by provider_count in descending order.
    doc_provider_count = doc_provider_count.sort_values(by='provider_count', ascending=False)

    # Rewrite the workbook with the original details plus the new count sheet.
    with pd.ExcelWriter(doc_provider_file_path) as writer:
        doc_provider_data.to_excel(writer, sheet_name='doc_provider_details', index=False)
        doc_provider_count.to_excel(writer, sheet_name='doc_provider_count', index=False)
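

# Illustrative only: a tiny, self-contained sketch of the groupby/size shape
# used above. `_demo_provider_count` and its toy frame are not part of the
# pipeline; they exist purely to document the expected output.
def _demo_provider_count() -> pd.DataFrame:
    toy = pd.DataFrame({'DocumentId': [101, 101, 102]})
    counts = toy.groupby('DocumentId').size().reset_index(name='provider_count')
    # counts now holds:
    #    DocumentId  provider_count
    # 0         101               2
    # 1         102               1
    return counts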


def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str):
    """Download every unique DocumentId listed in the given sheet."""
    document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name)
    # Collect all unique docids as strings.
    doc_id_list = [str(doc_id) for doc_id
                   in document_data['DocumentId'].unique().tolist()]
    logger.info(f"Start downloading {len(doc_id_list)} pdfs")
    os.makedirs(pdf_path, exist_ok=True)
    for doc_id in tqdm.tqdm(doc_id_list):
        logger.info(f"Downloading pdf for docid: {doc_id}")
        try:
            download_pdf_from_documents_warehouse(pdf_directory=pdf_path, doc_id=doc_id)
        except Exception:
            # Don't let one failed download abort the whole batch; log the
            # full stack trace and move on.
            logger.error(f"Failed to download pdf for docid: {doc_id}\n{traceback.format_exc()}")
            continue
        # Throttle requests to the documents warehouse.
        time.sleep(1)


def output_pdf_page_text(pdf_folder: str, output_folder: str):
    """Extract text from every PDF in pdf_folder and write it to output_folder."""
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return

    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_util = PDFUtil(pdf_file)
        success, text, page_text_dict = pdf_util.extract_text(output_folder=output_folder)
        if success:
            logger.info(f"Successfully extracted text from {pdf_file}")
        else:
            logger.error(f"Failed to extract text from {pdf_file}")


def extract_pdf_table(pdf_folder: str, output_folder: str):
    """Extract tables from every PDF in pdf_folder into output_folder."""
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return

    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, '*.pdf'))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_table_extraction = PDFTableExtraction(pdf_file, output_folder)
        pdf_table_extraction.extract_tables()


def analyze_json_error():
    """Debug helper: re-parse one malformed JSON response and dump its tables."""
    text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt"
    with open(text_file, 'r', encoding='utf-8') as file:
        text = file.read()
    # Pull the fenced ```json ... ``` block out of the response text.
    json_response = re.search(r'```json([\s\S]*)```', text)
    if json_response:
        json_text = json_response.group(1)
        json_data = {"tables": []}
        try:
            json_data = json.loads(json_text)
        except json.JSONDecodeError:
            # Fall back to json_repair for responses that are almost-valid JSON.
            json_data = json_repair.loads(json_text)
        table_list = json_data.get('tables', [])
        os.makedirs("/temp/", exist_ok=True)
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join("/temp/", f'temp_{table_num}.md')
            # Collapse runs of newlines so the markdown table renders cleanly.
            table = re.sub(r'(\n)+', '\n', table)
            with open(table_md_file, 'w', encoding='utf-8') as file:
                file.write(table)
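

# Illustrative only: the loads -> json_repair fallback used in
# analyze_json_error, shown on a deliberately malformed payload. This assumes
# json_repair.loads is a drop-in replacement for json.loads, per its docs.
def _demo_json_fallback() -> list:
    malformed = '{"tables": ["| a | b |",]}'  # trailing comma: json.loads rejects it
    try:
        data = json.loads(malformed)
    except json.JSONDecodeError:
        data = json_repair.loads(malformed)  # repairs the comma, then parses
    return data.get('tables', [])  # -> ['| a | b |']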


if __name__ == '__main__':
    doc_provider_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
    pdf_folder = r"/data/emea_ar/pdf/"
    output_folder = r"/data/emea_ar/output/"
    # Earlier pipeline stages, kept for reference; uncomment to rerun.
    # get_unique_docids_from_doc_provider_data(doc_provider_file_path)
    # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
    # output_pdf_page_text(pdf_folder, output_folder)
    extract_pdf_table(pdf_folder, output_folder)
    # analyze_json_error()