diff --git a/core/page_filter.py b/core/page_filter.py index 9104813..748ed12 100644 --- a/core/page_filter.py +++ b/core/page_filter.py @@ -40,6 +40,9 @@ class FilterPages: page_text_dict = {} if self.apply_pdf2html: page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path") + if len(page_text_dict.keys()) == 0: + pdf_util = PDFUtil(self.pdf_file) + success, text, page_text_dict = pdf_util.extract_text() else: pdf_util = PDFUtil(self.pdf_file) success, text, page_text_dict = pdf_util.extract_text() diff --git a/main.py b/main.py index b3132d1..d8e1acf 100644 --- a/main.py +++ b/main.py @@ -1165,7 +1165,7 @@ if __name__ == "__main__": "554851189", "555377021", "555654388"] - # special_doc_id_list: list = ["539790009", "542301117"] + # special_doc_id_list: list = ["542301117"] pdf_folder:str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"