If the PDF-to-HTML API fails to return any page text, fall back to extracting the text with PyMuPDF.

2025-01-15 18:36:02 -06:00 · 2025-01-15 18:36:02 -06:00 · acc30d4b72
parent ace0ac2674
commit acc30d4b72
2 changed files with 4 additions and 1 deletion
--- a/core/page_filter.py
+++ b/core/page_filter.py
@ -40,6 +40,9 @@ class FilterPages:
        page_text_dict = {}
        if self.apply_pdf2html:
            page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
+            if len(page_text_dict.keys()) == 0:
+                pdf_util = PDFUtil(self.pdf_file)
+                success, text, page_text_dict = pdf_util.extract_text()
        else:
            pdf_util = PDFUtil(self.pdf_file)
            success, text, page_text_dict = pdf_util.extract_text()
--- a/main.py
+++ b/main.py
@ -1165,7 +1165,7 @@ if __name__ == "__main__":
                                "554851189",
                                "555377021",
                                "555654388"]
-    # special_doc_id_list: list = ["539790009", "542301117"]
+    # special_doc_id_list: list = ["542301117"]
    pdf_folder:str = r"/data/aus_prospectus/pdf/"
    output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
    output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"