If text extraction via the PDF-to-HTML API fails (returns no pages), fall back to extracting the text with PyMuPDF.

This commit is contained in:
Blade He 2025-01-15 18:36:02 -06:00
parent ace0ac2674
commit acc30d4b72
2 changed files with 4 additions and 1 deletions

View File

@ -40,6 +40,9 @@ class FilterPages:
page_text_dict = {}
if self.apply_pdf2html:
page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
if len(page_text_dict.keys()) == 0:
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text()
else:
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text()

View File

@ -1165,7 +1165,7 @@ if __name__ == "__main__":
"554851189",
"555377021",
"555654388"]
# special_doc_id_list: list = ["539790009", "542301117"]
# special_doc_id_list: list = ["542301117"]
pdf_folder:str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"