If extracting text via the PDF-to-HTML API fails, fall back to extracting text with PyMuPDF.
This commit is contained in:
parent
ace0ac2674
commit
acc30d4b72
|
|
@ -40,6 +40,9 @@ class FilterPages:
|
|||
page_text_dict = {}
|
||||
if self.apply_pdf2html:
|
||||
page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
|
||||
if len(page_text_dict.keys()) == 0:
|
||||
pdf_util = PDFUtil(self.pdf_file)
|
||||
success, text, page_text_dict = pdf_util.extract_text()
|
||||
else:
|
||||
pdf_util = PDFUtil(self.pdf_file)
|
||||
success, text, page_text_dict = pdf_util.extract_text()
|
||||
|
|
|
|||
2
main.py
2
main.py
|
|
@ -1165,7 +1165,7 @@ if __name__ == "__main__":
|
|||
"554851189",
|
||||
"555377021",
|
||||
"555654388"]
|
||||
# special_doc_id_list: list = ["539790009", "542301117"]
|
||||
# special_doc_id_list: list = ["542301117"]
|
||||
pdf_folder:str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
|
||||
|
|
|
|||
Loading…
Reference in New Issue