From 1156eafc5cdb0f2aa3391c503c6d6642f61436f6 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Mon, 27 Apr 2020 10:04:55 -0700 Subject: [PATCH] Return sorted image paths from pdf_to_images --- pdf_table_extraction_and_ocr.org | 3 ++- table_ocr/pdf_to_images/__init__.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index 52d01e6..247d1c8 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -126,6 +126,7 @@ This code calls out to [[https://manpages.debian.org/testing/poppler-utils/pdfim def pdf_to_images(pdf_filepath): """ Turn a pdf into images + Returns the filenames of the created images sorted lexicographically. """ directory, filename = os.path.split(pdf_filepath) with working_dir(directory): @@ -133,7 +134,7 @@ def pdf_to_images(pdf_filepath): # Since pdfimages creates a number of files named each for there page number # and doesn't return us the list that it created - return [os.path.join(directory, f) for f in image_filenames] + return sorted([os.path.join(directory, f) for f in image_filenames]) def pdfimages(pdf_filepath): diff --git a/table_ocr/pdf_to_images/__init__.py b/table_ocr/pdf_to_images/__init__.py index 0906153..4a362af 100644 --- a/table_ocr/pdf_to_images/__init__.py +++ b/table_ocr/pdf_to_images/__init__.py @@ -11,6 +11,7 @@ logger = get_logger(__name__) def pdf_to_images(pdf_filepath): """ Turn a pdf into images + Returns the filenames of the created images sorted lexicographically. """ directory, filename = os.path.split(pdf_filepath) with working_dir(directory): @@ -18,7 +19,7 @@ def pdf_to_images(pdf_filepath): # Since pdfimages creates a number of files named each for there page number # and doesn't return us the list that it created - return [os.path.join(directory, f) for f in image_filenames] + return sorted([os.path.join(directory, f) for f in image_filenames]) def pdfimages(pdf_filepath):