Refactor utilities to modules

Rather than having them all tangled into __main__, split them out into modules. This makes the package more usable as Python modules rather than just a command-line utility.
5 years ago · 98ef6ffd85
parent bea192678e
commit 98ef6ffd85
5 changed files with 144 additions and 29 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@@ -66,6 +66,27 @@ probably aren't necessary.

 This code calls out to [[https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html][pdfimages]] from [[https://poppler.freedesktop.org/][Poppler]].

+#+NAME: pdf-to-images-overview
+#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
+#+BEGIN_SRC python :noweb strip-export :results none
+import os
+import re
+import subprocess
+
+from table_ocr.util import get_logger, working_dir
+
+logger = get_logger(__name__)
+
+# Wrapper around the Poppler command line utility "pdfimages" and helpers for
+# finding the output files of that command.
+<<pdf-to-images>>
+
+# Helpers to detect orientation of the images that Poppler extracted and if the
+# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
+# rotation. (Makes OCR more straightforward.)
+<<fix-orientation>>
+#+END_SRC
+
 #+NAME: pdf-to-images
 #+BEGIN_SRC python :results none
 def pdf_to_images(pdf_filepath):
@@ -94,7 +115,9 @@ def pdfimages(pdf_filepath):
    filename_sans_ext = filename.split(".pdf")[0]
    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
-    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    logger.debug(
+        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
+    )
    return image_filenames


@@ -665,9 +688,6 @@ import tempfile

 <<get-logger>>

-logger = get_logger()
-
-
@contextmanager
 def working_dir(directory):
    original_working_dir = os.getcwd()
@@ -682,7 +702,7 @@ def make_tempdir(identifier):
    return tempfile.mkdtemp(prefix="{}_".format(identifier))
 #+END_SRC

-*** table_ocr/prepare_pdfs.py
+*** table_ocr/pdf_to_images/__main__.py

 Takes a variable number of pdf files and creates images out of each page of the
 file using ~pdfimages~ from Poppler. Images are created in the same directory
@@ -696,24 +716,20 @@ python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
 #+END_SRC


-#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes
+#+NAME: pdf_to_images/__main__.py
+#+HEADER: :tangle table_ocr/pdf_to_images/__main__.py
+#+BEGIN_SRC python
 import argparse
-import logging
-import os
-import re
-import subprocess
-import sys

-from table_ocr.util import working_dir, make_tempdir
+from table_ocr.util import working_dir, make_tempdir, get_logger
+from table_ocr.pdf_to_images import pdf_to_images, preprocess_img

-
-<<get-logger>>
-
-logger = get_logger()
+logger = get_logger(__name__)

 parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")

+
 def main(files):
    pdf_images = []
    for f in files:
@@ -727,9 +743,6 @@ def main(files):
        print("{}\n{}\n".format(pdf, "\n".join(images)))


-<<pdf-to-images>>
-<<fix-orientation>>
-
 if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)
@@ -961,8 +974,8 @@ with ~advice-add~.
 ** Logging

 #+BEGIN_SRC python :eval query :noweb-ref get-logger
-def get_logger():
-    logger = logging.getLogger(__name__)
+def get_logger(name):
+    logger = logging.getLogger(name)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(logging.BASIC_FORMAT)
--- a/table_ocr/pdf_to_images/__init__.py
+++ b/table_ocr/pdf_to_images/__init__.py
@@ -0,0 +1,76 @@
+import os
+import re
+import subprocess
+
+from table_ocr.util import get_logger, working_dir
+
+logger = get_logger(__name__)
+
+# Wrapper around the Poppler command line utility "pdfimages" and helpers for
+# finding the output files of that command.
+def pdf_to_images(pdf_filepath):
+    """
+    Turn a pdf into images
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    with working_dir(directory):
+        image_filenames = pdfimages(pdf_filepath)
+
+    # pdfimages creates a number of files, each named for its page number, and
+    # doesn't return the list that it created, so rebuild the full paths here.
+    return [os.path.join(directory, f) for f in image_filenames]
+
+
+def pdfimages(pdf_filepath):
+    """
+    Uses the `pdfimages` utility from Poppler
+    (https://poppler.freedesktop.org/). Creates images out of each page. Images
+    are prefixed by their name sans extension and suffixed by their page number.
+
+    This should work up to pdfs with 999 pages since find matching files in dir
+    uses 3 digits in its regex.
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    filename_sans_ext = filename.split(".pdf")[0]
+    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
+    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
+    logger.debug(
+        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
+    )
+    return image_filenames
+
+
+def find_matching_files_in_dir(file_prefix, directory):
+    files = [
+        filename
+        for filename in os.listdir(directory)
+        if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename)
+    ]
+    return files
+
+# Helpers to detect orientation of the images that Poppler extracted and if the
+# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
+# rotation. (Makes OCR more straightforward.)
+def preprocess_img(filepath):
+    """
+    Processing that involves running shell executables,
+    like mogrify to rotate.
+    """
+    rotate = get_rotate(filepath)
+    logger.debug("Rotating {} by {}.".format(filepath, rotate))
+    mogrify(filepath, rotate)
+
+
+def get_rotate(image_filepath):
+    output = (
+        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
+        .decode("utf-8")
+        .split("\n")
+    )
+    output = next(l for l in output if "Rotate: " in l)
+    output = output.split(": ")[1]
+    return output
+
+
+def mogrify(image_filepath, rotate):
+    subprocess.run(["mogrify", "-rotate", rotate, image_filepath])
--- a/table_ocr/pdf_to_images/__main__.py
+++ b/table_ocr/pdf_to_images/__main__.py
@@ -0,0 +1,27 @@
+import argparse
+
+from table_ocr.util import working_dir, make_tempdir, get_logger
+from table_ocr.pdf_to_images import pdf_to_images, preprocess_img
+
+logger = get_logger(__name__)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+
+def main(files):
+    pdf_images = []
+    for f in files:
+        pdf_images.append((f, pdf_to_images(f)))
+
+    for pdf, images in pdf_images:
+        for image in images:
+            preprocess_img(image)
+
+    for pdf, images in pdf_images:
+        print("{}\n{}\n".format(pdf, "\n".join(images)))
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
--- a/table_ocr/prepare_pdfs.py
+++ b/table_ocr/prepare_pdfs.py
@@ -8,8 +8,8 @@ import sys
 from table_ocr.util import working_dir, make_tempdir


-def get_logger():
-    logger = logging.getLogger(__name__)
+def get_logger(name):
+    logger = logging.getLogger(name)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(logging.BASIC_FORMAT)
@@ -63,7 +63,9 @@ def pdfimages(pdf_filepath):
    filename_sans_ext = filename.split(".pdf")[0]
    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
-    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    logger.debug(
+        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
+    )
    return image_filenames


--- a/table_ocr/util.py
+++ b/table_ocr/util.py
@@ -4,8 +4,8 @@ import logging
 import os
 import tempfile

-def get_logger():
-    logger = logging.getLogger(__name__)
+def get_logger(name):
+    logger = logging.getLogger(name)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(logging.BASIC_FORMAT)
@@ -15,9 +15,6 @@ def get_logger():
    logger.setLevel(lvl)
    return logger

-logger = get_logger()
-
-
@contextmanager
 def working_dir(directory):
    original_working_dir = os.getcwd()