Refactor utilities to modules

Split the utilities into importable modules rather than having them all tangled
into __main__. This makes the package usable as Python modules rather than just
as a command-line utility.
main
Eric Ihli 5 years ago
parent bea192678e
commit 98ef6ffd85

@ -66,6 +66,27 @@ probably aren't necessary.
This code calls out to [[https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html][pdfimages]] from [[https://poppler.freedesktop.org/][Poppler]].
#+NAME: pdf-to-images-overview
#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
#+BEGIN_SRC python :noweb strip-export :results none
import os
import re
import subprocess
from table_ocr.util import get_logger, working_dir
logger = get_logger(__name__)
# Wrapper around the Poppler command line utility "pdfimages" and helpers for
# finding the output files of that command.
<<pdf-to-images>>
# Helpers to detect orientation of the images that Poppler extracted and if the
# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
# rotation. (Makes OCR more straightforward.)
<<fix-orientation>>
#+END_SRC
#+NAME: pdf-to-images
#+BEGIN_SRC python :results none
def pdf_to_images(pdf_filepath):
@ -94,7 +115,9 @@ def pdfimages(pdf_filepath):
filename_sans_ext = filename.split(".pdf")[0]
subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
logger.debug(
"Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
)
return image_filenames
@ -665,9 +688,6 @@ import tempfile
<<get-logger>>
logger = get_logger()
@contextmanager
def working_dir(directory):
original_working_dir = os.getcwd()
@ -682,7 +702,7 @@ def make_tempdir(identifier):
return tempfile.mkdtemp(prefix="{}_".format(identifier))
#+END_SRC
*** table_ocr/prepare_pdfs.py
*** table_ocr/pdf_to_images/__main__.py
Takes a variable number of pdf files and creates images out of each page of the
file using ~pdfimages~ from Poppler. Images are created in the same directory
@ -696,24 +716,20 @@ python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
#+END_SRC
#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes
#+NAME: pdf_to_images/__main__.py
#+HEADER: :tangle table_ocr/pdf_to_images/__main__.py
#+BEGIN_SRC python
import argparse
import logging
import os
import re
import subprocess
import sys
from table_ocr.util import working_dir, make_tempdir
from table_ocr.util import working_dir, make_tempdir, get_logger
from table_ocr.pdf_to_images import pdf_to_images, preprocess_img
<<get-logger>>
logger = get_logger()
logger = get_logger(__name__)
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
pdf_images = []
for f in files:
@ -727,9 +743,6 @@ def main(files):
print("{}\n{}\n".format(pdf, "\n".join(images)))
<<pdf-to-images>>
<<fix-orientation>>
if __name__ == "__main__":
args = parser.parse_args()
main(args.files)
@ -961,8 +974,8 @@ with ~advice-add~.
** Logging
#+BEGIN_SRC python :eval query :noweb-ref get-logger
def get_logger():
logger = logging.getLogger(__name__)
def get_logger(name):
logger = logging.getLogger(name)
lvl = os.environ.get("PY_LOG_LVL", "info").upper()
handler = logging.StreamHandler()
formatter = logging.Formatter(logging.BASIC_FORMAT)

@ -0,0 +1,76 @@
import os
import re
import subprocess
from table_ocr.util import get_logger, working_dir
logger = get_logger(__name__)
# Wrapper around the Poppler command line utility "pdfimages" and helpers for
# finding the output files of that command.
def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images and return the paths of those images.

    Calls pdfimages() within the working_dir(directory) context, where
    directory is the directory part of `pdf_filepath`, then joins every file
    name it reports back onto that directory.
    """
    directory = os.path.dirname(pdf_filepath)
    with working_dir(directory):
        extracted = pdfimages(pdf_filepath)
    # pdfimages creates a number of files, each named for its page number,
    # and only hands back bare file names, so prefix each with the directory.
    # NOTE(review): the unmodified `pdf_filepath` is used inside the
    # working_dir context; presumably callers pass absolute paths -- confirm.
    return [os.path.join(directory, name) for name in extracted]
def pdfimages(pdf_filepath):
    """
    Extract a pdf's images as PNG files with the `pdfimages` utility from
    Poppler (https://poppler.freedesktop.org/).

    The output files are prefixed with the pdf's file name up to its first
    ".pdf" and suffixed with a number by `pdfimages`. They are located
    afterwards with find_matching_files_in_dir, whose regex looks for 3 digits
    after the prefix, so this should work for pdfs of up to 999 pages.

    Returns the matching file names found in the pdf's directory.
    """
    directory, filename = os.path.split(pdf_filepath)
    # Everything before the first ".pdf" (the whole name if there is none).
    output_prefix = filename.partition(".pdf")[0]
    command = ["pdfimages", "-png", pdf_filepath, output_prefix]
    subprocess.run(command)
    image_filenames = find_matching_files_in_dir(output_prefix, directory)
    logger.debug(
        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
    )
    return image_filenames
def find_matching_files_in_dir(file_prefix, directory):
    """
    Return the names of the PNG files in `directory` that belong to
    `file_prefix`.

    A file matches when its whole name is `file_prefix`, a dash, three digits,
    any further characters, and finally ".png". The prefix is escaped, so
    characters such as "." in it are matched literally.

    The names (not full paths) are returned as a sorted list, so the order
    does not depend on the arbitrary order of os.listdir.
    """
    # Build the pattern once instead of once per directory entry.
    pattern = re.compile(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)))
    # fullmatch rejects names that merely contain ".png", e.g. "x-000.png.bak".
    return sorted(
        filename
        for filename in os.listdir(directory)
        if pattern.fullmatch(filename)
    )
# Helpers to detect orientation of the images that Poppler extracted and if the
# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
# rotation. (Makes OCR more straightforward.)
def preprocess_img(filepath):
    """
    Preprocessing steps that shell out to external executables.

    Looks up the rotation for the image with get_rotate and passes it to
    mogrify, which rotates the image.
    """
    angle = get_rotate(filepath)
    message = "Rotating {} by {}.".format(filepath, angle)
    logger.debug(message)
    mogrify(filepath, angle)
def get_rotate(image_filepath):
    """
    Ask tesseract (run with `--psm 0`) for the rotation of an image.

    Returns the text that follows "Rotate: " in tesseract's output, as a
    string.
    """
    raw = subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
    lines = raw.decode("utf-8").split("\n")
    # Raises StopIteration when no output line contains "Rotate: ".
    rotate_line = next(filter(lambda line: "Rotate: " in line, lines))
    return rotate_line.split(": ")[1]
def mogrify(image_filepath, rotate):
    """
    Run `mogrify -rotate <rotate>` on the given image file.
    """
    command = ["mogrify", "-rotate", rotate, image_filepath]
    subprocess.run(command)

@ -0,0 +1,27 @@
import argparse
from table_ocr.util import working_dir, make_tempdir, get_logger
from table_ocr.pdf_to_images import pdf_to_images, preprocess_img
# Module-level logger named after this module.
logger = get_logger(__name__)
# Command-line interface: one or more positional file arguments.
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
    """
    Convert each pdf in `files` to images, preprocess every image, and print
    the results.

    All pdfs are converted first, then all images are preprocessed, and
    finally each pdf path is printed followed by its image paths, one per
    line.
    """
    pdf_images = [(pdf, pdf_to_images(pdf)) for pdf in files]
    for _, images in pdf_images:
        for image in images:
            preprocess_img(image)
    for pdf, images in pdf_images:
        listing = "\n".join(images)
        print("{}\n{}\n".format(pdf, listing))
if __name__ == "__main__":
    # Parse the file arguments from the command line and process them.
    main(parser.parse_args().files)

@ -8,8 +8,8 @@ import sys
from table_ocr.util import working_dir, make_tempdir
def get_logger():
logger = logging.getLogger(__name__)
def get_logger(name):
logger = logging.getLogger(name)
lvl = os.environ.get("PY_LOG_LVL", "info").upper()
handler = logging.StreamHandler()
formatter = logging.Formatter(logging.BASIC_FORMAT)
@ -63,7 +63,9 @@ def pdfimages(pdf_filepath):
filename_sans_ext = filename.split(".pdf")[0]
subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
logger.debug(
"Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
)
return image_filenames

@ -4,8 +4,8 @@ import logging
import os
import tempfile
def get_logger():
logger = logging.getLogger(__name__)
def get_logger(name):
logger = logging.getLogger(name)
lvl = os.environ.get("PY_LOG_LVL", "info").upper()
handler = logging.StreamHandler()
formatter = logging.Formatter(logging.BASIC_FORMAT)
@ -15,9 +15,6 @@ def get_logger():
logger.setLevel(lvl)
return logger
logger = get_logger()
@contextmanager
def working_dir(directory):
original_working_dir = os.getcwd()

Loading…
Cancel
Save