diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index a6198d5..ac398dc 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -66,6 +66,27 @@ probably aren't necessary. This code calls out to [[https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html][pdfimages]] from [[https://poppler.freedesktop.org/][Poppler]]. +#+NAME: pdf-to-images-overview +#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py +#+BEGIN_SRC python :noweb strip-export :results none +import os +import re +import subprocess + +from table_ocr.util import get_logger, working_dir + +logger = get_logger(__name__) + +# Wrapper around the Poppler command line utility "pdfimages" and helpers for +# finding the output files of that command. +<> + +# Helpers to detect orientation of the images that Poppler extracted and if the +# images are rotated or skewed, use ImageMagick's `mogrify` to correct the +# rotation. (Makes OCR more straightforward.) 
+<> +#+END_SRC + #+NAME: pdf-to-images #+BEGIN_SRC python :results none def pdf_to_images(pdf_filepath): @@ -94,7 +115,9 @@ def pdfimages(pdf_filepath): filename_sans_ext = filename.split(".pdf")[0] subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]]) image_filenames = find_matching_files_in_dir(filename_sans_ext, directory) - logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))) + logger.debug( + "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)) + ) return image_filenames @@ -665,9 +688,6 @@ import tempfile <> -logger = get_logger() - - @contextmanager def working_dir(directory): original_working_dir = os.getcwd() @@ -682,7 +702,7 @@ def make_tempdir(identifier): return tempfile.mkdtemp(prefix="{}_".format(identifier)) #+END_SRC -*** table_ocr/prepare_pdfs.py +*** table_ocr/pdf_to_images/__main__.py Takes a variable number of pdf files and creates images out of each page of the file using ~pdfimages~ from Poppler. Images are created in the same directory @@ -696,24 +716,20 @@ python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ... 
#+END_SRC -#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes +#+NAME: pdf_to_images/__main__.py +#+HEADER: :tangle table_ocr/pdf_to_images/__main__.py +#+BEGIN_SRC python import argparse -import logging -import os -import re -import subprocess -import sys -from table_ocr.util import working_dir, make_tempdir +from table_ocr.util import working_dir, make_tempdir, get_logger +from table_ocr.pdf_to_images import pdf_to_images, preprocess_img - -<> - -logger = get_logger() +logger = get_logger(__name__) parser = argparse.ArgumentParser() parser.add_argument("files", nargs="+") + def main(files): pdf_images = [] for f in files: @@ -727,9 +743,6 @@ def main(files): print("{}\n{}\n".format(pdf, "\n".join(images))) -<> -<> - if __name__ == "__main__": args = parser.parse_args() main(args.files) @@ -961,8 +974,8 @@ with ~advice-add~. ** Logging #+BEGIN_SRC python :eval query :noweb-ref get-logger -def get_logger(): - logger = logging.getLogger(__name__) +def get_logger(name): + logger = logging.getLogger(name) lvl = os.environ.get("PY_LOG_LVL", "info").upper() handler = logging.StreamHandler() formatter = logging.Formatter(logging.BASIC_FORMAT) diff --git a/table_ocr/pdf_to_images/__init__.py b/table_ocr/pdf_to_images/__init__.py new file mode 100644 index 0000000..4236ac9 --- /dev/null +++ b/table_ocr/pdf_to_images/__init__.py @@ -0,0 +1,76 @@ +import os +import re +import subprocess + +from table_ocr.util import get_logger, working_dir + +logger = get_logger(__name__) + +# Wrapper around the Poppler command line utility "pdfimages" and helpers for +# finding the output files of that command. 
+def pdf_to_images(pdf_filepath): + """ + Turn a pdf into images + """ + directory, filename = os.path.split(pdf_filepath) + with working_dir(directory): + image_filenames = pdfimages(pdf_filepath) + + # Since pdfimages creates a number of files named each for their page number + # and doesn't return us the list that it created + return [os.path.join(directory, f) for f in image_filenames] + + +def pdfimages(pdf_filepath): + """ + Uses the `pdfimages` utility from Poppler + (https://poppler.freedesktop.org/). Creates images out of each page. Images + are prefixed by their name sans extension and suffixed by their page number. + + This should work up to pdfs with 999 pages since find matching files in dir + uses 3 digits in its regex. + """ + directory, filename = os.path.split(pdf_filepath) + filename_sans_ext = filename.split(".pdf")[0] + subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]]) + image_filenames = find_matching_files_in_dir(filename_sans_ext, directory) + logger.debug( + "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)) + ) + return image_filenames + + +def find_matching_files_in_dir(file_prefix, directory): + files = [ + filename + for filename in os.listdir(directory) + if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename) + ] + return files + +# Helpers to detect orientation of the images that Poppler extracted and if the +# images are rotated or skewed, use ImageMagick's `mogrify` to correct the +# rotation. (Makes OCR more straightforward.) +def preprocess_img(filepath): + """ + Processing that involves running shell executables, + like mogrify to rotate. 
+ """ + rotate = get_rotate(filepath) + logger.debug("Rotating {} by {}.".format(filepath, rotate)) + mogrify(filepath, rotate) + + +def get_rotate(image_filepath): + output = ( + subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"]) + .decode("utf-8") + .split("\n") + ) + output = next(l for l in output if "Rotate: " in l) + output = output.split(": ")[1] + return output + + +def mogrify(image_filepath, rotate): + subprocess.run(["mogrify", "-rotate", rotate, image_filepath]) diff --git a/table_ocr/pdf_to_images/__main__.py b/table_ocr/pdf_to_images/__main__.py new file mode 100644 index 0000000..6243555 --- /dev/null +++ b/table_ocr/pdf_to_images/__main__.py @@ -0,0 +1,27 @@ +import argparse + +from table_ocr.util import working_dir, make_tempdir, get_logger +from table_ocr.pdf_to_images import pdf_to_images, preprocess_img + +logger = get_logger(__name__) + +parser = argparse.ArgumentParser() +parser.add_argument("files", nargs="+") + + +def main(files): + pdf_images = [] + for f in files: + pdf_images.append((f, pdf_to_images(f))) + + for pdf, images in pdf_images: + for image in images: + preprocess_img(image) + + for pdf, images in pdf_images: + print("{}\n{}\n".format(pdf, "\n".join(images))) + + +if __name__ == "__main__": + args = parser.parse_args() + main(args.files) diff --git a/table_ocr/prepare_pdfs.py b/table_ocr/prepare_pdfs.py index 9ea7cdd..f20b7a9 100644 --- a/table_ocr/prepare_pdfs.py +++ b/table_ocr/prepare_pdfs.py @@ -8,8 +8,8 @@ import sys from table_ocr.util import working_dir, make_tempdir -def get_logger(): - logger = logging.getLogger(__name__) +def get_logger(name): + logger = logging.getLogger(name) lvl = os.environ.get("PY_LOG_LVL", "info").upper() handler = logging.StreamHandler() formatter = logging.Formatter(logging.BASIC_FORMAT) @@ -63,7 +63,9 @@ def pdfimages(pdf_filepath): filename_sans_ext = filename.split(".pdf")[0] subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]]) 
image_filenames = find_matching_files_in_dir(filename_sans_ext, directory) - logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))) + logger.debug( + "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)) + ) return image_filenames diff --git a/table_ocr/util.py b/table_ocr/util.py index 8c6d3bb..05792c0 100644 --- a/table_ocr/util.py +++ b/table_ocr/util.py @@ -4,8 +4,8 @@ import logging import os import tempfile -def get_logger(): - logger = logging.getLogger(__name__) +def get_logger(name): + logger = logging.getLogger(name) lvl = os.environ.get("PY_LOG_LVL", "info").upper() handler = logging.StreamHandler() formatter = logging.Formatter(logging.BASIC_FORMAT) @@ -15,9 +15,6 @@ def get_logger(): logger.setLevel(lvl) return logger -logger = get_logger() - - @contextmanager def working_dir(directory): original_working_dir = os.getcwd()