Fix bug relating to directory of pdf

Relative paths now work.
main
Eric Ihli 4 years ago
parent 449ee015d3
commit 7ad4c0d4dc

@ -129,8 +129,7 @@ def pdf_to_images(pdf_filepath):
Returns the filenames of the created images sorted lexicographically. Returns the filenames of the created images sorted lexicographically.
""" """
directory, filename = os.path.split(pdf_filepath) directory, filename = os.path.split(pdf_filepath)
with working_dir(directory): image_filenames = pdfimages(pdf_filepath)
image_filenames = pdfimages(pdf_filepath)
# Since pdfimages creates a number of files, each named for its page number, # Since pdfimages creates a number of files, each named for its page number,
# and doesn't return us the list that it created # and doesn't return us the list that it created
@ -147,8 +146,14 @@ def pdfimages(pdf_filepath):
uses 3 digits in its regex. uses 3 digits in its regex.
""" """
directory, filename = os.path.split(pdf_filepath) directory, filename = os.path.split(pdf_filepath)
if not os.path.isabs(directory):
directory = os.path.abspath(directory)
filename_sans_ext = filename.split(".pdf")[0] filename_sans_ext = filename.split(".pdf")[0]
subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
# pdfimages outputs results to the current working directory
with working_dir(directory):
subprocess.run(["pdfimages", "-png", filename, filename.split(".pdf")[0]])
image_filenames = find_matching_files_in_dir(filename_sans_ext, directory) image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
logger.debug( logger.debug(
"Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)) "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
@ -189,19 +194,29 @@ to correct the rotation. This makes OCR more straightforward.
#+NAME: fix-orientation #+NAME: fix-orientation
#+BEGIN_SRC python :results none #+BEGIN_SRC python :results none
def preprocess_img(filepath): def preprocess_img(filepath, tess_params=None):
""" """Processing that involves running shell executables,
Processing that involves running shell executables,
like mogrify to rotate. like mogrify to rotate.
Uses tesseract to detect rotation.
Orientation and script detection is only available for legacy tesseract
(--oem 0). Some versions of tesseract will segfault if you let it run OSD
with the default oem (3).
""" """
rotate = get_rotate(filepath) if tess_params is None:
tess_params = ["--psm", "0", "--oem", "0"]
rotate = get_rotate(filepath, tess_params)
logger.debug("Rotating {} by {}.".format(filepath, rotate)) logger.debug("Rotating {} by {}.".format(filepath, rotate))
mogrify(filepath, rotate) mogrify(filepath, rotate)
def get_rotate(image_filepath): def get_rotate(image_filepath, tess_params):
"""
"""
tess_command = ["tesseract"] + tess_params + [image_filepath, "-"]
output = ( output = (
subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"]) subprocess.check_output(tess_command)
.decode("utf-8") .decode("utf-8")
.split("\n") .split("\n")
) )

@ -14,8 +14,7 @@ def pdf_to_images(pdf_filepath):
Returns the filenames of the created images sorted lexicographically. Returns the filenames of the created images sorted lexicographically.
""" """
directory, filename = os.path.split(pdf_filepath) directory, filename = os.path.split(pdf_filepath)
with working_dir(directory): image_filenames = pdfimages(pdf_filepath)
image_filenames = pdfimages(pdf_filepath)
# Since pdfimages creates a number of files, each named for its page number, # Since pdfimages creates a number of files, each named for its page number,
# and doesn't return us the list that it created # and doesn't return us the list that it created
@ -32,8 +31,14 @@ def pdfimages(pdf_filepath):
uses 3 digits in its regex. uses 3 digits in its regex.
""" """
directory, filename = os.path.split(pdf_filepath) directory, filename = os.path.split(pdf_filepath)
if not os.path.isabs(directory):
directory = os.path.abspath(directory)
filename_sans_ext = filename.split(".pdf")[0] filename_sans_ext = filename.split(".pdf")[0]
subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
# pdfimages outputs results to the current working directory
with working_dir(directory):
subprocess.run(["pdfimages", "-png", filename, filename.split(".pdf")[0]])
image_filenames = find_matching_files_in_dir(filename_sans_ext, directory) image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
logger.debug( logger.debug(
"Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)) "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))

Loading…
Cancel
Save