Fix bug relating to directory of pdf

Relative paths now work.
5 years ago · 7ad4c0d4dc
parent 449ee015d3
commit 7ad4c0d4dc
2 changed files with 32 additions and 12 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@@ -129,8 +129,7 @@ def pdf_to_images(pdf_filepath):
    Returns the filenames of the created images sorted lexicographically.
    """
    directory, filename = os.path.split(pdf_filepath)
-    with working_dir(directory):
+    image_filenames = pdfimages(pdf_filepath)
        image_filenames = pdfimages(pdf_filepath)
    # Since pdfimages creates a number of files, each named for its page number,
    # and doesn't return us the list that it created
@@ -147,8 +146,14 @@ def pdfimages(pdf_filepath):
    uses 3 digits in its regex.
    """
    directory, filename = os.path.split(pdf_filepath)
    if not os.path.isabs(directory):
        directory = os.path.abspath(directory)
    filename_sans_ext = filename.split(".pdf")[0]
-    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
+
    # pdfimages outputs results to the current working directory
    with working_dir(directory):
        subprocess.run(["pdfimages", "-png", filename, filename.split(".pdf")[0]])
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
    logger.debug(
        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
@@ -189,19 +194,29 @@ to correct the rotation. This makes OCR more straightforward.
 #+NAME: fix-orientation
 #+BEGIN_SRC python :results none
-def preprocess_img(filepath):
+def preprocess_img(filepath, tess_params=None):
-    """
+    """Processing that involves running shell executables,
    Processing that involves running shell executables,
    like mogrify to rotate.
    Uses tesseract to detect rotation.
    Orientation and script detection is only available for legacy tesseract
    (--oem 0). Some versions of tesseract will segfault if you let it run OSD
    with the default oem (3).
    """
-    rotate = get_rotate(filepath)
+    if tess_params is None:
        tess_params = ["--psm", "0", "--oem", "0"]
    rotate = get_rotate(filepath, tess_params)
    logger.debug("Rotating {} by {}.".format(filepath, rotate))
    mogrify(filepath, rotate)
-def get_rotate(image_filepath):
+def get_rotate(image_filepath, tess_params):
    """
    """
    tess_command = ["tesseract"] + tess_params + [image_filepath, "-"]
    output = (
-        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
+        subprocess.check_output(tess_command)
        .decode("utf-8")
        .split("\n")
    )
--- a/table_ocr/pdf_to_images/__init__.py
+++ b/table_ocr/pdf_to_images/__init__.py
@@ -14,8 +14,7 @@ def pdf_to_images(pdf_filepath):
    Returns the filenames of the created images sorted lexicographically.
    """
    directory, filename = os.path.split(pdf_filepath)
-    with working_dir(directory):
+    image_filenames = pdfimages(pdf_filepath)
        image_filenames = pdfimages(pdf_filepath)
    # Since pdfimages creates a number of files, each named for its page number,
    # and doesn't return us the list that it created
@@ -32,8 +31,14 @@ def pdfimages(pdf_filepath):
    uses 3 digits in its regex.
    """
    directory, filename = os.path.split(pdf_filepath)
    if not os.path.isabs(directory):
        directory = os.path.abspath(directory)
    filename_sans_ext = filename.split(".pdf")[0]
-    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
+
    # pdfimages outputs results to the current working directory
    with working_dir(directory):
        subprocess.run(["pdfimages", "-png", filename, filename.split(".pdf")[0]])
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
    logger.debug(
        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))