Make ocr_image return/print path of text file

Move the main function to the __init__ file so it can be imported by other code. Modify it so that it returns the path to the file that contains the OCR text so that calling code can find the results.
6 years ago · 99beaaa2d1
parent 6359b86e42
commit 99beaaa2d1
3 changed files with 54 additions and 48 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@@ -965,11 +965,35 @@ print("\n".join(paths))
 **** table_ocr/ocr_image/__init__.py
 #+BEGIN_SRC python :tangle table_ocr/ocr_image/__init__.py
 import math
 import os
 import cv2
 import numpy as np
 import pytesseract
 def main(image_file, tess_args):
    """
    OCR the image and output the text to a file with an extension that is ready
    to be used in Tesseract training (.gt.txt).

    Tries to crop the image so that only the relevant text gets passed to
    Tesseract. Both the cropped image and the text file are written to an
    "ocr_data" directory created inside the directory of the input image.

    Args:
        image_file: Path of the image to OCR.
        tess_args: Sequence of strings; they are joined with single spaces
            and handed to ocr_image as its second argument.

    Returns:
        The path of the ".gt.txt" file that contains the OCR text.
    """
    # Split the path so the outputs can be placed relative to the input image.
    directory, filename = os.path.split(image_file)
    filename_sans_ext, ext = os.path.splitext(filename)
    # Load the image as single-channel grayscale.
    # NOTE(review): cv2.imread returns None (it does not raise) when the file
    # cannot be read; there is no check for that here -- confirm how
    # crop_to_text behaves in that case.
    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    cropped = crop_to_text(image)
    # exist_ok=True so repeated runs over the same directory do not fail.
    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    # The cropped image keeps the original filename; the text file shares its
    # stem and gets the ".gt.txt" extension.
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)
    # OCR runs on the cropped image, not the original.
    txt = ocr_image(cropped, " ".join(tess_args))
    with open(out_txtpath, "w") as txt_file:
        txt_file.write(txt)
    # Return the path (rather than the text) so callers can locate the results.
    return out_txtpath
 <<crop-to-text>>
 <<ocr-image>>
 #+END_SRC
@@ -982,13 +1006,8 @@ https://github.com/tesseract-ocr/tesstrain.
 #+BEGIN_SRC python :tangle table_ocr/ocr_image/__main__.py :mkdirp yes :results none
 import argparse
 import math
 import os
 import sys
-import cv2
+from table_ocr.ocr_image import main
 from table_ocr.ocr_image import crop_to_text, ocr_image
 description="""Takes a single argument that is the image to OCR.
 Remaining arguments are passed directly to Tesseract.
@@ -999,24 +1018,8 @@ Filenames are of the format for training with tesstrain."""
 parser = argparse.ArgumentParser(description=description)
 parser.add_argument("image", help="filepath of image to perform OCR")
-def main(image_file, tess_args):
+args, tess_args = parser.parse_known_args()
-    directory, filename = os.path.split(image_file)
+print(main(args.image, tess_args))
    filename_sans_ext, ext = os.path.splitext(filename)
    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    cropped = crop_to_text(image)
    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)
    txt = ocr_image(cropped, " ".join(tess_args))
    print(txt)
    with open(out_txtpath, "w") as txt_file:
        txt_file.write(txt)
 if __name__ == "__main__":
    args, tess_args = parser.parse_known_args()
    main(args.image, tess_args)
 #+END_SRC
 *** table_ocr/ocr_to_csv/
 **** table_ocr/ocr_to_csv/__init__.py
--- a/table_ocr/ocr_image/__init__.py
+++ b/table_ocr/ocr_image/__init__.py
@@ -1,9 +1,33 @@
 import math
 import os
 import cv2
 import numpy as np
 import pytesseract
 def main(image_file, tess_args):
    """
    OCR the image and output the text to a file with an extension that is ready
    to be used in Tesseract training (.gt.txt).

    Tries to crop the image so that only the relevant text gets passed to
    Tesseract. Both the cropped image and the text file are written to an
    "ocr_data" directory created inside the directory of the input image.

    Args:
        image_file: Path of the image to OCR.
        tess_args: Sequence of strings; they are joined with single spaces
            and handed to ocr_image as its second argument.

    Returns:
        The path of the ".gt.txt" file that contains the OCR text.
    """
    # Split the path so the outputs can be placed relative to the input image.
    directory, filename = os.path.split(image_file)
    filename_sans_ext, ext = os.path.splitext(filename)
    # Load the image as single-channel grayscale.
    # NOTE(review): cv2.imread returns None (it does not raise) when the file
    # cannot be read; there is no check for that here -- confirm how
    # crop_to_text behaves in that case.
    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    cropped = crop_to_text(image)
    # exist_ok=True so repeated runs over the same directory do not fail.
    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    # The cropped image keeps the original filename; the text file shares its
    # stem and gets the ".gt.txt" extension.
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)
    # OCR runs on the cropped image, not the original.
    txt = ocr_image(cropped, " ".join(tess_args))
    with open(out_txtpath, "w") as txt_file:
        txt_file.write(txt)
    # Return the path (rather than the text) so callers can locate the results.
    return out_txtpath
 def crop_to_text(image):
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
--- a/table_ocr/ocr_image/__main__.py
+++ b/table_ocr/ocr_image/__main__.py
@@ -1,11 +1,6 @@
 import argparse
 import math
 import os
 import sys
-import cv2
+from table_ocr.ocr_image import main
 from table_ocr.ocr_image import crop_to_text, ocr_image
 description="""Takes a single argument that is the image to OCR.
 Remaining arguments are passed directly to Tesseract.
@@ -16,21 +11,5 @@ Filenames are of the format for training with tesstrain."""
 parser = argparse.ArgumentParser(description=description)
 parser.add_argument("image", help="filepath of image to perform OCR")
-def main(image_file, tess_args):
+args, tess_args = parser.parse_known_args()
-    directory, filename = os.path.split(image_file)
+print(main(args.image, tess_args))
    filename_sans_ext, ext = os.path.splitext(filename)
    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    cropped = crop_to_text(image)
    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)
    txt = ocr_image(cropped, " ".join(tess_args))
    print(txt)
    with open(out_txtpath, "w") as txt_file:
        txt_file.write(txt)
 if __name__ == "__main__":
    args, tess_args = parser.parse_known_args()
    main(args.image, tess_args)