From 99beaaa2d1145a7a5630c1bcdef769c0423b82fc Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Sun, 26 Apr 2020 18:29:04 -0700 Subject: [PATCH] Make ocr_image return/print path of text file Move the main function to the __init__ file so it can be imported by other code. Modify it so that it returns the path to the file that contains the OCR text so that calling code can find the results. --- pdf_table_extraction_and_ocr.org | 51 +++++++++++++++++--------------- table_ocr/ocr_image/__init__.py | 24 +++++++++++++++ table_ocr/ocr_image/__main__.py | 27 ++--------------- 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index 327e867..52d01e6 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -965,11 +965,35 @@ print("\n".join(paths)) **** table_ocr/ocr_image/__init__.py #+BEGIN_SRC python :tangle table_ocr/ocr_image/__init__.py import math +import os import cv2 import numpy as np import pytesseract +def main(image_file, tess_args): + """ + OCR the image and output the text to a file with an extension that is ready + to be used in Tesseract training (.gt.txt). + + Tries to crop the image so that only the relevant text gets passed to Tesseract. + + Returns the name of the text file that contains the text. 
+ """ + directory, filename = os.path.split(image_file) + filename_sans_ext, ext = os.path.splitext(filename) + image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) + cropped = crop_to_text(image) + ocr_data_dir = os.path.join(directory, "ocr_data") + os.makedirs(ocr_data_dir, exist_ok=True) + out_imagepath = os.path.join(ocr_data_dir, filename) + out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) + cv2.imwrite(out_imagepath, cropped) + txt = ocr_image(cropped, " ".join(tess_args)) + with open(out_txtpath, "w") as txt_file: + txt_file.write(txt) + return out_txtpath + <> <> #+END_SRC @@ -982,13 +1006,8 @@ https://github.com/tesseract-ocr/tesstrain. #+BEGIN_SRC python :tangle table_ocr/ocr_image/__main__.py :mkdirp yes :results none import argparse -import math -import os -import sys -import cv2 - -from table_ocr.ocr_image import crop_to_text, ocr_image +from table_ocr.ocr_image import main description="""Takes a single argument that is the image to OCR. Remaining arguments are passed directly to Tesseract. 
@@ -999,24 +1018,8 @@ Filenames are of the format for training with tesstrain.""" parser = argparse.ArgumentParser(description=description) parser.add_argument("image", help="filepath of image to perform OCR") -def main(image_file, tess_args): - directory, filename = os.path.split(image_file) - filename_sans_ext, ext = os.path.splitext(filename) - image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) - cropped = crop_to_text(image) - ocr_data_dir = os.path.join(directory, "ocr_data") - os.makedirs(ocr_data_dir, exist_ok=True) - out_imagepath = os.path.join(ocr_data_dir, filename) - out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) - cv2.imwrite(out_imagepath, cropped) - txt = ocr_image(cropped, " ".join(tess_args)) - print(txt) - with open(out_txtpath, "w") as txt_file: - txt_file.write(txt) - -if __name__ == "__main__": - args, tess_args = parser.parse_known_args() - main(args.image, tess_args) +args, tess_args = parser.parse_known_args() +print(main(args.image, tess_args)) #+END_SRC *** table_ocr/ocr_to_csv/ **** table_ocr/ocr_to_csv/__init__.py diff --git a/table_ocr/ocr_image/__init__.py b/table_ocr/ocr_image/__init__.py index 526202c..e952522 100644 --- a/table_ocr/ocr_image/__init__.py +++ b/table_ocr/ocr_image/__init__.py @@ -1,9 +1,33 @@ import math +import os import cv2 import numpy as np import pytesseract +def main(image_file, tess_args): + """ + OCR the image and output the text to a file with an extension that is ready + to be used in Tesseract training (.gt.txt). + + Tries to crop the image so that only the relevant text gets passed to Tesseract. + + Returns the name of the text file that contains the text. 
+ """ + directory, filename = os.path.split(image_file) + filename_sans_ext, ext = os.path.splitext(filename) + image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) + cropped = crop_to_text(image) + ocr_data_dir = os.path.join(directory, "ocr_data") + os.makedirs(ocr_data_dir, exist_ok=True) + out_imagepath = os.path.join(ocr_data_dir, filename) + out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) + cv2.imwrite(out_imagepath, cropped) + txt = ocr_image(cropped, " ".join(tess_args)) + with open(out_txtpath, "w") as txt_file: + txt_file.write(txt) + return out_txtpath + def crop_to_text(image): MAX_COLOR_VAL = 255 BLOCK_SIZE = 15 diff --git a/table_ocr/ocr_image/__main__.py b/table_ocr/ocr_image/__main__.py index f849b53..67dd4c2 100644 --- a/table_ocr/ocr_image/__main__.py +++ b/table_ocr/ocr_image/__main__.py @@ -1,11 +1,6 @@ import argparse -import math -import os -import sys -import cv2 - -from table_ocr.ocr_image import crop_to_text, ocr_image +from table_ocr.ocr_image import main description="""Takes a single argument that is the image to OCR. Remaining arguments are passed directly to Tesseract. 
@@ -16,21 +11,5 @@ Filenames are of the format for training with tesstrain.""" parser = argparse.ArgumentParser(description=description) parser.add_argument("image", help="filepath of image to perform OCR") -def main(image_file, tess_args): - directory, filename = os.path.split(image_file) - filename_sans_ext, ext = os.path.splitext(filename) - image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) - cropped = crop_to_text(image) - ocr_data_dir = os.path.join(directory, "ocr_data") - os.makedirs(ocr_data_dir, exist_ok=True) - out_imagepath = os.path.join(ocr_data_dir, filename) - out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) - cv2.imwrite(out_imagepath, cropped) - txt = ocr_image(cropped, " ".join(tess_args)) - print(txt) - with open(out_txtpath, "w") as txt_file: - txt_file.write(txt) - -if __name__ == "__main__": - args, tess_args = parser.parse_known_args() - main(args.image, tess_args) +args, tess_args = parser.parse_known_args() +print(main(args.image, tess_args))