diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index 327e867..52d01e6 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -965,11 +965,35 @@ print("\n".join(paths)) **** table_ocr/ocr_image/__init__.py #+BEGIN_SRC python :tangle table_ocr/ocr_image/__init__.py import math +import os import cv2 import numpy as np import pytesseract +def main(image_file, tess_args): + """ + OCR the image and output the text to a file with an extension that is ready + to be used in Tesseract training (.gt.txt). + + Tries to crop the image so that only the relevant text gets passed to Tesseract. + + Returns the name of the text file that contains the text. + """ + directory, filename = os.path.split(image_file) + filename_sans_ext, ext = os.path.splitext(filename) + image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) + cropped = crop_to_text(image) + ocr_data_dir = os.path.join(directory, "ocr_data") + os.makedirs(ocr_data_dir, exist_ok=True) + out_imagepath = os.path.join(ocr_data_dir, filename) + out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) + cv2.imwrite(out_imagepath, cropped) + txt = ocr_image(cropped, " ".join(tess_args)) + with open(out_txtpath, "w") as txt_file: + txt_file.write(txt) + return out_txtpath + <> <> #+END_SRC @@ -982,13 +1006,8 @@ https://github.com/tesseract-ocr/tesstrain. #+BEGIN_SRC python :tangle table_ocr/ocr_image/__main__.py :mkdirp yes :results none import argparse -import math -import os -import sys -import cv2 - -from table_ocr.ocr_image import crop_to_text, ocr_image +from table_ocr.ocr_image import main description="""Takes a single argument that is the image to OCR. Remaining arguments are passed directly to Tesseract. @@ -999,24 +1018,8 @@ Filenames are of the format for training with tesstrain.""" parser = argparse.ArgumentParser(description=description) parser.add_argument("image", help="filepath of image to perform OCR") -def main(image_file, tess_args): - directory, filename = os.path.split(image_file) - filename_sans_ext, ext = os.path.splitext(filename) - image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) - cropped = crop_to_text(image) - ocr_data_dir = os.path.join(directory, "ocr_data") - os.makedirs(ocr_data_dir, exist_ok=True) - out_imagepath = os.path.join(ocr_data_dir, filename) - out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) - cv2.imwrite(out_imagepath, cropped) - txt = ocr_image(cropped, " ".join(tess_args)) - print(txt) - with open(out_txtpath, "w") as txt_file: - txt_file.write(txt) - -if __name__ == "__main__": - args, tess_args = parser.parse_known_args() - main(args.image, tess_args) +args, tess_args = parser.parse_known_args() +print(main(args.image, tess_args)) #+END_SRC *** table_ocr/ocr_to_csv/ **** table_ocr/ocr_to_csv/__init__.py diff --git a/table_ocr/ocr_image/__init__.py b/table_ocr/ocr_image/__init__.py index 526202c..e952522 100644 --- a/table_ocr/ocr_image/__init__.py +++ b/table_ocr/ocr_image/__init__.py @@ -1,9 +1,33 @@ import math +import os import cv2 import numpy as np import pytesseract +def main(image_file, tess_args): + """ + OCR the image and output the text to a file with an extension that is ready + to be used in Tesseract training (.gt.txt). + + Tries to crop the image so that only the relevant text gets passed to Tesseract. + + Returns the name of the text file that contains the text. + """ + directory, filename = os.path.split(image_file) + filename_sans_ext, ext = os.path.splitext(filename) + image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) + cropped = crop_to_text(image) + ocr_data_dir = os.path.join(directory, "ocr_data") + os.makedirs(ocr_data_dir, exist_ok=True) + out_imagepath = os.path.join(ocr_data_dir, filename) + out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) + cv2.imwrite(out_imagepath, cropped) + txt = ocr_image(cropped, " ".join(tess_args)) + with open(out_txtpath, "w") as txt_file: + txt_file.write(txt) + return out_txtpath + def crop_to_text(image): MAX_COLOR_VAL = 255 BLOCK_SIZE = 15 diff --git a/table_ocr/ocr_image/__main__.py b/table_ocr/ocr_image/__main__.py index f849b53..67dd4c2 100644 --- a/table_ocr/ocr_image/__main__.py +++ b/table_ocr/ocr_image/__main__.py @@ -1,11 +1,6 @@ import argparse -import math -import os -import sys -import cv2 - -from table_ocr.ocr_image import crop_to_text, ocr_image +from table_ocr.ocr_image import main description="""Takes a single argument that is the image to OCR. Remaining arguments are passed directly to Tesseract. @@ -16,21 +11,5 @@ Filenames are of the format for training with tesstrain.""" parser = argparse.ArgumentParser(description=description) parser.add_argument("image", help="filepath of image to perform OCR") -def main(image_file, tess_args): - directory, filename = os.path.split(image_file) - filename_sans_ext, ext = os.path.splitext(filename) - image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) - cropped = crop_to_text(image) - ocr_data_dir = os.path.join(directory, "ocr_data") - os.makedirs(ocr_data_dir, exist_ok=True) - out_imagepath = os.path.join(ocr_data_dir, filename) - out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) - cv2.imwrite(out_imagepath, cropped) - txt = ocr_image(cropped, " ".join(tess_args)) - print(txt) - with open(out_txtpath, "w") as txt_file: - txt_file.write(txt) - -if __name__ == "__main__": - args, tess_args = parser.parse_known_args() - main(args.image, tess_args) +args, tess_args = parser.parse_known_args() +print(main(args.image, tess_args))