import math
import os
import sys

import cv2
import numpy as np
import pytesseract


def main(image_file, tess_args):
    """
    OCR the image and output the text to a file with an extension that is ready
    to be used in Tesseract training (.gt.txt).

    Tries to crop the image so that only the relevant text gets passed to
    Tesseract. Both the cropped image and the text file are written to an
    ``ocr_data`` directory created next to ``image_file``.

    Args:
        image_file: Path of the image to OCR. It is loaded as grayscale.
        tess_args: Sequence of Tesseract command-line argument strings. They
            are joined with single spaces and passed as the pytesseract
            ``config``. When empty or None, a default set is used that points
            ``--tessdata-dir`` at the ``tessdata`` directory located beside
            the ``table_ocr`` module.

    Returns the name of the text file that contains the text.

    Raises:
        OSError: If OpenCV could not read ``image_file``.
        KeyError: If the defaults are needed but ``table_ocr`` is not present
            in ``sys.modules``.
    """
    directory, filename = os.path.split(image_file)
    filename_sans_ext, ext = os.path.splitext(filename)

    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with a clear message instead of deep inside the cropping.
        raise OSError("Could not read image file: {}".format(image_file))
    cropped = crop_to_text(image)

    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)

    if not tess_args:
        # Fall back to the tessdata directory that sits beside the table_ocr
        # module; this lookup requires table_ocr to have been imported already.
        d = os.path.dirname(sys.modules["table_ocr"].__file__)
        tessdata_dir = os.path.join(d, "tessdata")
        tess_args = ["--psm", "7", "-l", "table-ocr", "--tessdata-dir", tessdata_dir]

    txt = ocr_image(cropped, " ".join(tess_args))
    # Write UTF-8 explicitly so non-ASCII OCR output does not depend on (or
    # fail under) the platform's default locale encoding.
    with open(out_txtpath, "w", encoding="utf-8") as txt_file:
        txt_file.write(txt)
    return out_txtpath


def crop_to_text(image):
    """
    Crop a grayscale image down to the region that appears to contain text.

    Long horizontal and vertical strokes are removed from a binarized copy of
    the image, small specks are opened away, and the union of the bounding
    boxes of the remaining character-sized blobs defines the crop.

    Args:
        image: 2-D (single channel) image array; its shape is unpacked as
            ``(height, width)``.

    Returns:
        The cropped region of ``image`` surrounded by a 5 pixel border of
        value 255. If no character-sized blob is found, a blank 20x100 image
        of value 255 is used in place of the crop.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    # Threshold the inverted image so that dark marks become non-zero pixels.
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    # Opening with a long, 1 pixel thick kernel keeps only strokes spanning at
    # least half the width (horizontal) or 70% of the height (vertical).
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    # Combine the masks with a bitwise OR and subtract with saturation. Plain
    # uint8 `+` wraps 255 + 255 around to 254, which left value-1 pixels where
    # the lines cross; findContours treats any non-zero pixel as foreground.
    both = cv2.bitwise_or(horizontal_lines, vertical_lines)
    cleaned = cv2.subtract(img_bin, both)

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    opened = cv2.dilate(opened, kernel)

    # Index from the end: findContours returns (contours, hierarchy) in some
    # OpenCV versions and (image, contours, hierarchy) in others.
    contours = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    bounding_rects = [cv2.boundingRect(c) for c in contours]

    # Extra rows kept below the lowest box; going by the name, presumably so
    # that commas hanging below the text line are not cut off.
    NUM_PX_COMMA = 6
    # Boxes with an area at or below this are ignored (presumably too small to
    # be a character).
    MIN_CHAR_AREA = 5 * 9
    char_sized_bounding_rects = [
        (x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA
    ]

    if char_sized_bounding_rects:
        # Smallest rectangle enclosing every character-sized box.
        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
        for x, y, w, h in char_sized_bounding_rects:
            minx = min(minx, x)
            miny = min(miny, y)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + h)
        # Clamp the far edges so the slice stays inside the image.
        cropped = image[miny:min(img_h, maxy + NUM_PX_COMMA), minx:min(img_w, maxx)]
    else:
        # If we morphed out all of the text, assume an empty image.
        cropped = np.full((20, 100), MAX_COLOR_VAL, dtype=np.uint8)

    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered


def ocr_image(image, config):
    """
    Run Tesseract on ``image`` and return the recognized text.

    Args:
        image: Image to pass to ``pytesseract.image_to_string``.
        config: Tesseract configuration string (extra command-line options).

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, config=config)