You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

86 lines
3.1 KiB
Python

import argparse
import math
import os
import sys
import cv2
import pytesseract
# Help text shown by `--help`. Built from adjacent literals so each sentence
# sits on its own source line; the resulting string has one sentence per line
# and no trailing newline.
description = (
    "Takes a single argument that is the image to OCR.\n"
    "Remaining arguments are passed directly to Tesseract.\n"
    "Attempts to make OCR more accurate by performing some modifications on the image.\n"
    "Saves the modified image and the OCR text in an `ocr_data` directory.\n"
    "Filenames are of the format for training with tesstrain."
)

# Only the image path is declared here; the entry point uses
# `parse_known_args`, so unrecognised options are collected separately
# instead of being rejected.
parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR")
def crop_to_text(image):
    """Crop a grayscale image to the area covered by text and pad it in white.

    The image is binarised, long horizontal/vertical strokes and small specks
    are removed, and the union of the bounding boxes of the remaining
    character-sized blobs is used as the crop window.

    Args:
        image: Single-channel 8-bit image array (``image.shape`` must unpack
            into ``(height, width)``).

    Returns:
        The cropped image surrounded by a 5-pixel border of value 255. If no
        character-sized blob survives the clean-up, the whole input image is
        bordered and returned instead.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # Extra rows kept below the lowest detected box (going by the name, this
    # is meant to keep commas hanging below the baseline -- confirm).
    NUM_PX_COMMA = 6
    # Blobs whose bounding box is not larger than this are treated as noise.
    MIN_CHAR_AREA = 5 * 9

    # Threshold the inverted image so that dark ink becomes the non-zero
    # foreground the morphology and contour steps operate on.
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    img_h, img_w = image.shape

    # Opening with a very long, 1-pixel-thick kernel keeps only strokes that
    # span at least half the width / 70% of the height (presumably table
    # rules or box borders rather than glyphs).
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    # Combine and remove the lines with OpenCV's mask/saturating operations.
    # Plain uint8 `+` wraps (255 + 255 -> 254), which would leave faint
    # non-zero residue where a horizontal and a vertical line cross.
    both = cv2.bitwise_or(horizontal_lines, vertical_lines)
    cleaned = cv2.subtract(img_bin, both)

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

    # OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
    # (contours, hierarchy); the contours are second-to-last in both.
    contours = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Keep only boxes big enough to plausibly be a character.
    char_rects = [
        (x, y, w, h)
        for x, y, w, h in (cv2.boundingRect(c) for c in contours)
        if w * h > MIN_CHAR_AREA
    ]

    if char_rects:
        min_x = min(x for x, _, _, _ in char_rects)
        min_y = min(y for _, y, _, _ in char_rects)
        max_x = max(x + w for x, _, w, _ in char_rects)
        max_y = max(y + h for _, y, _, h in char_rects)
        cropped = image[
            min_y:min(img_h, max_y + NUM_PX_COMMA),
            min_x:min(img_w, max_x),
        ]
    else:
        # If we morphed out all of the text (or only specks remain), fall
        # back to using the unmorphed image rather than slicing with
        # undefined bounds.
        cropped = image

    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
def ocr_image(image, config):
    """Return the text pytesseract recognises in ``image``.

    Args:
        image: Image object handed unchanged to
            ``pytesseract.image_to_string``.
        config: Extra Tesseract options as a single string, forwarded via
            the ``config`` keyword.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    recognized_text = pytesseract.image_to_string(image, config=config)
    return recognized_text
def main(image_file, tess_args):
    """OCR one image and store the cropped image plus its text.

    The image is loaded as grayscale, cropped with ``crop_to_text`` and
    written to an ``ocr_data`` directory next to the input file under the
    same file name. The recognised text is printed and saved alongside it as
    ``<name>.gt.txt``.

    Args:
        image_file: Path of the image to process.
        tess_args: List of extra command-line options; they are joined with
            spaces and passed to Tesseract as its config string.

    Raises:
        OSError: If the image cannot be read or decoded.
    """
    directory, filename = os.path.split(image_file)
    filename_sans_ext, _ = os.path.splitext(filename)

    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals a missing or undecodable file by returning None
        # instead of raising; fail here with a clear message rather than
        # deep inside the cropping code.
        raise OSError("unable to read image: {!r}".format(image_file))

    cropped = crop_to_text(image)

    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))

    # cv2.imwrite reports failure through its return value; surface it but
    # still run OCR so the text output is not lost.
    if not cv2.imwrite(out_imagepath, cropped):
        print("warning: could not write {!r}".format(out_imagepath), file=sys.stderr)

    txt = ocr_image(cropped, " ".join(tess_args))
    print(txt)

    # OCR output may contain non-ASCII characters; write UTF-8 explicitly so
    # the result does not depend on the platform's default encoding.
    with open(out_txtpath, "w", encoding="utf-8") as txt_file:
        txt_file.write(txt)
if __name__ == "__main__":
    # Options argparse does not recognise are returned separately and handed
    # on to main() as the Tesseract arguments.
    known_args, passthrough_args = parser.parse_known_args()
    main(known_args.image, passthrough_args)