You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
3.1 KiB
Python
86 lines
3.1 KiB
Python
5 years ago
|
import argparse
|
||
|
import math
|
||
5 years ago
|
import os
|
||
5 years ago
|
import sys
|
||
|
|
||
|
import cv2
|
||
|
import pytesseract
|
||
|
|
||
5 years ago
|
# Help text shown by ``--help``. It is written as two paragraphs, so the
# parser below uses RawDescriptionHelpFormatter: argparse's default formatter
# would collapse all whitespace and merge them into a single paragraph.
description = """Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract.

Attempts to make OCR more accurate by performing some modifications on the image.
Saves the modified image and the OCR text in an `ocr_data` directory.
Filenames are of the format for training with tesstrain."""

# Only the image path is declared here; the entry point parses with
# parse_known_args(), so any other argument is collected separately.
parser = argparse.ArgumentParser(
    description=description,
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument("image", help="filepath of image to perform OCR")
|
||
|
|
||
5 years ago
|
def crop_to_text(image):
    """Crop a grayscale image down to the region that contains text.

    Long horizontal/vertical rules and small specks are removed from a
    binarised copy of the image. The bounding box of the remaining
    character-sized blobs is then cut out of the *original* image and padded
    with a 5 px border of value 255.

    Args:
        image: 2-D (single-channel) 8-bit image array, e.g. the result of
            ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``.

    Returns:
        The cropped and bordered image. When no blob larger than
        ``MIN_CHAR_AREA`` survives the clean-up, the whole input image is
        bordered and returned instead.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # Extra rows kept below the lowest blob; the name suggests this leaves
    # room for commas hanging below the baseline.
    NUM_PX_COMMA = 6
    # Blobs whose bounding box is not larger than this are ignored.
    MIN_CHAR_AREA = 5 * 9

    # Threshold the inverted image: pixels darker than their neighbourhood in
    # the original become 255 (foreground), everything else 0.
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    # Opening with a long 1 px thick kernel keeps only strokes spanning at
    # least half the width (or 70% of the height), i.e. ruled lines.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    # Use OpenCV's mask operations rather than `+` / `-`: NumPy uint8
    # arithmetic wraps around (255 + 255 -> 254), which used to leave
    # value-1 pixels wherever a horizontal and a vertical line crossed.
    both = cv2.bitwise_or(horizontal_lines, vertical_lines)
    cleaned = cv2.subtract(img_bin, both)

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

    # findContours returns (contours, hierarchy) or (image, contours,
    # hierarchy) depending on the OpenCV version; [-2] is the contour list
    # in both cases.
    contours = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    char_rects = [
        (x, y, w, h)
        for x, y, w, h in map(cv2.boundingRect, contours)
        if w * h > MIN_CHAR_AREA
    ]

    if char_rects:
        minx = min(x for x, _, _, _ in char_rects)
        miny = min(y for _, y, _, _ in char_rects)
        maxx = max(x + w for x, _, w, _ in char_rects)
        maxy = max(y + h for _, y, _, h in char_rects)
        cropped = image[miny:min(img_h, maxy + NUM_PX_COMMA), minx:min(img_w, maxx)]
    else:
        # If we morphed out all of the text, or only specks too small to be
        # characters remain, fall back to using the unmorphed image. (Checking
        # the filtered list matters: with contours present but all too small,
        # the bounds would otherwise stay at infinity and the slice would fail.)
        cropped = image

    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
|
||
|
def ocr_image(image, config):
    """Run Tesseract OCR on an image via pytesseract.

    Args:
        image: image handed to ``pytesseract.image_to_string``.
        config: value forwarded as that call's ``config`` keyword argument.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    recognised = pytesseract.image_to_string(image, config=config)
    return recognised
|
||
|
|
||
5 years ago
|
def main(image_file, tess_args):
    """OCR one image file and save the results beside it.

    The image is read as grayscale, cropped with ``crop_to_text`` and passed
    to ``ocr_image``. The cropped image and the recognised text are written to
    an ``ocr_data`` directory next to the input, as ``<filename>`` and
    ``<filename without extension>.gt.txt``; the text is also printed.

    Args:
        image_file: path of the image to OCR.
        tess_args: list of extra argument strings; they are joined with single
            spaces and passed to ``ocr_image`` as its config string.

    Raises:
        OSError: if OpenCV cannot read ``image_file``.
    """
    directory, filename = os.path.split(image_file)
    filename_sans_ext, _ = os.path.splitext(filename)

    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising; fail here with a clear message instead of an
        # AttributeError further down.
        raise OSError("could not read image: {}".format(image_file))

    cropped = crop_to_text(image)

    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)

    # NOTE(review): joining with spaces drops any quoting, so an argument that
    # itself contains whitespace will not survive as a single token.
    txt = ocr_image(cropped, " ".join(tess_args))
    print(txt)

    # OCR output may contain non-ASCII characters; write UTF-8 explicitly so
    # the result does not depend on the platform's default encoding.
    with open(out_txtpath, "w", encoding="utf-8") as txt_file:
        txt_file.write(txt)
|
||
5 years ago
|
|
||
|
if __name__ == "__main__":
    # Script entry point: the parser only knows the image path, so every
    # argument it does not recognise is collected and handed on to main().
    known, passthrough = parser.parse_known_args()
    main(known.image, passthrough)
|