You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
87 lines
3.2 KiB
Python
87 lines
3.2 KiB
Python
import math
|
|
import os
|
|
import sys
|
|
|
|
import cv2
|
|
import numpy as np
|
|
import pytesseract
|
|
|
|
def main(image_file, tess_args):
    """
    OCR the image and output the text to a file with an extension that is ready
    to be used in Tesseract training (.gt.txt).

    Tries to crop the image so that only the relevant text gets passed to Tesseract.

    Both outputs are written to an ``ocr_data`` directory created next to the
    input image: the cropped image (same file name as the input) and the
    recognized text (``<name>.gt.txt``).

    Args:
        image_file: Path of the image to OCR. It is loaded as grayscale.
        tess_args: Sequence of Tesseract command-line argument strings. When
            empty or None, defaults are used: ``--psm 7 -l table-ocr`` plus a
            ``--tessdata-dir`` pointing at the ``tessdata`` directory that
            sits next to the imported ``table_ocr`` package.

    Returns the name of the text file that contains the text.

    Raises:
        ValueError: If the image file cannot be read.
    """
    directory, filename = os.path.split(image_file)
    filename_sans_ext = os.path.splitext(filename)[0]

    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; fail here with a clear message before any
        # output is produced.
        raise ValueError("Could not read image file: {}".format(image_file))
    cropped = crop_to_text(image)

    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)

    if not tess_args:
        # Fall back to the trained data located beside the table_ocr package.
        d = os.path.dirname(sys.modules["table_ocr"].__file__)
        tessdata_dir = os.path.join(d, "tessdata")
        tess_args = ["--psm", "7", "-l", "table-ocr", "--tessdata-dir", tessdata_dir]

    # NOTE(review): the arguments are joined with single spaces, so a value
    # that itself contains spaces (e.g. a tessdata path) presumably needs to
    # be quoted by the caller -- confirm against pytesseract's config parsing.
    txt = ocr_image(cropped, " ".join(tess_args))

    # Write UTF-8 explicitly so the result does not depend on the platform's
    # default encoding when the recognized text contains non-ASCII characters.
    with open(out_txtpath, "w", encoding="utf-8") as txt_file:
        txt_file.write(txt)
    return out_txtpath
|
|
|
|
def crop_to_text(image):
    """
    Crop a grayscale image down to the area that holds character-sized content.

    The image is inverted and adaptively thresholded, long horizontal and
    vertical lines are removed, small noise is opened away, and the union of
    the bounding boxes of all remaining character-sized contours is used as
    the crop region.

    Args:
        image: Two-dimensional (single channel) image array; ``image.shape``
            must unpack to ``(height, width)``.

    Returns:
        The cropped image surrounded by a 5 px border of value 255. If no
        character-sized contour is left, a blank 20x100 image of value 255
        (plus the same border) is returned instead.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    # Threshold the inverted image so that dark content becomes foreground.
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    # Anything spanning at least half the width / 70% of the height counts as
    # a line. Keep each kernel at least 1 px long so that very small images do
    # not produce a zero-sized structuring element.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, int(img_w * 0.5)), 1)
    )
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, int(img_h * 0.7)))
    )
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    # Combine and remove the lines without uint8 wrap-around: plain `+` turns
    # 255 + 255 into 254 where the two line masks overlap, and the following
    # `-` would then leave value-1 pixels there that findContours treats as
    # foreground.
    both = cv2.bitwise_or(horizontal_lines, vertical_lines)
    cleaned = cv2.subtract(img_bin, both)

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    opened = cv2.dilate(opened, kernel)

    # findContours returns (contours, hierarchy) or (image, contours,
    # hierarchy) depending on the OpenCV major version; the contour list is
    # the second-to-last item in both layouts.
    contours = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    bounding_rects = [cv2.boundingRect(c) for c in contours]

    # Extra rows kept below the detected text; presumably leaves room for
    # marks such as commas that hang below the line -- TODO confirm.
    NUM_PX_COMMA = 6
    MIN_CHAR_AREA = 5 * 9
    char_sized_bounding_rects = [
        (x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA
    ]

    if char_sized_bounding_rects:
        # Union of all character-sized boxes.
        minx = min(x for x, _, _, _ in char_sized_bounding_rects)
        miny = min(y for _, y, _, _ in char_sized_bounding_rects)
        maxx = max(x + w for x, _, w, _ in char_sized_bounding_rects)
        maxy = max(y + h for _, y, _, h in char_sized_bounding_rects)
        cropped = image[miny:min(img_h, maxy + NUM_PX_COMMA), minx:min(img_w, maxx)]
    else:
        # If we morphed out all of the text, assume an empty image.
        cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)

    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
|
|
def ocr_image(image, config):
    """
    Run OCR on ``image`` via pytesseract and return the recognized text.

    ``config`` is forwarded unchanged as the ``config`` keyword argument of
    ``pytesseract.image_to_string``.
    """
    recognized_text = pytesseract.image_to_string(image, config=config)
    return recognized_text
|