image-table-ocr/table_ocr/ocr_image.py

import os
import sys

import cv2
import pytesseract

def crop_to_text(image):
    """Crop a grayscale image down to its largest block of text.

    The image is inverted and adaptively thresholded, small specks are
    removed with a morphological opening, and the remainder is dilated
    horizontally so neighbouring characters merge into one contour. The
    bounding rectangle with the largest area is taken to be the text.

    Args:
        image: 2-D (single-channel) 8-bit image array, as returned by
            ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``. Presumably dark
            text on a light background, since the image is inverted
            before thresholding -- confirm against callers.

    Returns:
        The cropped image surrounded by a 5 pixel white border. If no
        contour is found, the whole image is bordered and returned.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    DILATION_WIDTH = 16
    # The horizontal dilation widens each blob by about half the kernel
    # width on either side; that much is trimmed off again after cropping.
    DILATION_PAD = DILATION_WIDTH // 2
    # Extra rows kept below the bounding box (see the comma note below).
    DESCENDER_MARGIN = 6

    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)

    # Dilate so each digit is connected, so we can get a bounding rectangle
    # around all of the digits as one contour. This makes the bounding
    # rectangle wider on the left and right, so we crop that back out at
    # the end so that we don't pick up stray border pixels.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (DILATION_WIDTH, 1))
    dilated = cv2.dilate(opened, kernel)

    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x and 4.x
    # return (contours, hierarchy); the contours are always second to last.
    contours = cv2.findContours(
        dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    bounding_rects = [cv2.boundingRect(c) for c in contours]

    if bounding_rects:
        # The largest contour is assumed to be the text we're looking for.
        x, y, w, h = max(bounding_rects, key=lambda r: r[2] * r[3])

        # Commas sometimes go a little below the bounding box and we don't
        # want to lose them or turn them into periods.
        img_h = image.shape[0]
        bottom = min(img_h, y + h + DESCENDER_MARGIN)

        left = x + DILATION_PAD
        right = x + w - DILATION_PAD
        if right <= left:
            # The rectangle is too narrow to trim (e.g. the dilation was
            # clipped by the image edge). Trimming would give an empty or
            # negative-wrapped slice, so keep the untrimmed rectangle.
            left, right = x, x + w
        cropped = image[y:bottom, left:right]
    else:
        cropped = image

    # Pad with white (255) on every side.
    return cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
def ocr_image(image, config):
    """Run Tesseract on ``image`` and return the recognized text.

    Args:
        image: The image to recognize, passed straight through to
            ``pytesseract.image_to_string``.
        config: Tesseract command-line options string (for example
            ``"--psm 7"``), forwarded as the ``config`` keyword.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    recognized_text = pytesseract.image_to_string(image, config=config)
    return recognized_text

def main(f):
    """OCR the image at path ``f`` and save the results next to it.

    An ``ocr_data`` directory is created beside the input image (if it
    does not already exist). The text-cropped image is written there under
    the original filename, and the recognized text is written there as
    ``<filename without extension>.gt.txt``.

    Args:
        f: Path to the image file to process.

    Raises:
        OSError: If OpenCV could not read an image from ``f``.
    """
    directory, filename = os.path.split(f)
    filename_sans_ext, _ext = os.path.splitext(filename)

    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals a missing or undecodable file by returning
        # None rather than raising; fail here with the offending path
        # instead of an obscure TypeError further down.
        raise OSError("could not read image: {}".format(f))

    cropped = crop_to_text(image)

    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))

    cv2.imwrite(out_imagepath, cropped)

    # "--psm 7" tells Tesseract to treat the image as a single text line.
    txt = ocr_image(cropped, "--psm 7")
    # Write UTF-8 explicitly so non-ASCII OCR output does not depend on
    # the platform's default encoding.
    with open(out_txtpath, "w", encoding="utf-8") as txt_file:
        txt_file.write(txt)

if __name__ == "__main__":
    # The first command-line argument is the path of the image to OCR.
    # Exit with a usage message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} IMAGE_PATH".format(sys.argv[0]))
    main(sys.argv[1])
Add module for outputting csv from parsed table Make cell extraction a little more accurate. 5 years ago			`import os`
Add ocr_image module 5 years ago			`import sys`

			`import cv2`
			`import pytesseract`

			`def crop_to_text(image):`
Add module for outputting csv from parsed table Make cell extraction a little more accurate. 5 years ago			`MAX_COLOR_VAL = 255`
			`BLOCK_SIZE = 15`
			`SUBTRACT_FROM_MEAN = -2`

			`img_bin = cv2.adaptiveThreshold(`
			`~image,`
			`MAX_COLOR_VAL,`
			`cv2.ADAPTIVE_THRESH_MEAN_C,`
			`cv2.THRESH_BINARY,`
			`BLOCK_SIZE,`
			`SUBTRACT_FROM_MEAN,`
			`)`

			`# Get rid of little noise.`
			`kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))`
			`opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)`

			`# Dilate so each digit is connected, so we can get a bounding rectangle`
			`# around all of the digits as one contour. This will make the bounding`
			`# rectangle 8 pixels wider on the left and right, so we'll need to crop that`
			`# out at the end so that we don't pick up stray border pixels.`
			`kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))`
			`dilated = cv2.dilate(opened, kernel)`

			`contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)`
Add ocr_image module 5 years ago
			`bounding_rects = [cv2.boundingRect(c) for c in contours]`
Add module for outputting csv from parsed table Make cell extraction a little more accurate. 5 years ago
			`if bounding_rects:`
			`# The largest contour is certainly the text that we're looking for.`
			`largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])`
			`x, y, w, h = largest_rect`
			`# Commas sometimes go a little below the bounding box and we don't want`
			`# to lose them or turn them into periods.`
			`img_h, img_w = image.shape`
			`cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]`
			`else:`
			`cropped = image`
Add ocr_image module 5 years ago			`bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)`
			`return bordered`
			`def ocr_image(image, config):`
			`return pytesseract.image_to_string(`
Add module for outputting csv from parsed table Make cell extraction a little more accurate. 5 years ago			`image,`
Add ocr_image module 5 years ago			`config=config`
			`)`

			`def main(f):`
Add module for outputting csv from parsed table Make cell extraction a little more accurate. 5 years ago			`directory, filename = os.path.split(f)`
			`filename_sans_ext, ext = os.path.splitext(filename)`
Add ocr_image module 5 years ago			`image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)`
Add module for outputting csv from parsed table Make cell extraction a little more accurate. 5 years ago			`cropped = crop_to_text(image)`
			`ocr_data_dir = os.path.join(directory, "ocr_data")`
			`os.makedirs(ocr_data_dir, exist_ok=True)`
			`out_imagepath = os.path.join(ocr_data_dir, filename)`
			`out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))`
			`cv2.imwrite(out_imagepath, cropped)`
			`txt = ocr_image(cropped, "--psm 7")`
			`with open(out_txtpath, "w") as txt_file:`
			`txt_file.write(txt)`
Add ocr_image module 5 years ago
			`if __name__ == "__main__":`
			`main(sys.argv[1])`