Fix bugs and improve accuracy

Files passed to the ocr_to_csv module need to be named in a certain way.
Document that naming scheme and fix a bug: the files must be sorted
lexicographically before they are processed.

Don't dilate the characters in a cell in order to make a contiguous set
of pixels that we can find a contour around. The problem with that is
that you sometimes dilate too far, hit an image boundary, and can't
erode back in. If a cell wall border remained between the text and
the image boundary, you are now keeping that border line in the
image. (Unless you remove it some other way, which might be a valid
option in the future.) The method we're using now instead is to group
all contours together and create a bounding box around all of them. The
problem with that is that if there is any noise at all outside the text,
we grab it too. Before, we were dilating and taking the largest
contour, so we weren't including that noise. And we can't get rid of the
noise with a morphological opening because the noise is sometimes pretty
big, and opening with a larger kernel distorts the text so much that we
lose accuracy in finding those boundaries.

Also adds a shell script to simplify the plumbing of all these modules.
main
Eric Ihli 5 years ago
parent aa900de4e7
commit 54511b9a1f

@ -0,0 +1,13 @@
#!/bin/sh
# Run the whole table_ocr pipeline on a single PDF:
#   PDF -> page images -> table images -> cell images -> OCR text -> CSV.
#
# Usage: ocr_tables <pdf>
#
# Intermediate file lists are written to /tmp/pdf-images.txt,
# /tmp/extracted-tables.txt and /tmp/extracted-cells.txt.
PDF=$1
if [ -z "$PDF" ]; then
    echo "usage: $0 <pdf>" >&2
    exit 1
fi

# Each stage prints the paths it produced; grep keeps only the lines that
# name the files the next stage consumes. The dot is escaped so that only a
# literal ".png" matches.
python -m table_ocr.prepare_pdfs "$PDF" | grep '\.png' > /tmp/pdf-images.txt
xargs -I{} python -m table_ocr.extract_tables {} < /tmp/pdf-images.txt | grep table > /tmp/extracted-tables.txt
xargs -I{} python -m table_ocr.extract_cells_from_table {} < /tmp/extracted-tables.txt | grep cells > /tmp/extracted-cells.txt
xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l data-table < /tmp/extracted-cells.txt

# ocr_to_csv must be run once per table, on that table's cells directory.
# Read the table list line by line so paths containing spaces stay intact.
while IFS= read -r image; do
    dir=$(dirname "$image")
    # -exec ... {} + hands all cell text files to one ocr_to_csv invocation
    # without word-splitting their names; nothing runs if no files exist.
    # stdin is redirected so the command cannot consume the table list.
    find "$dir/cells" -name "*.txt" -exec python -m table_ocr.ocr_to_csv {} + < /dev/null
done < /tmp/extracted-tables.txt

@ -26,16 +26,33 @@ output~ to a code block will minimize that noise.
#+BEGIN_SRC shell :results none :session *Shell* #+BEGIN_SRC shell :results none :session *Shell*
TABLES=("/tmp/example-1/example-1.pdf" "/tmp/example-2/example-2.pdf") TABLES=("/tmp/example-1/example-1.pdf" "/tmp/example-2/example-2.pdf")
python -m table_ocr.prepare_pdfs $TABLES | grep .png > /tmp/pdf_images.txt python -m table_ocr.prepare_pdfs $TABLES | grep .png > /tmp/pdf-images.txt
# All pngs that don't have "table" in their name. Assume "table" has already been found for files with table in name. cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt
cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} # | grep tables > /tmp/extracted-tables.txt cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} | grep cells > /tmp/extracted-cells.txt
cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} # | grep cells > /tmp/extracted-cells.txt
cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {}
# This next one needs to be run on each subdirectory one at a time. # This next one needs to be run on each subdirectory one at a time.
python -m table_ocr.ocr_to_csv $(find . -iregex ".*cells.*ocr_data.*\.txt" 2>/dev/null) python -m table_ocr.ocr_to_csv $(find . -iregex ".*cells.*ocr_data.*\.txt" 2>/dev/null)
#+END_SRC #+END_SRC
Or, as a shell script.
#+BEGIN_SRC shell :results none :tangle ocr_tables :tangle-mode (identity #o755)
#!/bin/sh
# Run the whole table_ocr pipeline on a single PDF:
#   PDF -> page images -> table images -> cell images -> OCR text -> CSV.
#
# Usage: ocr_tables <pdf>
#
# Intermediate file lists are written to /tmp/pdf-images.txt,
# /tmp/extracted-tables.txt and /tmp/extracted-cells.txt.
PDF=$1
if [ -z "$PDF" ]; then
    echo "usage: $0 <pdf>" >&2
    exit 1
fi

# Each stage prints the paths it produced; grep keeps only the lines that
# name the files the next stage consumes. The dot is escaped so that only a
# literal ".png" matches.
python -m table_ocr.prepare_pdfs "$PDF" | grep '\.png' > /tmp/pdf-images.txt
xargs -I{} python -m table_ocr.extract_tables {} < /tmp/pdf-images.txt | grep table > /tmp/extracted-tables.txt
xargs -I{} python -m table_ocr.extract_cells_from_table {} < /tmp/extracted-tables.txt | grep cells > /tmp/extracted-cells.txt
xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l data-table < /tmp/extracted-cells.txt

# ocr_to_csv must be run once per table, on that table's cells directory.
# Read the table list line by line so paths containing spaces stay intact.
while IFS= read -r image; do
    dir=$(dirname "$image")
    # -exec ... {} + hands all cell text files to one ocr_to_csv invocation
    # without word-splitting their names; nothing runs if no files exist.
    # stdin is redirected so the command cannot consume the table list.
    find "$dir/cells" -name "*.txt" -exec python -m table_ocr.ocr_to_csv {} + < /dev/null
done < /tmp/extracted-tables.txt
#+END_SRC
* Preparing data * Preparing data
** Converting PDFs to images ** Converting PDFs to images
@ -506,30 +523,32 @@ def crop_to_text(image):
SUBTRACT_FROM_MEAN, SUBTRACT_FROM_MEAN,
) )
# Get rid of littl noise. img_h, img_w = image.shape
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel) vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
both = horizontal_lines + vertical_lines
cleaned = img_bin - both
# Dilate so each digit is connected, so we can get a bounding rectangle # Get rid of little noise.
# around all of the digits as one contour. This will make the bounding kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
# rectangle 8 pixels wider on the left and right, so we'll need to crop that opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
# out at the end so that we don't pick up stray border pixels.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
dilated = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6
if bounding_rects: if bounding_rects:
# The largest contour is certainly the text that we're looking for. minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) for x, y, w, h in bounding_rects:
x, y, w, h = largest_rect minx = min(minx, x)
# Commas sometimes go a little below the bounding box and we don't want miny = min(miny, y)
# to lost them or turn them into periods. maxx = max(maxx, x + w)
img_h, img_w = image.shape maxy = max(maxy, y + h)
cropped = image[y:min(img_h, y+h+6), x+8:x+w-8] x, y, w, h = minx, miny, maxx - minx, maxy - miny
cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else: else:
# If we morphed out all of the text, fallback to using the unmorphed image.
cropped = image cropped = image
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered return bordered
@ -549,7 +568,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
#+ATTR_HTML: :width 200px :height 100% #+ATTR_HTML: :width 200px :height 100%
[[file:resources/examples/example-table-cell-1-1-cropped.png]] [[file:resources/examples/example-table-cell-1-1-cropped.png]]
** OCR each cell ** OCR each cell
If we cleaned up the images well enough, we might get some accurate OCR! If we cleaned up the images well enough, we might get some accurate OCR!
@ -813,31 +831,44 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
: PRIZE : PRIZE
#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none #+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
import argparse
import math
import os import os
import sys import sys
import cv2 import cv2
import pytesseract import pytesseract
description="""Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract.
Attempts to make OCR more accurate by performing some modifications on the image.
Saves the modified image and the OCR text in an `ocr_data` directory.
Filenames are of the format for training with tesstrain."""
parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR")
<<crop-to-text>> <<crop-to-text>>
<<ocr-image>> <<ocr-image>>
def main(f): def main(image_file, tess_args):
directory, filename = os.path.split(f) directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename) filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
cropped = crop_to_text(image) cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data") ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True) os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename) out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped) cv2.imwrite(out_imagepath, cropped)
txt = ocr_image(cropped, "--psm 7") txt = ocr_image(cropped, " ".join(tess_args))
print(txt)
with open(out_txtpath, "w") as txt_file: with open(out_txtpath, "w") as txt_file:
txt_file.write(txt) txt_file.write(txt)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1]) args, tess_args = parser.parse_known_args()
main(args.image, tess_args)
#+END_SRC #+END_SRC
*** table_ocr/ocr_to_csv.py *** table_ocr/ocr_to_csv.py
@ -854,6 +885,13 @@ parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+") parser.add_argument("files", nargs="+")
def main(files): def main(files):
"""Files must be sorted lexicographically
Filenames must be <row>-<column>.txt.
000-000.txt
000-001.txt
001-000.txt
etc...
"""
rows = [] rows = []
for f in files: for f in files:
directory, filename = os.path.split(f) directory, filename = os.path.split(f)
@ -871,9 +909,9 @@ def main(files):
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
main(args.files) files = args.files
files.sort()
main(files)
#+END_SRC #+END_SRC
* Utils * Utils
@ -899,10 +937,6 @@ with ~advice-add~.
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]") (concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
#+END_SRC #+END_SRC
#+RESULTS: html-image-size
#+ATTR_HTML: :width 100% :height 100%
[[file:]]
#+BEGIN_SRC emacs-lisp :results none #+BEGIN_SRC emacs-lisp :results none
(defun remove-attributes-from-src-block-result (&rest args) (defun remove-attributes-from-src-block-result (&rest args)
(let ((location (org-babel-where-is-src-block-result)) (let ((location (org-babel-where-is-src-block-result))

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 KiB

After

Width:  |  Height:  |  Size: 1.0 KiB

@ -1,9 +1,20 @@
import argparse
import math
import os import os
import sys import sys
import cv2 import cv2
import pytesseract import pytesseract
description="""Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract.
Attempts to make OCR more accurate by performing some modifications on the image.
Saves the modified image and the OCR text in an `ocr_data` directory.
Filenames are of the format for training with tesstrain."""
parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR")
def crop_to_text(image): def crop_to_text(image):
MAX_COLOR_VAL = 255 MAX_COLOR_VAL = 255
BLOCK_SIZE = 15 BLOCK_SIZE = 15
@ -18,30 +29,32 @@ def crop_to_text(image):
SUBTRACT_FROM_MEAN, SUBTRACT_FROM_MEAN,
) )
# Get rid of littl noise. img_h, img_w = image.shape
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel) vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
both = horizontal_lines + vertical_lines
cleaned = img_bin - both
# Dilate so each digit is connected, so we can get a bounding rectangle # Get rid of little noise.
# around all of the digits as one contour. This will make the bounding kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
# rectangle 8 pixels wider on the left and right, so we'll need to crop that opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
# out at the end so that we don't pick up stray border pixels.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
dilated = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6
if bounding_rects: if bounding_rects:
# The largest contour is certainly the text that we're looking for. minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) for x, y, w, h in bounding_rects:
x, y, w, h = largest_rect minx = min(minx, x)
# Commas sometimes go a little below the bounding box and we don't want miny = min(miny, y)
# to lost them or turn them into periods. maxx = max(maxx, x + w)
img_h, img_w = image.shape maxy = max(maxy, y + h)
cropped = image[y:min(img_h, y+h+6), x+8:x+w-8] x, y, w, h = minx, miny, maxx - minx, maxy - miny
cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else: else:
# If we morphed out all of the text, fallback to using the unmorphed image.
cropped = image cropped = image
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered return bordered
@ -51,19 +64,21 @@ def ocr_image(image, config):
config=config config=config
) )
def main(f): def main(image_file, tess_args):
directory, filename = os.path.split(f) directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename) filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
cropped = crop_to_text(image) cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data") ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True) os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename) out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped) cv2.imwrite(out_imagepath, cropped)
txt = ocr_image(cropped, "--psm 7") txt = ocr_image(cropped, " ".join(tess_args))
print(txt)
with open(out_txtpath, "w") as txt_file: with open(out_txtpath, "w") as txt_file:
txt_file.write(txt) txt_file.write(txt)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1]) args, tess_args = parser.parse_known_args()
main(args.image, tess_args)

@ -9,6 +9,13 @@ parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+") parser.add_argument("files", nargs="+")
def main(files): def main(files):
"""Files must be sorted lexicographically
Filenames must be <row>-<column>.txt.
000-000.txt
000-001.txt
001-000.txt
etc...
"""
rows = [] rows = []
for f in files: for f in files:
directory, filename = os.path.split(f) directory, filename = os.path.split(f)
@ -26,4 +33,6 @@ def main(files):
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
main(args.files) files = args.files
files.sort()
main(files)

Loading…
Cancel
Save