From 54511b9a1f91503879f7c97e3a086f544e0ddd35 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Thu, 23 Apr 2020 20:03:43 -0700 Subject: [PATCH] Fix bugs and improve accuracy Files in the ocr_to_csv module need to be named in a certain way. Specify that and fix a bug: the files need to be sorted lexicographically. Don't dilate the characters in a cell in order to make a contiguous set of pixels that we can find a contour around. The problem with that is that you sometimes dilate too far and hit an image boundary and can't erode back in. If a cell wall border was remaining between the text and the image boundary, then you end up keeping that border line in the image. (Unless you remove it some other way, so that might be a valid option in the future.) The method we're using now instead is to group all contours together and create a bounding box around all of them. The problem with that is that if there is any noise at all outside the text, we're grabbing it. Before, we were dilating and taking the largest contour, so we weren't including that noise. And we can't get rid of the noise with an opening morph, because it's sometimes pretty big noise and a larger opening kernel distorts the text so much that we lose accuracy in finding those boundaries. Also adds a shell script to simplify the plumbing of all these modules. 
--- ocr_tables | 13 +++ pdf_table_extraction_and_ocr.org | 106 ++++++++++++------ .../example-table-cell-1-1-cropped.png | Bin 1026 -> 1067 bytes table_ocr/ocr_image.py | 63 +++++++---- table_ocr/ocr_to_csv.py | 11 +- 5 files changed, 132 insertions(+), 61 deletions(-) create mode 100755 ocr_tables diff --git a/ocr_tables b/ocr_tables new file mode 100755 index 0000000..d8c3217 --- /dev/null +++ b/ocr_tables @@ -0,0 +1,13 @@ +#!/bin/sh + +PDF=$1 + +python -m table_ocr.prepare_pdfs $PDF | grep .png > /tmp/pdf-images.txt +cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt +cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} | grep cells > /tmp/extracted-cells.txt +cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l data-table + +for image in $(cat /tmp/extracted-tables.txt); do + dir=$(dirname $image) + python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt") +done diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index 6004b31..04007c0 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -26,16 +26,33 @@ output~ to a code block will minimize that noise. #+BEGIN_SRC shell :results none :session *Shell* TABLES=("/tmp/example-1/example-1.pdf" "/tmp/example-2/example-2.pdf") -python -m table_ocr.prepare_pdfs $TABLES | grep .png > /tmp/pdf_images.txt -# All pngs that don't have "table" in their name. Assume "table" has already been found for files with table in name. 
-cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} # | grep tables > /tmp/extracted-tables.txt -cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} # | grep cells > /tmp/extracted-cells.txt +python -m table_ocr.prepare_pdfs $TABLES | grep .png > /tmp/pdf-images.txt +cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt +cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} | grep cells > /tmp/extracted-cells.txt cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} # This next one needs to be run on each subdirectory one at a time. python -m table_ocr.ocr_to_csv $(find . -iregex ".*cells.*ocr_data.*\.txt" 2>/dev/null) #+END_SRC +Or, as a shell script. + +#+BEGIN_SRC shell :results none :tangle ocr_tables :tangle-mode (identity #o755) +#!/bin/sh + +PDF=$1 + +python -m table_ocr.prepare_pdfs $PDF | grep .png > /tmp/pdf-images.txt +cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt +cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} | grep cells > /tmp/extracted-cells.txt +cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l data-table + +for image in $(cat /tmp/extracted-tables.txt); do + dir=$(dirname $image) + python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt") +done +#+END_SRC + * Preparing data ** Converting PDFs to images @@ -506,30 +523,32 @@ def crop_to_text(image): SUBTRACT_FROM_MEAN, ) - # Get rid of littl noise. 
- kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) - opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel) + img_h, img_w = image.shape + horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1)) + vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7))) + horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel) + vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel) + both = horizontal_lines + vertical_lines + cleaned = img_bin - both - # Dilate so each digit is connected, so we can get a bounding rectangle - # around all of the digits as one contour. This will make the bounding - # rectangle 8 pixels wider on the left and right, so we'll need to crop that - # out at the end so that we don't pick up stray border pixels. - kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1)) - dilated = cv2.dilate(opened, kernel) - - contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + # Get rid of little noise. + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) + opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel) + contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) bounding_rects = [cv2.boundingRect(c) for c in contours] - + NUM_PX_COMMA = 6 if bounding_rects: - # The largest contour is certainly the text that we're looking for. - largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) - x, y, w, h = largest_rect - # Commas sometimes go a little below the bounding box and we don't want - # to lost them or turn them into periods. 
- img_h, img_w = image.shape - cropped = image[y:min(img_h, y+h+6), x+8:x+w-8] + minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 + for x, y, w, h in bounding_rects: + minx = min(minx, x) + miny = min(miny, y) + maxx = max(maxx, x + w) + maxy = max(maxy, y + h) + x, y, w, h = minx, miny, maxx - minx, maxy - miny + cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)] else: + # If we morphed out all of the text, fallback to using the unmorphed image. cropped = image bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) return bordered @@ -549,7 +568,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image) #+ATTR_HTML: :width 200px :height 100% [[file:resources/examples/example-table-cell-1-1-cropped.png]] - ** OCR each cell If we cleaned up the images well enough, we might get some accurate OCR! @@ -813,31 +831,44 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png : PRIZE #+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none +import argparse +import math import os import sys import cv2 import pytesseract +description="""Takes a single argument that is the image to OCR. +Remaining arguments are passed directly to Tesseract. + +Attempts to make OCR more accurate by performing some modifications on the image. +Saves the modified image and the OCR text in an `ocr_data` directory. 
+Filenames are of the format for training with tesstrain.""" +parser = argparse.ArgumentParser(description=description) +parser.add_argument("image", help="filepath of image to perform OCR") + <> <> -def main(f): - directory, filename = os.path.split(f) +def main(image_file, tess_args): + directory, filename = os.path.split(image_file) filename_sans_ext, ext = os.path.splitext(filename) - image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) + image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE) cropped = crop_to_text(image) ocr_data_dir = os.path.join(directory, "ocr_data") os.makedirs(ocr_data_dir, exist_ok=True) out_imagepath = os.path.join(ocr_data_dir, filename) out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) cv2.imwrite(out_imagepath, cropped) - txt = ocr_image(cropped, "--psm 7") + txt = ocr_image(cropped, " ".join(tess_args)) + print(txt) with open(out_txtpath, "w") as txt_file: txt_file.write(txt) if __name__ == "__main__": - main(sys.argv[1]) + args, tess_args = parser.parse_known_args() + main(args.image, tess_args) #+END_SRC *** table_ocr/ocr_to_csv.py @@ -854,6 +885,13 @@ parser = argparse.ArgumentParser() parser.add_argument("files", nargs="+") def main(files): + """Files must be sorted lexicographically + Filenames must be -.txt. + 000-000.txt + 000-001.txt + 001-000.txt + etc... + """ rows = [] for f in files: directory, filename = os.path.split(f) @@ -871,9 +909,9 @@ def main(files): if __name__ == "__main__": args = parser.parse_args() - main(args.files) - - + files = args.files + files.sort() + main(files) #+END_SRC * Utils @@ -899,10 +937,6 @@ with ~advice-add~. 
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]") #+END_SRC -#+RESULTS: html-image-size -#+ATTR_HTML: :width 100% :height 100% -[[file:]] - #+BEGIN_SRC emacs-lisp :results none (defun remove-attributes-from-src-block-result (&rest args) (let ((location (org-babel-where-is-src-block-result)) diff --git a/resources/examples/example-table-cell-1-1-cropped.png b/resources/examples/example-table-cell-1-1-cropped.png index 4aba6ecd1931960d623cd5c138fb9ca37260c5e9..5bbcbe9cfd30cfc8df5dff8a854d8781bc7bbc8c 100644 GIT binary patch literal 1067 zcmV+`1l0S9P)Nkl z|7+BB90%~{>+}A6xzF9UyQ^&uR!%3W%i7vBr)e!zB!*wXNa+Vjau}Exl&I)nM5WEJ z%&eg#%)V>|qZCbS!a$hPEL&@AIxU@T?)$gfeeUjapU>yj`}QB)<3Wt{5hH!XNFOoM zM~w6lLvqFK-!_jsahmUHo&rY&FnT>}=}1S{E=@~nQ{3SnxEI={Vg@m0RCf0}c~jvB zfCptN%hJc+Kb6-PblpGa#eHI5ovR2teAkB=#7Oz5dcmH^qi+lMFUv4(-BE!vpG>T% z>Dk@)=F!J6u(^9nNn)(7q_!{wi5SxP?USCTjQqXEr*-71u88`FN(X{JzaYNerG4-Y z7LM^F6{tS3{bK_XF{Cql*EPWM-+Xm&NwW@5Nosv_ebw&*H(s$fpR;nkiX1cf?5;3Ij-V2^CGtu^v{!B~4uP~u| zVWZ-K70O@Uam_hd1&J6lZshNzTl;-p;>w1{Uu<%#&gR|8bj8MBD}F%=F{FnJ zg7L<+*4yWf?$XrTO%=tOQQGTuCIX)xH;?Qb+tC`G9thPR$o_DH0f`t=L|f654A?a1 zT9)&0QMmue)`F?6%@pvBxtCYNp`!bNjFIqA%rlTej2WjI=4AdGo7DjD=f`u7EjE>t zUB!{f!JevZww~O5bB)q_^UHG8KngKZ+^Mjq3}q`w7ztxsH`K`>TrnP=HU$t9dYqsN zM@8HRi5MwUS(d61;yxrbI4Y#12Dl<-f+-qEYGAT;%a0UdNE0CdNe!@s#;nHK^r8V4 zp6x?I_XLu{Q(~qP4MZ{_5hKM*8bI8~1Sw|1@;fRL23!HKZ7?B{=|aHPArT{e#7G}8 l(npN+5hH!XNFOoM_dl9Qy%42^wD14`002ovPDHLkV1g%@3ON7( literal 1026 zcmV+d1pWJoP) z{YzbS90&0C=X}q@J@=~j-tq~oJt(bgrRi+CwlY#i$krxow9yZvC^1-rAY_daB&x|4 ztF=~A6l_FVAl0yiIT#3Tq_LKXZf@l)ozKpPd+xdCo^w8Z@8}=6uNN`WMT~S2BVEKu z7ctUBjFfA?)-Sp?ZSXW-fvWnAx28w$fTRU9K8kj zP?obSees>s`4eH!`@P5xh^97A5l-|*04cJ(8}$v3lCRin2zqMz!x9f+1fIC zaN_0GdoWc$vZpk$&{le^91=04=#uX%bAQ$bw84DU6S3e-*;M$4C&ZVH+Pkmgp#{FJ z0?j86d<2OYQpS#USiyn4(-l2Bd?l$(tPeE50bG98sXuGw`dhj+i%%VHXoExyDRbu; z6>&43`}FrrA{N3>-FR1>N2^t{65He@cJkVj`$8VPxBye!(+N zRrl?a;Ymoukh15STduF|tIf%}YC`gcEAwtBjvlX?SleeDIFmDjS#@{6Z$l!6)ZR4x 
zScdLh2;~p4AXfa*%E%ZoVRvtI(44QCe0XKW(I*T4fJ6*w=}@Q0Rd-$-im{^u@J7qE zJHCjE7k(-nbWnZEiWF=a>~$ST#7McU7bhZi$E%xq43`ywi3*cl=T7+dYKwVx#nelKeNDGHh|AW8-;9#E^{pinhq9@AqB&B$9C(rv#pJqdabj%L^I09{h>tY!xQGwR~BCC zi>*#Y&b*oZ-d;$=knF9a$&f?qdb8YrHby6p?pwROuZKc`1@Gq?xKunA%9xAJ#C-!2 zF;Y&qugm;9p4SfW$0u@*Z!(or!zC+A)1wXh9X)w)pjH_l__R_rAQ40ImZQEhldT|O zB#cGfP?y5+#A0;S6hK_)HbE7xio1v*EoWJ(Mz8}&YH(FZNe%Es+yqlBl+?iF=vELC zG5(j-0841hs?DY!3$gH>01~<{kQBZWH-.txt. + 000-000.txt + 000-001.txt + 001-000.txt + etc... + """ rows = [] for f in files: directory, filename = os.path.split(f) @@ -26,4 +33,6 @@ def main(files): if __name__ == "__main__": args = parser.parse_args() - main(args.files) + files = args.files + files.sort() + main(files)