From e49fffa5a72fe6801b88cfaaaca11e70d297c5f0 Mon Sep 17 00:00:00 2001
From: Eric Ihli <ericihli@gmail.com>
Date: Tue, 14 Apr 2020 10:42:58 -0700
Subject: [PATCH] Add module for outputting csv from parsed table

Make cell extraction a little more accurate.
---
 pdf_table_extraction_and_ocr.org              | 117 ++++++++++++++----
 .../example-table-cell-1-1-cropped.png        | Bin 1058 -> 1026 bytes
 table_ocr/ocr_image.py                        |  59 +++++++--
 table_ocr/ocr_to_csv.py                       |  29 +++++
 4 files changed, 169 insertions(+), 36 deletions(-)
 create mode 100644 table_ocr/ocr_to_csv.py
diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org
index b77687b..1ab9ed5 100644
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@@ -482,19 +482,48 @@ much fewer than the width of the text. If that's the case, then we can remove
 that noise with a simple open morph.
 
 Once the stray border pixels have been removed, we can expand our border using
-~openMakeBorder~.
+~copyMakeBorder~.
 
 #+BEGIN_SRC python :eval no :noweb-ref crop-to-text
 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+
+    # Dilate so each digit is connected, so we can get a bounding rectangle
+    # around all of the digits as one contour. This will make the bounding
+    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
+    # out at the end so that we don't pick up stray border pixels.
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
+    dilated = cv2.dilate(opened, kernel)
+
+    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
-    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+
+    if bounding_rects:
+        # The largest contour is certainly the text that we're looking for.
+        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+        x, y, w, h = largest_rect
+        # Commas sometimes go a little below the bounding box and we don't want
+        # to lose them or turn them into periods.
+        img_h, img_w = image.shape
+        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+    else:
+        cropped = image
     bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
     return bordered
 #+END_SRC
@@ -513,20 +542,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
 #+ATTR_HTML: :width 200px :height 100%
 [[file:resources/examples/example-table-cell-1-1-cropped.png]]
 
-#+HEADER: :post html-image-size(text=*this*, width="200px")
-#+BEGIN_SRC python :noweb no-export :results raw :exports both
-import cv2
-<<crop-to-text>>
-image = cv2.imread("/tmp/example-1/cells/001-002.png", cv2.IMREAD_GRAYSCALE)
-image = crop_to_text(image)
-cv2.imwrite("/tmp/example-1/cells/001-002-cropped.png", image)
-"/tmp/example-1/cells/001-002-cropped.png"
-#+END_SRC
-
-#+RESULTS:
-#+ATTR_HTML: :width 200px :height 100%
-[[file:/tmp/example-1/cells/001-002-cropped.png]]
-
 
 ** OCR each cell
 
@@ -543,9 +558,8 @@ period into a comma, then you might need to do some custom Tesseract training.
 
 #+BEGIN_SRC python :noweb-ref ocr-image :eval no
 def ocr_image(image, config):
-    cropped = crop_to_text(image)
     return pytesseract.image_to_string(
-        ~cropped,
+        image,
         config=config
     )
 #+END_SRC
@@ -556,6 +570,7 @@ import cv2
 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
 <<crop-to-text>>
 <<ocr-image>>
+image = crop_to_text(image)
 ocr_image(image, "--psm 7")
 #+END_SRC
 
@@ -777,6 +792,9 @@ if __name__ == "__main__":
 
 This does a little bit of cleanup before sending it through tesseract.
 
+Creates images and text files that can be used for training tesseract. See
+https://github.com/tesseract-ocr/tesstrain.
+
 #+BEGIN_SRC shell :results output
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
@@ -785,7 +803,8 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
 #+RESULTS:
 : PRIZE
 
-#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
+import os
 import sys
 
 import cv2
@@ -795,13 +814,59 @@ import pytesseract
 <<ocr-image>>
 
 def main(f):
+    directory, filename = os.path.split(f)
+    filename_sans_ext, ext = os.path.splitext(filename)
     image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
+    ocr_data_dir = os.path.join(directory, "ocr_data")
+    os.makedirs(ocr_data_dir, exist_ok=True)
+    out_imagepath = os.path.join(ocr_data_dir, filename)
+    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
+    cv2.imwrite(out_imagepath, cropped)
+    txt = ocr_image(cropped, "--psm 7")
+    with open(out_txtpath, "w") as txt_file:
+        txt_file.write(txt)
 
 if __name__ == "__main__":
     main(sys.argv[1])
 #+END_SRC
 
+*** table_ocr/ocr_to_csv.py
+
+#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
+import argparse
+import csv
+import io
+import os
+import sys
+import tempfile
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    rows = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        with open(f) as of:
+            txt = of.read()
+        row, column = map(int, filename.split(".")[0].split("-"))
+        if row == len(rows):
+            rows.append([])
+        rows[row].append(txt)
+
+    csv_file = io.StringIO()
+    writer = csv.writer(csv_file)
+    writer.writerows(rows)
+    print(csv_file.getvalue())
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
+
+
+#+END_SRC
+
 * Utils
 
 The following code lets us specify a size for images when they are exported to
diff --git a/resources/examples/example-table-cell-1-1-cropped.png b/resources/examples/example-table-cell-1-1-cropped.png
index 2ba2327016cfcc28b0fc151be47674463af6f091..4aba6ecd1931960d623cd5c138fb9ca37260c5e9 100644
GIT binary patch
literal 1026
zcmV+d1pWJoP)<h;3K|Lk000e1NJLTq001%o001Hg00000ID*yo000BYNkl<ZI1!b>
z{YzbS90&0C=X}q@J@=~j-tq~oJt(bgrRi+CwlY#i$krxow9yZvC^1-rAY_daB&x|4
ztF=~A6l_FVAl0yiIT#3Tq_LKXZf@l)ozKpPd+xdCo^w8Z@8}=6uNN`WMT~S2BVEKu
z7ctUBjFfA?)-Sp?ZSXW-fvW<Tzm&CkZg99!(~{aU4+f`hM)s>nAx28w$fTRU9K8kj
zP?obSees>s`4eH!`@P5xh^97A5l-|*04c<f-fw>J(8}$v3lCRin2zqMz!x9f+1fIC
zaN_0GdoWc$vZpk$&{le^91=04=#uX%bAQ$bw84DU6S3e-*;M$4C&ZVH+Pkmgp#{FJ
z0?j86d<2OYQpS#USiyn4(-l2Bd?l$(tPeE50bG98sXuGw`dhj+i%%VHXoExyDRbu;
z6>&43`}FrrA{N3>-FR1>N2^t{6<xFClh1$S3p9VGOhF=s6xh`P*Vm7HQrXP}z#H1O
z{;E$j`}q7bog^YT!y;iEZ&_tX#E=TYpO>5He@cJkVj`$8VPxBye!(<W?pB{eu4>+N
zRrl?a;Ymoukh15STduF|tIf%}YC`gcEAwtBjvlX?SleeDIFmDjS#@{6Z$l!6)ZR4x
zScdLh2;~p4AXfa*%E%ZoVRvtI(44QCe0XKW(I*T4fJ6*w=}@Q0Rd-$-im{^u@J7qE
zJHCjE7k(-nbWnZEiWF=a>~$ST#7McU7bhZi$E%xq43`ywi3*<bgdKSCySz~am$Dyl
ziLY$ery&s|B^H9K%#|hXzv8Np#!KvxHBhjlFY=em8ej$SR_lf#NW_rhPS^^ff3sQh
zBP%`>cl=T7+dYKwVx#nelKeNDGHh|AW8-;9#E^{pin<ihbgbZ9nd<~sGI_e#(S9pR
zbU%0Rlbv3}SNYd7J+bST>hq9@AqB&B$9C(rv#pJqdabj%L^I09{h>tY!xQGwR~BCC
zi>*#Y&b*oZ-d;$=knF9a$&f?qdb8YrHby6p?pwROuZKc`1@Gq?xKunA%9xAJ#C-!2
zF;Y&qugm;9p4SfW$0u@*Z!(or!zC+A)1wXh9X)w)pjH_l__R_rAQ40ImZQEhldT|O
zB#cGfP?y5+#A0;S6hK_)HbE7xio1v*EoWJ(Mz8}&YH(FZNe%Es+yqlBl+?iF=vELC
zG5(j-0841hs?DY!3$gH>01~<{kQBZWH<efjDa4Tcqyg9gCP;A;R?t<EFyIM*9fJug
wnH~i0B1XE1kuGASix}x5M!JZRE@GtXKb6C-5RimauK)l507*qoM6N<$f+<DtvH$=8

literal 1058
zcmV+-1l{|IP)<h;3K|Lk000e1NJLTq001@s0018d00000iH%|Z000B&Nkl<ZI1!z}
z{cBZq90&0C=leYm_hEPUUfUB`K1@=VwY9mNrnPV)AwGhU(hrj4SYTpMqM`$dO4G2+
zEKw3Bl`9yfXkrr#VM?>i*4T7fbGEtXx!v7!?>+aN^XZ&Rf5Cmdh>;CqWP=#lAVxNb
zkqu(FVj)|OPBElLredp(Na~KII|8v#T>6*>3N3}n2w*TT9)gD$E?eIIb={biq}jIW
z2v`b$k!!gNhTA&oR5h(ma9g1NUSy+!{}Cf&MQ5K?G!eZII1m%LhBo%zv7+9vZT~SV
z>61H-+d`VryFR2<c!=Touzt?A*u$@jcP`BFo0_EnCqI6)cwg7n-Zu_RLI1kW^<}Bi
z<7EenBPks&Vz|z38h0Eq{MRy{+E%34G9DN#?+^d@y!@(8egAFD9p#4#P=938M^b@@
z7_O7sSDt|pxbezBWwQoHq}AS;zWR57>o1$@P8$WzqD!j5M|aj9hpE9u4A-7dS2ZX|
z_!9*${lFySLA0;vIlscD$%R9e=Z7l#Uj5dQsQ*%2nQJL<5yN$G@0vzfj<);rs!L1&
z?Di!yZ#hIm&y6f?A{oi=kSYDpzRA*GBJdEy_07BC6BT~6zNkHYB^3}%=v*?bRWfx~
zSk>oHp!n}uinaGhc<E_^hZwFCyGH8w-I?C9EI;>_A1=G2s_?EbwZy{S=`H%EhWtSc
zDQjDuCC$hnhU?MA9RttiX!hk`Q9Daw;dE6+x`_#EuSVPaBQ<?b#bTz`SG;{QGKk?B
z-_|4xlvP*S<7{dG?5+y+ks}lGwF|{<CaUink*SN?u3ATHkwJ`%>sm=Fl5Bi)ezR_|
zkm#-Cep@DeuYX_IC2%e8NsIXAa_#p}j)038E=R<Ju*7hA>WQ0{0#{FIvU3^)R<=a`
zwpay>0QMZ1{ij3*F<b_jVIzRn1^$||EF1`{=NaR>^@N@hz3j3`|C>iSNx8dm-UUmA
zhZwGgEsxJSFXWDcQ_q!KW*`>gff7^wZC2{i>c?JavTIKj-3i%p&Ck_8IS_CW!}VZJ
zIN7+|c<c0mI#s#dR9&j-<vmV&D)`A^|Nbqbn_J?O{gH-Uc^|Bn3OvMc#ni=J>7Yq7
zuI5?~=0*GVZ<s!@p_zidQTt*IEGp>^<_t#%6ONAYFkHmQICf@6=$}O48Gt`KoPTJ(
zUmWcyjg1d<)owJk^wt~8M9+;csuVq~!b1#?JrQ-pV4grqPw8Wtu8fCa%dzOBUjhlK
zB?*eO6eOhz4>3FwxrU+=l0KwWSPEQe6|iN(52ko9t%AwajKH|&hl?25AVxNbkqu&G
cgBaQP4<y005G-;H#Q*>R07*qoM6N<$f;IpH(f|Me

diff --git a/table_ocr/ocr_image.py b/table_ocr/ocr_image.py
index b15a28c..e2c886b 100644
--- a/table_ocr/ocr_image.py
+++ b/table_ocr/ocr_image.py
@@ -1,30 +1,69 @@
+import os
 import sys
 
 import cv2
 import pytesseract
 
 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+
+    # Dilate so each digit is connected, so we can get a bounding rectangle
+    # around all of the digits as one contour. This will make the bounding
+    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
+    # out at the end so that we don't pick up stray border pixels.
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
+    dilated = cv2.dilate(opened, kernel)
+
+    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
-    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+
+    if bounding_rects:
+        # The largest contour is certainly the text that we're looking for.
+        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+        x, y, w, h = largest_rect
+        # Commas sometimes go a little below the bounding box and we don't want
+        # to lose them or turn them into periods.
+        img_h, img_w = image.shape
+        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+    else:
+        cropped = image
     bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
     return bordered
 def ocr_image(image, config):
-    cropped = crop_to_text(image)
     return pytesseract.image_to_string(
-        ~cropped,
+        image,
         config=config
     )
 
 def main(f):
+    directory, filename = os.path.split(f)
+    filename_sans_ext, ext = os.path.splitext(filename)
     image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
+    ocr_data_dir = os.path.join(directory, "ocr_data")
+    os.makedirs(ocr_data_dir, exist_ok=True)
+    out_imagepath = os.path.join(ocr_data_dir, filename)
+    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
+    cv2.imwrite(out_imagepath, cropped)
+    txt = ocr_image(cropped, "--psm 7")
+    with open(out_txtpath, "w") as txt_file:
+        txt_file.write(txt)
 
 if __name__ == "__main__":
     main(sys.argv[1])
diff --git a/table_ocr/ocr_to_csv.py b/table_ocr/ocr_to_csv.py
new file mode 100644
index 0000000..98eff38
--- /dev/null
+++ b/table_ocr/ocr_to_csv.py
@@ -0,0 +1,29 @@
+import argparse
+import csv
+import io
+import os
+import sys
+import tempfile
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    rows = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        with open(f) as of:
+            txt = of.read()
+        row, column = map(int, filename.split(".")[0].split("-"))
+        if row == len(rows):
+            rows.append([])
+        rows[row].append(txt)
+
+    csv_file = io.StringIO()
+    writer = csv.writer(csv_file)
+    writer.writerows(rows)
+    print(csv_file.getvalue())
+   
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)