Add module for outputting csv from parsed table

Make cell extraction a little more accurate.
main
Eric Ihli 4 years ago
parent de398f73c2
commit e49fffa5a7

@ -482,19 +482,48 @@ much fewer than the width of the text. If that's the case, then we can remove
that noise with a simple open morph. that noise with a simple open morph.
Once the stray border pixels have been removed, we can expand our border using Once the stray border pixels have been removed, we can expand our border using
~openMakeBorder~. ~copyMakeBorder~.
#+BEGIN_SRC python :eval no :noweb-ref crop-to-text #+BEGIN_SRC python :eval no :noweb-ref crop-to-text
def crop_to_text(image): def crop_to_text(image):
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4)) MAX_COLOR_VAL = 255
opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel) BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2
img_bin = cv2.adaptiveThreshold(
~image,
MAX_COLOR_VAL,
cv2.ADAPTIVE_THRESH_MEAN_C,
cv2.THRESH_BINARY,
BLOCK_SIZE,
SUBTRACT_FROM_MEAN,
)
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
# Dilate so each digit is connected, so we can get a bounding rectangle
# around all of the digits as one contour. This will make the bounding
# rectangle 8 pixels wider on the left and right, so we'll need to crop that
# out at the end so that we don't pick up stray border pixels.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
dilated = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
# The largest contour is certainly the text that we're looking for.
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) if bounding_rects:
x, y, w, h = largest_rect # The largest contour is certainly the text that we're looking for.
cropped = image[y:y+h, x:x+w] largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
x, y, w, h = largest_rect
# Commas sometimes go a little below the bounding box and we don't want
# to lose them or turn them into periods.
img_h, img_w = image.shape
cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
else:
cropped = image
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered return bordered
#+END_SRC #+END_SRC
@ -513,20 +542,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
#+ATTR_HTML: :width 200px :height 100% #+ATTR_HTML: :width 200px :height 100%
[[file:resources/examples/example-table-cell-1-1-cropped.png]] [[file:resources/examples/example-table-cell-1-1-cropped.png]]
#+HEADER: :post html-image-size(text=*this*, width="200px")
#+BEGIN_SRC python :noweb no-export :results raw :exports both
import cv2
<<crop-to-text>>
image = cv2.imread("/tmp/example-1/cells/001-002.png", cv2.IMREAD_GRAYSCALE)
image = crop_to_text(image)
cv2.imwrite("/tmp/example-1/cells/001-002-cropped.png", image)
"/tmp/example-1/cells/001-002-cropped.png"
#+END_SRC
#+RESULTS:
#+ATTR_HTML: :width 200px :height 100%
[[file:/tmp/example-1/cells/001-002-cropped.png]]
** OCR each cell ** OCR each cell
@ -543,9 +558,8 @@ period into a comma, then you might need to do some custom Tesseract training.
#+BEGIN_SRC python :noweb-ref ocr-image :eval no #+BEGIN_SRC python :noweb-ref ocr-image :eval no
def ocr_image(image, config): def ocr_image(image, config):
cropped = crop_to_text(image)
return pytesseract.image_to_string( return pytesseract.image_to_string(
~cropped, image,
config=config config=config
) )
#+END_SRC #+END_SRC
@ -556,6 +570,7 @@ import cv2
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
<<crop-to-text>> <<crop-to-text>>
<<ocr-image>> <<ocr-image>>
image = crop_to_text(image)
ocr_image(image, "--psm 7") ocr_image(image, "--psm 7")
#+END_SRC #+END_SRC
@ -777,6 +792,9 @@ if __name__ == "__main__":
This does a little bit of cleanup before sending it through tesseract. This does a little bit of cleanup before sending it through tesseract.
Creates images and text files that can be used for training tesseract. See
https://github.com/tesseract-ocr/tesstrain.
#+BEGIN_SRC shell :results output #+BEGIN_SRC shell :results output
. ~/.virtualenvs/lotto_odds/bin/activate . ~/.virtualenvs/lotto_odds/bin/activate
python -m table_ocr.ocr_cell resources/examples/cells/000-000.png python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
@ -785,7 +803,8 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
#+RESULTS: #+RESULTS:
: PRIZE : PRIZE
#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none #+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
import os
import sys import sys
import cv2 import cv2
@ -795,13 +814,59 @@ import pytesseract
<<ocr-image>> <<ocr-image>>
def main(f): def main(f):
directory, filename = os.path.split(f)
filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
print(ocr_image(image, "--psm 7")) cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped)
txt = ocr_image(cropped, "--psm 7")
with open(out_txtpath, "w") as txt_file:
txt_file.write(txt)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1]) main(sys.argv[1])
#+END_SRC #+END_SRC
*** table_ocr/ocr_to_csv.py
#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
import argparse
import csv
import io
import os
import sys
import tempfile
# Command-line interface: the only argument is the list of per-cell text files.
parser = argparse.ArgumentParser()
# nargs="+" requires at least one path and gathers all of them into a list.
parser.add_argument("files", nargs="+")
def main(files):
    """Assemble per-cell text files into CSV and print it to stdout.

    The part of each file's basename before the first ``.`` must be
    ``<row>-<column>`` (two integers), e.g. ``001-002.gt.txt``. The text of
    each file is appended to its row in the order the files are given; the
    column index is parsed (so malformed names still fail) but is not used
    for placement.

    Args:
        files: Iterable of paths to text files, one per table cell.

    Raises:
        ValueError: If a basename does not parse as two ``-``-separated ints.
        OSError: If a file cannot be opened or read.
    """
    rows = []
    for f in files:
        filename = os.path.basename(f)
        with open(f) as cell_file:
            txt = cell_file.read()
        row, _column = map(int, filename.split(".")[0].split("-"))
        # Grow the table until `row` is a valid index. Adding at most one row
        # per file raised IndexError whenever a row index was skipped or the
        # files arrived out of order.
        while len(rows) <= row:
            rows.append([])
        rows[row].append(txt)
    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    print(csv_file.getvalue())
if __name__ == "__main__":
    # Script entry point: parse the command line and pass the collected file
    # paths to main().
    args = parser.parse_args()
    main(args.files)
#+END_SRC
* Utils * Utils
The following code lets us specify a size for images when they are exported to The following code lets us specify a size for images when they are exported to

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 KiB

After

Width:  |  Height:  |  Size: 1.0 KiB

@ -1,30 +1,69 @@
import os
import sys import sys
import cv2 import cv2
import pytesseract import pytesseract
def crop_to_text(image): def crop_to_text(image):
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4)) MAX_COLOR_VAL = 255
opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel) BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2
img_bin = cv2.adaptiveThreshold(
~image,
MAX_COLOR_VAL,
cv2.ADAPTIVE_THRESH_MEAN_C,
cv2.THRESH_BINARY,
BLOCK_SIZE,
SUBTRACT_FROM_MEAN,
)
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
# Dilate so each digit is connected, so we can get a bounding rectangle
# around all of the digits as one contour. This will make the bounding
# rectangle 8 pixels wider on the left and right, so we'll need to crop that
# out at the end so that we don't pick up stray border pixels.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
dilated = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
# The largest contour is certainly the text that we're looking for.
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) if bounding_rects:
x, y, w, h = largest_rect # The largest contour is certainly the text that we're looking for.
cropped = image[y:y+h, x:x+w] largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
x, y, w, h = largest_rect
# Commas sometimes go a little below the bounding box and we don't want
# to lose them or turn them into periods.
img_h, img_w = image.shape
cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
else:
cropped = image
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered return bordered
def ocr_image(image, config): def ocr_image(image, config):
cropped = crop_to_text(image)
return pytesseract.image_to_string( return pytesseract.image_to_string(
~cropped, image,
config=config config=config
) )
def main(f): def main(f):
directory, filename = os.path.split(f)
filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
print(ocr_image(image, "--psm 7")) cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped)
txt = ocr_image(cropped, "--psm 7")
with open(out_txtpath, "w") as txt_file:
txt_file.write(txt)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1]) main(sys.argv[1])

@ -0,0 +1,29 @@
import argparse
import csv
import io
import os
import sys
import tempfile
# Command-line interface: the only argument is the list of per-cell text files.
parser = argparse.ArgumentParser()
# nargs="+" requires at least one path and gathers all of them into a list.
parser.add_argument("files", nargs="+")
def main(files):
    """Assemble per-cell text files into CSV and print it to stdout.

    The part of each file's basename before the first ``.`` must be
    ``<row>-<column>`` (two integers), e.g. ``001-002.gt.txt``. The text of
    each file is appended to its row in the order the files are given; the
    column index is parsed (so malformed names still fail) but is not used
    for placement.

    Args:
        files: Iterable of paths to text files, one per table cell.

    Raises:
        ValueError: If a basename does not parse as two ``-``-separated ints.
        OSError: If a file cannot be opened or read.
    """
    rows = []
    for f in files:
        filename = os.path.basename(f)
        with open(f) as cell_file:
            txt = cell_file.read()
        row, _column = map(int, filename.split(".")[0].split("-"))
        # Grow the table until `row` is a valid index. Adding at most one row
        # per file raised IndexError whenever a row index was skipped or the
        # files arrived out of order.
        while len(rows) <= row:
            rows.append([])
        rows[row].append(txt)
    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    print(csv_file.getvalue())
if __name__ == "__main__":
    # Script entry point: parse the command line and pass the collected file
    # paths to main().
    args = parser.parse_args()
    main(args.files)
Loading…
Cancel
Save