Add module for outputting csv from parsed table

Make cell extraction a little more accurate.
main
Eric Ihli 5 years ago
parent de398f73c2
commit e49fffa5a7

@ -482,19 +482,48 @@ much fewer than the width of the text. If that's the case, then we can remove
that noise with a simple open morph.
Once the stray border pixels have been removed, we can expand our border using
~openMakeBorder~.
~copyMakeBorder~.
#+BEGIN_SRC python :eval no :noweb-ref crop-to-text
def crop_to_text(image):
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2
img_bin = cv2.adaptiveThreshold(
~image,
MAX_COLOR_VAL,
cv2.ADAPTIVE_THRESH_MEAN_C,
cv2.THRESH_BINARY,
BLOCK_SIZE,
SUBTRACT_FROM_MEAN,
)
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
# Dilate so each digit is connected, so we can get a bounding rectangle
# around all of the digits as one contour. This will make the bounding
# rectangle 8 pixels wider on the left and right, so we'll need to crop that
# out at the end so that we don't pick up stray border pixels.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
dilated = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours]
if bounding_rects:
# The largest contour is certainly the text that we're looking for.
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
x, y, w, h = largest_rect
cropped = image[y:y+h, x:x+w]
# Commas sometimes go a little below the bounding box and we don't want
# to lose them or turn them into periods.
img_h, img_w = image.shape
cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
else:
cropped = image
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered
#+END_SRC
@ -513,20 +542,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
#+ATTR_HTML: :width 200px :height 100%
[[file:resources/examples/example-table-cell-1-1-cropped.png]]
#+HEADER: :post html-image-size(text=*this*, width="200px")
#+BEGIN_SRC python :noweb no-export :results raw :exports both
import cv2
<<crop-to-text>>
image = cv2.imread("/tmp/example-1/cells/001-002.png", cv2.IMREAD_GRAYSCALE)
image = crop_to_text(image)
cv2.imwrite("/tmp/example-1/cells/001-002-cropped.png", image)
"/tmp/example-1/cells/001-002-cropped.png"
#+END_SRC
#+RESULTS:
#+ATTR_HTML: :width 200px :height 100%
[[file:/tmp/example-1/cells/001-002-cropped.png]]
** OCR each cell
@ -543,9 +558,8 @@ period into a comma, then you might need to do some custom Tesseract training.
#+BEGIN_SRC python :noweb-ref ocr-image :eval no
def ocr_image(image, config):
cropped = crop_to_text(image)
return pytesseract.image_to_string(
~cropped,
image,
config=config
)
#+END_SRC
@ -556,6 +570,7 @@ import cv2
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
<<crop-to-text>>
<<ocr-image>>
image = crop_to_text(image)
ocr_image(image, "--psm 7")
#+END_SRC
@ -777,6 +792,9 @@ if __name__ == "__main__":
This does a little bit of cleanup before sending it through tesseract.
Creates images and text files that can be used for training tesseract. See
https://github.com/tesseract-ocr/tesstrain.
#+BEGIN_SRC shell :results output
. ~/.virtualenvs/lotto_odds/bin/activate
python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
@ -785,7 +803,8 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
#+RESULTS:
: PRIZE
#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none
#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
import os
import sys
import cv2
@ -795,13 +814,59 @@ import pytesseract
<<ocr-image>>
def main(f):
directory, filename = os.path.split(f)
filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
print(ocr_image(image, "--psm 7"))
cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped)
txt = ocr_image(cropped, "--psm 7")
with open(out_txtpath, "w") as txt_file:
txt_file.write(txt)
if __name__ == "__main__":
main(sys.argv[1])
#+END_SRC
*** table_ocr/ocr_to_csv.py
#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
import argparse
import csv
import io
import os
import sys
import tempfile
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    """Print the contents of per-cell text files as CSV on standard output.

    Each path in ``files`` must have a basename of the form
    ``<row>-<column>[.<ext>...]`` (for example ``001-002.gt.txt``), where
    both parts parse as integers. The whole content of the file becomes the
    value of that cell.

    Cells are placed by their (row, column) indices rather than by the order
    of ``files``, so the output does not depend on how the arguments were
    ordered. Rows and columns that have no file are emitted as empty
    rows/cells so that the remaining cells keep their position.

    Raises:
        ValueError: if a basename does not match the expected pattern.
    """
    cells = []
    for f in files:
        filename = os.path.basename(f)
        with open(f) as of:
            txt = of.read()
        row, column = map(int, filename.split(".")[0].split("-"))
        cells.append((row, column, txt))
    # list.sort is stable, so files that map to the same cell keep the
    # relative order in which they were given.
    cells.sort(key=lambda cell: cell[:2])

    rows = []
    for row, column, txt in cells:
        # Pad skipped rows/columns instead of indexing past the end.
        while len(rows) <= row:
            rows.append([])
        current_row = rows[row]
        while len(current_row) < column:
            current_row.append("")
        current_row.append(txt)

    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    print(csv_file.getvalue())


if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)
#+END_SRC
* Utils
The following code lets us specify a size for images when they are exported to

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 KiB

After

Width:  |  Height:  |  Size: 1.0 KiB

@ -1,30 +1,69 @@
import os
import sys
import cv2
import pytesseract
def crop_to_text(image):
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2
img_bin = cv2.adaptiveThreshold(
~image,
MAX_COLOR_VAL,
cv2.ADAPTIVE_THRESH_MEAN_C,
cv2.THRESH_BINARY,
BLOCK_SIZE,
SUBTRACT_FROM_MEAN,
)
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
# Dilate so each digit is connected, so we can get a bounding rectangle
# around all of the digits as one contour. This will make the bounding
# rectangle 8 pixels wider on the left and right, so we'll need to crop that
# out at the end so that we don't pick up stray border pixels.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
dilated = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours]
if bounding_rects:
# The largest contour is certainly the text that we're looking for.
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
x, y, w, h = largest_rect
cropped = image[y:y+h, x:x+w]
# Commas sometimes go a little below the bounding box and we don't want
# to lose them or turn them into periods.
img_h, img_w = image.shape
cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
else:
cropped = image
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered
def ocr_image(image, config):
cropped = crop_to_text(image)
return pytesseract.image_to_string(
~cropped,
image,
config=config
)
def main(f):
directory, filename = os.path.split(f)
filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
print(ocr_image(image, "--psm 7"))
cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped)
txt = ocr_image(cropped, "--psm 7")
with open(out_txtpath, "w") as txt_file:
txt_file.write(txt)
if __name__ == "__main__":
main(sys.argv[1])

@ -0,0 +1,29 @@
import argparse
import csv
import io
import os
import sys
import tempfile
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    """Print the contents of per-cell text files as CSV on standard output.

    Each path in ``files`` must have a basename of the form
    ``<row>-<column>[.<ext>...]`` (for example ``001-002.gt.txt``), where
    both parts parse as integers. The whole content of the file becomes the
    value of that cell.

    Cells are placed by their (row, column) indices rather than by the order
    of ``files``, so the output does not depend on how the arguments were
    ordered. Rows and columns that have no file are emitted as empty
    rows/cells so that the remaining cells keep their position.

    Raises:
        ValueError: if a basename does not match the expected pattern.
    """
    cells = []
    for f in files:
        filename = os.path.basename(f)
        with open(f) as of:
            txt = of.read()
        row, column = map(int, filename.split(".")[0].split("-"))
        cells.append((row, column, txt))
    # list.sort is stable, so files that map to the same cell keep the
    # relative order in which they were given.
    cells.sort(key=lambda cell: cell[:2])

    rows = []
    for row, column, txt in cells:
        # Pad skipped rows/columns instead of indexing past the end.
        while len(rows) <= row:
            rows.append([])
        current_row = rows[row]
        while len(current_row) < column:
            current_row.append("")
        current_row.append(txt)

    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    print(csv_file.getvalue())


if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)
Loading…
Cancel
Save