Add example image and csv output

Give more code blocks names
main
Eric Ihli 5 years ago
parent 4eca593944
commit 6891fc9990

@ -22,6 +22,34 @@ output~ to a code block will minimize that noise.
* Overview * Overview
This Python package provides utilities for extracting tabular data from PDF
files and images of tables.
Given an image that contains a table...
#+ATTR_HTML: :width 25%
[[file:resources/examples/example-page.png]]
Extract the text into a CSV format...
#+BEGIN_EXAMPLE
PRIZE,ODDS 1 IN:,# OF WINNERS*
$3,9.09,"282,447"
$5,16.66,"154,097"
$7,40.01,"64,169"
$10,26.67,"96,283"
$20,100.00,"25,677"
$30,290.83,"8,829"
$50,239.66,"10,714"
$100,919.66,"2,792"
$500,"6,652.07",386
"$40,000","855,899.99",3
1,i223,
Toa,,
,,
,,"* Based upon 2,567,700"
#+END_EXAMPLE
** To get CSV data from a table in a scanned pdf document: ** To get CSV data from a table in a scanned pdf document:
#+BEGIN_SRC shell :results none :session *Shell* #+BEGIN_SRC shell :results none :session *Shell*
@ -367,7 +395,8 @@ header bar or something. If we know our cells are all within a certain size (by
area of pixels) then we can filter out the junk cells by removing cells area of pixels) then we can filter out the junk cells by removing cells
above/below certain sizes. above/below certain sizes.
#+BEGIN_SRC python :noweb-ref bounding-rects :results none #+NAME: bounding-rects
#+BEGIN_SRC python :results none
contours, heirarchy = cv2.findContours( contours, heirarchy = cv2.findContours(
mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE, mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
) )
@ -409,7 +438,8 @@ of the rectangles that have a center that is within the top-y and bottom-y
values of that top-left rectangle. Then we'll sort those rectangles by the x values of that top-left rectangle. Then we'll sort those rectangles by the x
value of their center. We'll remove those rectangles from the list and repeat. value of their center. We'll remove those rectangles from the list and repeat.
#+BEGIN_SRC python :noweb-ref sort-contours :results none #+NAME: sort-contours
#+BEGIN_SRC python :results none
def cell_in_same_row(c1, c2): def cell_in_same_row(c1, c2):
c1_center = c1[1] + c1[3] - c1[3] / 2 c1_center = c1[1] + c1[3] - c1[3] / 2
c2_bottom = c2[1] + c2[3] c2_bottom = c2[1] + c2[3]
@ -479,7 +509,8 @@ cv2.imwrite("resources/examples/example-table-cells-numbered.png", image)
#+ATTR_HTML: :width 500px :height 100% #+ATTR_HTML: :width 500px :height 100%
[[file:resources/examples/example-table-cells-numbered.png]] [[file:resources/examples/example-table-cells-numbered.png]]
#+BEGIN_SRC python :noweb-ref extract-cells-from-table :noweb yes :eval no #+NAME: extract-cells-from-table
#+BEGIN_SRC python :noweb yes :eval no
def extract_cell_images_from_table(image): def extract_cell_images_from_table(image):
<<blur>> <<blur>>
<<threshold>> <<threshold>>
@ -547,14 +578,16 @@ def crop_to_text(image):
# Get rid of little noise. # Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel) opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
opened = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6 NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9 MIN_CHAR_AREA = 5 * 9
if bounding_rects: char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
if char_sized_bounding_rects:
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]: for x, y, w, h in char_sized_bounding_rects:
minx = min(minx, x) minx = min(minx, x)
miny = min(miny, y) miny = min(miny, y)
maxx = max(maxx, x + w) maxx = max(maxx, x + w)
@ -562,8 +595,8 @@ def crop_to_text(image):
x, y, w, h = minx, miny, maxx - minx, maxy - miny x, y, w, h = minx, miny, maxx - minx, maxy - miny
cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)] cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else: else:
# If we morphed out all of the text, fallback to using the unmorphed image. # If we morphed out all of the text, assume an empty image.
cropped = image cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered return bordered
#+END_SRC #+END_SRC
@ -571,6 +604,7 @@ def crop_to_text(image):
#+HEADER: :post html-image-size(text=*this*, width="200px") #+HEADER: :post html-image-size(text=*this*, width="200px")
#+BEGIN_SRC python :noweb no-export :results raw :exports both #+BEGIN_SRC python :noweb no-export :results raw :exports both
import cv2 import cv2
import numpy as np
<<crop-to-text>> <<crop-to-text>>
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
image = crop_to_text(image) image = crop_to_text(image)
@ -606,6 +640,7 @@ def ocr_image(image, config):
#+BEGIN_SRC python :noweb no-export :exports both #+BEGIN_SRC python :noweb no-export :exports both
import pytesseract import pytesseract
import cv2 import cv2
import numpy as np
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
<<crop-to-text>> <<crop-to-text>>
<<ocr-image>> <<ocr-image>>
@ -884,6 +919,8 @@ if __name__ == "__main__":
import math import math
import cv2 import cv2
import numpy as np
import pytesseract
<<crop-to-text>> <<crop-to-text>>
<<ocr-image>> <<ocr-image>>
@ -902,7 +939,8 @@ import os
import sys import sys
import cv2 import cv2
import pytesseract
from table_ocr.ocr_image import crop_to_text, ocr_image
description="""Takes a single argument that is the image to OCR. description="""Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract. Remaining arguments are passed directly to Tesseract.
@ -913,9 +951,6 @@ Filenames are of the format for training with tesstrain."""
parser = argparse.ArgumentParser(description=description) parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR") parser.add_argument("image", help="filepath of image to perform OCR")
<<crop-to-text>>
<<ocr-image>>
def main(image_file, tess_args): def main(image_file, tess_args):
directory, filename = os.path.split(image_file) directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename) filename_sans_ext, ext = os.path.splitext(filename)

@ -1,6 +1,8 @@
import math import math
import cv2 import cv2
import numpy as np
import pytesseract
def crop_to_text(image): def crop_to_text(image):
MAX_COLOR_VAL = 255 MAX_COLOR_VAL = 255
@ -27,14 +29,16 @@ def crop_to_text(image):
# Get rid of little noise. # Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel) opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
opened = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6 NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9 MIN_CHAR_AREA = 5 * 9
if bounding_rects: char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
if char_sized_bounding_rects:
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]: for x, y, w, h in char_sized_bounding_rects:
minx = min(minx, x) minx = min(minx, x)
miny = min(miny, y) miny = min(miny, y)
maxx = max(maxx, x + w) maxx = max(maxx, x + w)
@ -42,8 +46,8 @@ def crop_to_text(image):
x, y, w, h = minx, miny, maxx - minx, maxy - miny x, y, w, h = minx, miny, maxx - minx, maxy - miny
cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)] cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else: else:
# If we morphed out all of the text, fallback to using the unmorphed image. # If we morphed out all of the text, assume an empty image.
cropped = image cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered return bordered
def ocr_image(image, config): def ocr_image(image, config):

@ -4,7 +4,8 @@ import os
import sys import sys
import cv2 import cv2
import pytesseract
from table_ocr.ocr_image import crop_to_text, ocr_image
description="""Takes a single argument that is the image to OCR. description="""Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract. Remaining arguments are passed directly to Tesseract.
@ -15,56 +16,6 @@ Filenames are of the format for training with tesstrain."""
parser = argparse.ArgumentParser(description=description) parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR") parser.add_argument("image", help="filepath of image to perform OCR")
def crop_to_text(image):
    """Crop a table-cell image down to the region that contains text.

    The image is inverted and adaptively thresholded, long horizontal and
    vertical runs (cell borders) are subtracted, and small specks are removed
    with a morphological open. The bounding boxes of the remaining contours
    that are larger than a minimum character area are merged into one box,
    and the original image is cropped to it.

    Args:
        image: 2-D array (``image.shape`` is unpacked as ``(height, width)``),
            e.g. an image loaded with ``cv2.IMREAD_GRAYSCALE``.

    Returns:
        The cropped image surrounded by a 5-pixel border of value 255. If no
        character-sized region is found, the whole unmorphed image is
        bordered and returned instead.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    # Kernels spanning half the width / 70% of the height only survive an
    # "open" on long straight runs, i.e. the lines bordering the cell.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    both = horizontal_lines + vertical_lines
    cleaned = img_bin - both

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]

    # Extra rows kept below the text box; presumably so a trailing comma
    # hanging below the baseline is not cut off -- TODO confirm.
    NUM_PX_COMMA = 6
    MIN_CHAR_AREA = 5 * 9

    # Filter BEFORE deciding which branch to take. Previously the branch was
    # chosen on the unfiltered list, so an image whose contours were all
    # smaller than MIN_CHAR_AREA left the minimums at math.inf and the slice
    # below raised TypeError instead of reaching the fallback.
    char_sized_bounding_rects = [
        (x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA
    ]

    if char_sized_bounding_rects:
        # Union of all character-sized boxes.
        minx = min(x for x, _, _, _ in char_sized_bounding_rects)
        miny = min(y for _, y, _, _ in char_sized_bounding_rects)
        maxx = max(x + w for x, _, w, _ in char_sized_bounding_rects)
        maxy = max(y + h for _, y, _, h in char_sized_bounding_rects)
        cropped = image[
            miny:min(img_h, maxy + NUM_PX_COMMA),
            minx:min(img_w, maxx),
        ]
    else:
        # If we morphed out all of the text, fallback to using the unmorphed image.
        cropped = image

    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
def ocr_image(image, config):
    """Run Tesseract on ``image`` and return what it recognized.

    Args:
        image: Image to OCR, handed unchanged to
            ``pytesseract.image_to_string``.
        config: Forwarded as the ``config`` keyword argument of
            ``pytesseract.image_to_string``.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    recognized = pytesseract.image_to_string(image, config=config)
    return recognized
def main(image_file, tess_args): def main(image_file, tess_args):
directory, filename = os.path.split(image_file) directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename) filename_sans_ext, ext = os.path.splitext(filename)

Loading…
Cancel
Save