Refactor extract_cells into module

main
Eric Ihli 5 years ago
parent b9f088cf92
commit b911f87126

@@ -3,7 +3,7 @@
#+TITLE: PDF Parsing
#+PROPERTY: header-args :session *Python*
#+STARTUP: inlineimages
-#+OPTIONS: ^:nil
+#+OPTIONS: ^:nil H:4
#+BEGIN_COMMENT
Some notes about the header for those not familiar with Org Mode:
@@ -659,13 +659,13 @@ setuptools.setup(
** table_ocr
*** table_ocr/__init__.py
-#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :results none :exports none
#+END_SRC
*** table_ocr/util.py
-#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/util.py :results none
from contextlib import contextmanager
import functools
import logging
@@ -687,11 +687,12 @@ def working_dir(directory):
def make_tempdir(identifier):
    return tempfile.mkdtemp(prefix="{}_".format(identifier))
#+END_SRC
*** table_ocr/pdf_to_images/
**** table_ocr/pdf_to_images/__init__.py
#+NAME: pdf_to_images/__init__.py
-#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
+#+HEADER: :tangle table_ocr/pdf_to_images/__init__.py
-#+BEGIN_SRC python :noweb strip-export :results none
+#+BEGIN_SRC python :results none
import os
import re
import subprocess
@@ -764,14 +765,22 @@ import cv2
**** table_ocr/extract_tables/__main__.py
-#+BEGIN_SRC shell
-. ~/.virtualenvs/lotto_odds/bin/activate
-python -m pdf.extract_tables "resources/examples/example-page.png"
-#+END_SRC
-#+RESULTS:
-| resources/examples/example-page.png |
-| resources/examples/example-page-table-000.png |
+Takes one or more image paths as arguments.
+
+Images are opened and read with OpenCV.
+
+Tables are detected and extracted to a new subdirectory of the given image. The
+subdirectory will be the filename sans the extension. The tables inside that
+directory will be named ~table-000.png~.
+
+If you want to do something with the output, like pipe the paths of the
+extracted tables into some other utility, here is a description of the output.
+
+For each image path given as an argument, it outputs:
+1. The given image path
+2. The paths of the extracted tables, separated by newlines
+3. An empty line
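
The output format described in that list is easy to consume from another script. Below is a minimal sketch, not part of the tangled package (the function name ~group_table_paths~ is made up for illustration), that reads the output from stdin and groups the extracted table paths by their source image.

#+BEGIN_SRC python
import sys

def group_table_paths(lines):
    """Group extracted table paths under the image they came from."""
    groups = {}
    block = []
    for line in lines:
        line = line.strip()
        if line:
            block.append(line)
        elif block:
            # The first line of each block is the source image; the rest are tables.
            groups[block[0]] = block[1:]
            block = []
    if block:
        groups[block[0]] = block[1:]
    return groups

if __name__ == "__main__":
    for image, tables in group_table_paths(sys.stdin).items():
        print("{}: {} table(s)".format(image, len(tables)))
#+END_SRC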
#+NAME: extract_tables/__main__.py
#+BEGIN_SRC python :tangle table_ocr/extract_tables/__main__.py :results none
@@ -816,19 +825,25 @@ if __name__ == "__main__":
    main(files)
#+END_SRC
-*** table_ocr/extract_cells_from_table.py
+*** table_ocr/extract_cells/
+**** table_ocr/extract_cells/__init__.py
-#+BEGIN_SRC shell :results none
-. ~/.virtualenvs/lotto_odds/bin/activate
-python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
+#+BEGIN_SRC python :tangle table_ocr/extract_cells/__init__.py
+import cv2
+<<extract-cells-from-table>>
#+END_SRC
-#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none
+**** table_ocr/extract_cells/__main__.py
+#+BEGIN_SRC python :tangle table_ocr/extract_cells/__main__.py :results none
import os
import sys
import cv2
-import pytesseract
+from table_ocr.extract_cells import extract_cell_images_from_table
def main(f):
    results = []

@@ -0,0 +1,97 @@
import cv2


def extract_cell_images_from_table(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    # Invert (~blurred) so lines and text are white on black, then binarize.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    # Morphological opening with long, thin kernels isolates the table's
    # horizontal and vertical grid lines.
    vertical = horizontal = img_bin.copy()
    SCALE = 5
    # NumPy arrays are indexed (row, column), so shape is (height, width).
    image_height, image_width = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    # Dilate the line masks so slightly broken grid lines still connect.
    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )

    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.05 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]

    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    bounding_rects = [b for b in bounding_rects if b is not largest_rect]
    cells = [c for c in bounding_rects]

    def cell_in_same_row(c1, c2):
        # A cell is in the same row as another if its vertical center falls
        # between the other cell's top and bottom edges.
        c1_center = c1[1] + c1[3] - c1[3] / 2
        c2_bottom = c2[1] + c2[3]
        c2_top = c2[1]
        return c2_top < c1_center < c2_bottom

    # Greedily group cells into rows, sorting each row left to right.
    rows = []
    while cells:
        first = cells[0]
        rest = cells[1:]
        cells_in_same_row = sorted(
            [
                c for c in rest
                if cell_in_same_row(c, first)
            ],
            key=lambda c: c[0]
        )
        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
        rows.append(row_cells)
        cells = [
            c for c in rest
            if not cell_in_same_row(c, first)
        ]

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        centers = [y + h - h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)

    rows.sort(key=avg_height_of_center)

    # Crop each cell out of the original grayscale image, row by row.
    cell_images_rows = []
    for row in rows:
        cell_images_row = []
        for x, y, w, h in row:
            cell_images_row.append(image[y:y+h, x:x+w])
        cell_images_rows.append(cell_images_row)
    return cell_images_rows
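
The two morphological openings above are what isolate the table's grid lines. The following throwaway sketch is not part of the package; the 20x20 synthetic image and the kernel lengths are invented for illustration. It shows why opening with a wide, one-pixel-tall kernel keeps only long horizontal runs while discarding vertical lines and noise, and vice versa.

#+BEGIN_SRC python
import numpy as np
import cv2

# Synthetic 20x20 binary image: one full-width horizontal line, one full-height
# vertical line, and a single noise pixel.
img = np.zeros((20, 20), dtype=np.uint8)
img[5, :] = 255     # horizontal line on row 5
img[:, 12] = 255    # vertical line on column 12
img[10, 3] = 255    # isolated noise pixel

# Opening = erosion then dilation. A 10x1 kernel only preserves runs that are
# at least 10 pixels wide; a 1x10 kernel only preserves runs 10 pixels tall.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 1))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 10))
h_only = cv2.morphologyEx(img, cv2.MORPH_OPEN, horizontal_kernel)
v_only = cv2.morphologyEx(img, cv2.MORPH_OPEN, vertical_kernel)

print("horizontal line kept:", bool(h_only[5, :].any()))      # True
print("vertical line kept:", bool(h_only[4, 12]))              # False, dropped
print("noise pixel kept:", bool(h_only[10, 3]))                # False, dropped
print("vertical line kept in v_only:", bool(v_only[4, 12]))    # True
#+END_SRC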

@@ -0,0 +1,120 @@
import os
import sys
import cv2
from table_ocr.extract_cells import extract_cell_images_from_table


def main(f):
    results = []
    directory, filename = os.path.split(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            print(path)
if __name__ == "__main__":
    main(sys.argv[1])
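
Once tangled, the package can be run as a module with ~python -m table_ocr.extract_cells <table image>~, and it prints one cell image path per line. A hypothetical usage sketch, assuming the package is importable and the ~resources/examples/example-table.png~ image from earlier exists:

#+BEGIN_SRC python
import subprocess
import sys

# Run the tangled module on a cropped table image and collect the cell image
# paths it prints, one per line.
result = subprocess.run(
    [sys.executable, "-m", "table_ocr.extract_cells", "resources/examples/example-table.png"],
    capture_output=True,
    text=True,
    check=True,
)
cell_paths = result.stdout.splitlines()
print("Extracted {} cell images".format(len(cell_paths)))
#+END_SRC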

@@ -2,7 +2,8 @@ import os
import sys
import cv2
import pytesseract
+from table_ocr.extract_cells import extract_cell_images_from_table
def main(f):
    results = []
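
This last hunk imports the refactored function alongside pytesseract, so each extracted cell can be OCR'd in place. As a rough illustration only — the ~ocr_table~ helper and the ~--psm 7~ flag are assumptions, not the project's actual choices — that might look like:

#+BEGIN_SRC python
import cv2
import pytesseract

from table_ocr.extract_cells import extract_cell_images_from_table

def ocr_table(image_path):
    # Read the cropped table image and split it into per-cell images.
    table = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    rows = extract_cell_images_from_table(table)
    # OCR each cell; --psm 7 tells Tesseract to expect a single line of text.
    return [
        [pytesseract.image_to_string(cell, config="--psm 7").strip() for cell in row]
        for row in rows
    ]
#+END_SRC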
