Refactor table extraction into module

main
Eric Ihli 5 years ago
parent 98ef6ffd85
commit b9f088cf92

@ -66,29 +66,10 @@ probably aren't necessary.
This code calls out to [[https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html][pdfimages]] from [[https://poppler.freedesktop.org/][Poppler]]. This code calls out to [[https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html][pdfimages]] from [[https://poppler.freedesktop.org/][Poppler]].
#+NAME: pdf-to-images-overview
#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
#+BEGIN_SRC python :noweb strip-export :results none
import os
import re
import subprocess
from table_ocr.util import get_logger, working_dir
logger = get_logger(__name__)
# Wrapper around the Poppler command line utility "pdfimages" and helpers for
# finding the output files of that command.
<<pdf-to-images>>
# Helpers to detect orientation of the images that Poppler extracted and if the
# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
# rotation. (Makes OCR more straightforward.)
<<fix-orientation>>
#+END_SRC
#+NAME: pdf-to-images #+NAME: pdf-to-images
#+BEGIN_SRC python :results none #+BEGIN_SRC python :results none
# Wrapper around the Poppler command line utility "pdfimages" and helpers for
# finding the output files of that command.
def pdf_to_images(pdf_filepath): def pdf_to_images(pdf_filepath):
""" """
Turn a pdf into images Turn a pdf into images
@ -148,6 +129,10 @@ Script: Latin
Script confidence: 2.44 Script confidence: 2.44
#+END_EXAMPLE #+END_EXAMPLE
The following are some helpers to detect orientation of the images that Poppler
extracted and, if the images are rotated or skewed, use ImageMagick's `mogrify`
to correct the rotation. This makes OCR more straightforward.
#+NAME: fix-orientation #+NAME: fix-orientation
#+BEGIN_SRC python :results none #+BEGIN_SRC python :results none
def preprocess_img(filepath): def preprocess_img(filepath):
@ -189,7 +174,8 @@ The blurring, thresholding, and line detection is used here as well as later on
for cell extraction. They are good techniques for cleaning an image up in a way for cell extraction. They are good techniques for cleaning an image up in a way
that makes things like shape detection more accurate. that makes things like shape detection more accurate.
#+BEGIN_SRC python :noweb-ref detect-table :results none :noweb no-export #+NAME: detect-tables
#+BEGIN_SRC python :results none :noweb yes
def find_tables(image): def find_tables(image):
<<blur>> <<blur>>
<<threshold>> <<threshold>>
@ -215,10 +201,10 @@ def find_tables(image):
#+END_SRC #+END_SRC
#+HEADER: :post html-image-size(text=*this*, width="500px") #+HEADER: :post html-image-size(text=*this*, width="500px")
#+BEGIN_SRC python :noweb-ref test-detect-table :noweb no-export :results raw #+BEGIN_SRC python :noweb-ref test-detect-table :noweb strip-export :results raw
import cv2 import cv2
<<detect-table>> <<detect-tables>>
image_filename = "resources/examples/example-page.png" image_filename = "resources/examples/example-page.png"
image = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE) image = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE)
@ -701,8 +687,25 @@ def working_dir(directory):
def make_tempdir(identifier): def make_tempdir(identifier):
return tempfile.mkdtemp(prefix="{}_".format(identifier)) return tempfile.mkdtemp(prefix="{}_".format(identifier))
#+END_SRC #+END_SRC
*** table_ocr/pdf_to_images/
**** table_ocr/pdf_to_images/__init__.py
#+NAME: pdf_to_images/__init__.py
#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
#+BEGIN_SRC python :noweb strip-export :results none
import os
import re
import subprocess
from table_ocr.util import get_logger, working_dir
logger = get_logger(__name__)
<<pdf-to-images>>
<<fix-orientation>>
#+END_SRC
*** table_ocr/pdf_to_images/__main__.py **** table_ocr/pdf_to_images/__main__.py
Takes a variable number of pdf files and creates images out of each page of the Takes a variable number of pdf files and creates images out of each page of the
file using ~pdfimages~ from Poppler. Images are created in the same directory file using ~pdfimages~ from Poppler. Images are created in the same directory
@ -748,7 +751,18 @@ if __name__ == "__main__":
main(args.files) main(args.files)
#+END_SRC #+END_SRC
*** table_ocr/extract_tables.py *** table_ocr/extract_tables/
**** table_ocr/extract_tables/__init__.py
#+NAME: extract_tables/__init__.py
#+HEADER: :tangle table_ocr/extract_tables/__init__.py
#+BEGIN_SRC python
import cv2
<<detect-tables>>
#+END_SRC
**** table_ocr/extract_tables/__main__.py
#+BEGIN_SRC shell #+BEGIN_SRC shell
. ~/.virtualenvs/lotto_odds/bin/activate . ~/.virtualenvs/lotto_odds/bin/activate
@ -759,12 +773,15 @@ python -m pdf.extract_tables "resources/examples/example-page.png"
| resources/examples/example-page.png | | resources/examples/example-page.png |
| resources/examples/example-page-table-000.png | | resources/examples/example-page-table-000.png |
#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_tables.py :results none #+NAME: extract_tables/__main__.py
#+BEGIN_SRC python :tangle table_ocr/extract_tables/__main__.py :results none
import argparse import argparse
import os import os
import cv2 import cv2
from table_ocr.extract_tables import find_tables
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+") parser.add_argument("files", nargs="+")
@ -781,7 +798,9 @@ def main(files):
os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True) os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
for i, table in enumerate(tables): for i, table in enumerate(tables):
table_filename = "table-{:03d}.png".format(i) table_filename = "table-{:03d}.png".format(i)
table_filepath = os.path.join(directory, filename_sans_extension, table_filename) table_filepath = os.path.join(
directory, filename_sans_extension, table_filename
)
files.append(table_filepath) files.append(table_filepath)
cv2.imwrite(table_filepath, table) cv2.imwrite(table_filepath, table)
if tables: if tables:
@ -790,7 +809,6 @@ def main(files):
for image_filename, table_filenames in results: for image_filename, table_filenames in results:
print("\n".join(table_filenames)) print("\n".join(table_filenames))
<<detect-table>>
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()

@ -3,6 +3,8 @@ import os
import cv2 import cv2
from table_ocr.extract_tables import find_tables
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+") parser.add_argument("files", nargs="+")
@ -19,7 +21,9 @@ def main(files):
os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True) os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
for i, table in enumerate(tables): for i, table in enumerate(tables):
table_filename = "table-{:03d}.png".format(i) table_filename = "table-{:03d}.png".format(i)
table_filepath = os.path.join(directory, filename_sans_extension, table_filename) table_filepath = os.path.join(
directory, filename_sans_extension, table_filename
)
files.append(table_filepath) files.append(table_filepath)
cv2.imwrite(table_filepath, table) cv2.imwrite(table_filepath, table)
if tables: if tables:
@ -28,53 +32,6 @@ def main(files):
for image_filename, table_filenames in results: for image_filename, table_filenames in results:
print("\n".join(table_filenames)) print("\n".join(table_filenames))
def find_tables(image):
    """
    Find table-like regions in a single-channel image and return them cropped.

    The image is blurred and adaptively thresholded, long horizontal and
    vertical strokes are isolated with morphological opening, and every
    external contour of the combined stroke mask whose area exceeds
    ``MIN_TABLE_AREA`` is reduced to its bounding rectangle.

    :param image: 2-D image array (e.g. the result of
        ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``).
    :return: list of sub-arrays of ``image``, one per detected region; empty
        when nothing large enough is found.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(
        image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION
    )

    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # The blurred image is bitwise-inverted before thresholding.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    SCALE = 5
    # NumPy shapes are (rows, cols), i.e. (height, width), whereas OpenCV
    # kernel sizes are given as (width, height).
    image_height, image_width = img_bin.shape
    # Clamp to 1 so that very small images never produce a zero-sized kernel.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, image_width // SCALE), 1)
    )
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, image_height // SCALE))
    )
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(
        horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    )
    vertically_dilated = cv2.dilate(
        vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))
    )

    mask = horizontally_dilated + vertically_dilated
    # findContours returns (image, contours, hierarchy) on OpenCV 3.x and
    # (contours, hierarchy) on 2.x/4.x; the contours are always second to last.
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )[-2]

    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside this bounding rectangle.
    # A table should have a lot of intersections. We might have a rectangular image
    # here though which would only have 4 intersections, 1 at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
    return images
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()

@ -0,0 +1,49 @@
import cv2
def find_tables(image):
    """
    Find table-like regions in a single-channel image and return them cropped.

    The image is blurred and adaptively thresholded, long horizontal and
    vertical strokes are isolated with morphological opening, and every
    external contour of the combined stroke mask whose area exceeds
    ``MIN_TABLE_AREA`` is reduced to its bounding rectangle.

    :param image: 2-D image array (e.g. the result of
        ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``).
    :return: list of sub-arrays of ``image``, one per detected region; empty
        when nothing large enough is found.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(
        image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION
    )

    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # The blurred image is bitwise-inverted before thresholding.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    SCALE = 5
    # NumPy shapes are (rows, cols), i.e. (height, width), whereas OpenCV
    # kernel sizes are given as (width, height).
    image_height, image_width = img_bin.shape
    # Clamp to 1 so that very small images never produce a zero-sized kernel.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, image_width // SCALE), 1)
    )
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, image_height // SCALE))
    )
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(
        horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    )
    vertically_dilated = cv2.dilate(
        vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))
    )

    mask = horizontally_dilated + vertically_dilated
    # findContours returns (image, contours, hierarchy) on OpenCV 3.x and
    # (contours, hierarchy) on 2.x/4.x; the contours are always second to last.
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )[-2]

    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside this bounding rectangle.
    # A table should have a lot of intersections. We might have a rectangular image
    # here though which would only have 4 intersections, 1 at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
    return images

@ -0,0 +1,39 @@
import argparse
import os
import cv2
from table_ocr.extract_tables import find_tables
# Command-line interface: one or more paths, each of which main() reads with
# cv2.imread.
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
    """
    Extract the tables found in each image and save each one as a PNG.

    For an input ``<dir>/<name>.<ext>`` that contains tables, the crops are
    written to ``<dir>/<name>/table-000.png``, ``table-001.png``, ... and the
    written paths are printed to stdout, one per line.  Images without any
    detected table produce no directory and no output.

    :param files: iterable of image file paths.
    :raises ValueError: if an image cannot be read by ``cv2.imread``.
    """
    results = []
    for f in files:
        directory, filename = os.path.split(f)
        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread signals a missing/undecodable file by returning None;
            # fail here with a clear message instead of deep inside OpenCV.
            raise ValueError("Could not read image file: {}".format(f))
        tables = find_tables(image)
        if not tables:
            continue

        filename_sans_extension = os.path.splitext(filename)[0]
        output_directory = os.path.join(directory, filename_sans_extension)
        os.makedirs(output_directory, exist_ok=True)

        # Kept under its own name so the `files` argument is not rebound
        # while it is being iterated.
        table_filepaths = []
        for i, table in enumerate(tables):
            table_filename = "table-{:03d}.png".format(i)
            table_filepath = os.path.join(output_directory, table_filename)
            table_filepaths.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        results.append((f, table_filepaths))

    for image_filename, table_filenames in results:
        print("\n".join(table_filenames))
# Script entry point: parse the command line and process the given files.
if __name__ == "__main__":
    args = parser.parse_args()
    files = args.files
    main(files)

@ -48,9 +48,6 @@ def find_matching_files_in_dir(file_prefix, directory):
] ]
return files return files
# Helpers to detect orientation of the images that Poppler extracted and if the
# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
# rotation. (Makes OCR more straightforward.)
def preprocess_img(filepath): def preprocess_img(filepath):
""" """
Processing that involves running shell executables, Processing that involves running shell executables,

Loading…
Cancel
Save