diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index b929d92..3d2f117 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -3,7 +3,7 @@ #+TITLE: PDF Parsing #+PROPERTY: header-args :session *Python* #+STARTUP: inlineimages -#+OPTIONS: ^:nil +#+OPTIONS: ^:nil H:4 #+BEGIN_COMMENT Some notes about the header for those not familiar with Org Mode: @@ -659,13 +659,13 @@ setuptools.setup( ** table_ocr *** table_ocr/__init__.py -#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none +#+BEGIN_SRC python :tangle table_ocr/__init__.py :results none :exports none #+END_SRC *** table_ocr/util.py -#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none +#+BEGIN_SRC python :tangle table_ocr/util.py :results none from contextlib import contextmanager import functools import logging @@ -687,11 +687,12 @@ def working_dir(directory): def make_tempdir(identifier): return tempfile.mkdtemp(prefix="{}_".format(identifier)) #+END_SRC + *** table_ocr/pdf_to_images/ **** table_ocr/pdf_to_images/__init__.py #+NAME: pdf_to_images/__init__.py -#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py -#+BEGIN_SRC python :noweb strip-export :results none +#+HEADER: :tangle table_ocr/pdf_to_images/__init__.py +#+BEGIN_SRC python :results none import os import re import subprocess @@ -764,14 +765,22 @@ import cv2 **** table_ocr/extract_tables/__main__.py -#+BEGIN_SRC shell -. ~/.virtualenvs/lotto_odds/bin/activate -python -m pdf.extract_tables "resources/examples/example-page.png" -#+END_SRC +Takes 1 or more image paths as arguments. -#+RESULTS: -| resources/examples/example-page.png | -| resources/examples/example-page-table-000.png | +Images are opened and read with OpenCV. + +Tables are detected and extracted to a new subdirectory of the given image. The +subdirectory will be the filename sans the extension. The tables inside that +directory will be named ~table-000.png~. 
+ +If you want to do something with the output, like pipe the paths of the +extracted tables into some other utility, here is a description of the output. + +For each image path given as an argument, outputs: + +1. The given image path +2. Paths of extracted tables; separated by newlines +3. Empty newline #+NAME: extract_tables/__main__.py #+BEGIN_SRC python :tangle table_ocr/extract_tables/__main__.py :results none @@ -816,19 +825,25 @@ if __name__ == "__main__": main(files) #+END_SRC -*** table_ocr/extract_cells_from_table.py +*** table_ocr/extract_cells/ -#+BEGIN_SRC shell :results none -. ~/.virtualenvs/lotto_odds/bin/activate -python -m pdf.extract_cells_from_table "resources/examples/example-table.png" +**** table_ocr/extract_cells/__init__.py + +#+BEGIN_SRC python :tangle table_ocr/extract_cells/__init__.py +import cv2 + +<> #+END_SRC -#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none +**** table_ocr/extract_cells/__main__.py + +#+BEGIN_SRC python :tangle table_ocr/extract_cells/__main__.py :results none import os import sys import cv2 -import pytesseract + +from table_ocr.extract_cells import extract_cell_images_from_table def main(f): results = [] diff --git a/table_ocr/extract_cells/__init__.py b/table_ocr/extract_cells/__init__.py new file mode 100644 index 0000000..4fed823 --- /dev/null +++ b/table_ocr/extract_cells/__init__.py @@ -0,0 +1,97 @@ +import cv2 + +def extract_cell_images_from_table(image): + BLUR_KERNEL_SIZE = (17, 17) + STD_DEV_X_DIRECTION = 0 + STD_DEV_Y_DIRECTION = 0 + blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION) + MAX_COLOR_VAL = 255 + BLOCK_SIZE = 15 + SUBTRACT_FROM_MEAN = -2 + + img_bin = cv2.adaptiveThreshold( + ~blurred, + MAX_COLOR_VAL, + cv2.ADAPTIVE_THRESH_MEAN_C, + cv2.THRESH_BINARY, + BLOCK_SIZE, + SUBTRACT_FROM_MEAN, + ) + vertical = horizontal = img_bin.copy() + SCALE = 5 + image_width, image_height = horizontal.shape + horizontal_kernel = 
cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1)) + horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel) + vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE))) + vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel) + + horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))) + vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))) + + mask = horizontally_dilated + vertically_dilated + contours, heirarchy = cv2.findContours( + mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE, + ) + + perimeter_lengths = [cv2.arcLength(c, True) for c in contours] + epsilons = [0.05 * p for p in perimeter_lengths] + approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)] + + # Filter out contours that aren't rectangular. Those that aren't rectangular + # are probably noise. + approx_rects = [p for p in approx_polys if len(p) == 4] + bounding_rects = [cv2.boundingRect(a) for a in approx_polys] + + # Filter out rectangles that are too narrow or too short. + MIN_RECT_WIDTH = 40 + MIN_RECT_HEIGHT = 10 + bounding_rects = [ + r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3] + ] + + # The largest bounding rectangle is assumed to be the entire table. + # Remove it from the list. We don't want to accidentally try to OCR + # the entire table. 
+ largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) + bounding_rects = [b for b in bounding_rects if b is not largest_rect] + + cells = [c for c in bounding_rects] + def cell_in_same_row(c1, c2): + c1_center = c1[1] + c1[3] - c1[3] / 2 + c2_bottom = c2[1] + c2[3] + c2_top = c2[1] + return c2_top < c1_center < c2_bottom + + orig_cells = [c for c in cells] + rows = [] + while cells: + first = cells[0] + rest = cells[1:] + cells_in_same_row = sorted( + [ + c for c in rest + if cell_in_same_row(c, first) + ], + key=lambda c: c[0] + ) + + row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0]) + rows.append(row_cells) + cells = [ + c for c in rest + if not cell_in_same_row(c, first) + ] + + # Sort rows by average height of their center. + def avg_height_of_center(row): + centers = [y + h - h / 2 for x, y, w, h in row] + return sum(centers) / len(centers) + + rows.sort(key=avg_height_of_center) + cell_images_rows = [] + for row in rows: + cell_images_row = [] + for x, y, w, h in row: + cell_images_row.append(image[y:y+h, x:x+w]) + cell_images_rows.append(cell_images_row) + return cell_images_rows diff --git a/table_ocr/extract_cells/__main__.py b/table_ocr/extract_cells/__main__.py new file mode 100644 index 0000000..f1daee2 --- /dev/null +++ b/table_ocr/extract_cells/__main__.py @@ -0,0 +1,120 @@ +import os +import sys + +import cv2 + +from table_ocr.extract_cells import extract_cell_images_from_table + +def main(f): + results = [] + directory, filename = os.path.split(f) + table = cv2.imread(f, cv2.IMREAD_GRAYSCALE) + rows = extract_cell_images_from_table(table) + cell_img_dir = os.path.join(directory, "cells") + os.makedirs(cell_img_dir, exist_ok=True) + for i, row in enumerate(rows): + for j, cell in enumerate(row): + cell_filename = "{:03d}-{:03d}.png".format(i, j) + path = os.path.join(cell_img_dir, cell_filename) + cv2.imwrite(path, cell) + print(path) + + +def extract_cell_images_from_table(image): + BLUR_KERNEL_SIZE = (17, 17) + 
STD_DEV_X_DIRECTION = 0 + STD_DEV_Y_DIRECTION = 0 + blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION) + MAX_COLOR_VAL = 255 + BLOCK_SIZE = 15 + SUBTRACT_FROM_MEAN = -2 + + img_bin = cv2.adaptiveThreshold( + ~blurred, + MAX_COLOR_VAL, + cv2.ADAPTIVE_THRESH_MEAN_C, + cv2.THRESH_BINARY, + BLOCK_SIZE, + SUBTRACT_FROM_MEAN, + ) + vertical = horizontal = img_bin.copy() + SCALE = 5 + image_width, image_height = horizontal.shape + horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1)) + horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel) + vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE))) + vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel) + + horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))) + vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))) + + mask = horizontally_dilated + vertically_dilated + contours, heirarchy = cv2.findContours( + mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE, + ) + + perimeter_lengths = [cv2.arcLength(c, True) for c in contours] + epsilons = [0.05 * p for p in perimeter_lengths] + approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)] + + # Filter out contours that aren't rectangular. Those that aren't rectangular + # are probably noise. + approx_rects = [p for p in approx_polys if len(p) == 4] + bounding_rects = [cv2.boundingRect(a) for a in approx_polys] + + # Filter out rectangles that are too narrow or too short. + MIN_RECT_WIDTH = 40 + MIN_RECT_HEIGHT = 10 + bounding_rects = [ + r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3] + ] + + # The largest bounding rectangle is assumed to be the entire table. + # Remove it from the list. We don't want to accidentally try to OCR + # the entire table. 
+ largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) + bounding_rects = [b for b in bounding_rects if b is not largest_rect] + + cells = [c for c in bounding_rects] + def cell_in_same_row(c1, c2): + c1_center = c1[1] + c1[3] - c1[3] / 2 + c2_bottom = c2[1] + c2[3] + c2_top = c2[1] + return c2_top < c1_center < c2_bottom + + orig_cells = [c for c in cells] + rows = [] + while cells: + first = cells[0] + rest = cells[1:] + cells_in_same_row = sorted( + [ + c for c in rest + if cell_in_same_row(c, first) + ], + key=lambda c: c[0] + ) + + row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0]) + rows.append(row_cells) + cells = [ + c for c in rest + if not cell_in_same_row(c, first) + ] + + # Sort rows by average height of their center. + def avg_height_of_center(row): + centers = [y + h - h / 2 for x, y, w, h in row] + return sum(centers) / len(centers) + + rows.sort(key=avg_height_of_center) + cell_images_rows = [] + for row in rows: + cell_images_row = [] + for x, y, w, h in row: + cell_images_row.append(image[y:y+h, x:x+w]) + cell_images_rows.append(cell_images_row) + return cell_images_rows + +if __name__ == "__main__": + main(sys.argv[1]) diff --git a/table_ocr/extract_cells_from_table.py b/table_ocr/extract_cells_from_table.py index 6d2cc4a..393a4d3 100644 --- a/table_ocr/extract_cells_from_table.py +++ b/table_ocr/extract_cells_from_table.py @@ -2,7 +2,8 @@ import os import sys import cv2 -import pytesseract + +from table_ocr.extract_cells import extract_cells_from_table def main(f): results = []