Refactor extract_cells into module

6 years ago · b911f87126
parent b9f088cf92
commit b911f87126
4 changed files with 252 additions and 19 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@ -3,7 +3,7 @@
 #+TITLE: PDF Parsing
 #+PROPERTY: header-args :session *Python*
 #+STARTUP: inlineimages
-#+OPTIONS: ^:nil
+#+OPTIONS: ^:nil H:4
 #+BEGIN_COMMENT
 Some notes about the header for those not familiar with Org Mode:
@ -659,13 +659,13 @@ setuptools.setup(
 ** table_ocr
 *** table_ocr/__init__.py
-#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :results none :exports none
 #+END_SRC
 *** table_ocr/util.py
-#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/util.py :results none
 from contextlib import contextmanager
 import functools
 import logging
@ -687,11 +687,12 @@ def working_dir(directory):
 def make_tempdir(identifier):
    return tempfile.mkdtemp(prefix="{}_".format(identifier))
 #+END_SRC
 *** table_ocr/pdf_to_images/
 **** table_ocr/pdf_to_images/__init__.py
 #+NAME: pdf_to_images/__init__.py
-#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
+#+HEADER: :tangle table_ocr/pdf_to_images/__init__.py
-#+BEGIN_SRC python :noweb strip-export :results none
+#+BEGIN_SRC python :results none
 import os
 import re
 import subprocess
@ -764,14 +765,22 @@ import cv2
 **** table_ocr/extract_tables/__main__.py
-#+BEGIN_SRC shell
+Takes 1 or more image paths as arguments.
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m pdf.extract_tables "resources/examples/example-page.png"
 #+END_SRC
-#+RESULTS:
+Images are opened and read with OpenCV.
-| resources/examples/example-page.png           |
+
-| resources/examples/example-page-table-000.png |
+Tables are detected and extracted to a new subdirectory of the given image. The
 subdirectory will be the filename sans the extension. The tables inside that
 directory will be named ~table-000.png~.
 If you want to do something with the output, like pipe the paths of the
 extracted tables into some other utility, here is a description of the output.
 For each image path given as an argument, outputs:
 1. The given image path
 2. Paths of extracted tables, separated by newlines
 3. Empty newline
 #+NAME: extract_tables/__main__.py
 #+BEGIN_SRC python :tangle table_ocr/extract_tables/__main__.py :results none
@ -816,19 +825,25 @@ if __name__ == "__main__":
    main(files)
 #+END_SRC
-*** table_ocr/extract_cells_from_table.py
+*** table_ocr/extract_cells/
-#+BEGIN_SRC shell :results none
+**** table_ocr/extract_cells/__init__.py
-. ~/.virtualenvs/lotto_odds/bin/activate
+
-python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
+#+BEGIN_SRC python :tangle table_ocr/extract_cells/__init__.py
 import cv2
 <<extract-cells-from-table>>
 #+END_SRC
-#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none
+**** table_ocr/extract_cells/__main__.py
 #+BEGIN_SRC python :tangle table_ocr/extract_cells/__main__.py :results none
 import os
 import sys
 import cv2
-import pytesseract
+
 from table_ocr.extract_cells import extract_cell_images_from_table
 def main(f):
    results = []
--- a/table_ocr/extract_cells/init.py
+++ b/table_ocr/extract_cells/init.py
@ -0,0 +1,97 @@
 import cv2
 def extract_cell_images_from_table(image):
    """Cut an image of a table into one sub-image per cell.

    The image is blurred, inverted and adaptively thresholded, then
    morphologically opened with long horizontal and vertical kernels to
    build a mask of the grid lines. Contours of that mask are approximated
    as polygons; four-cornered ones that are large enough are treated as
    cells, grouped into rows and ordered left to right.

    Args:
        image: 2-D (single-channel) image array containing one table.

    Returns:
        A list of rows ordered top to bottom. Each row is a list of slices
        of ``image`` (one per cell) ordered left to right. The list is
        empty when no cells are found.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    # sigmaY has to be passed by keyword: the fourth positional parameter of
    # cv2.GaussianBlur is the output array, not the Y standard deviation.
    blurred = cv2.GaussianBlur(
        image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, sigmaY=STD_DEV_Y_DIRECTION
    )
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # The image is inverted before thresholding, presumably so that dark ink
    # on a light page ends up as the non-zero foreground.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    SCALE = 5
    # Array shapes are (rows, columns), i.e. (height, width).
    image_height, image_width = img_bin.shape
    # Opening with a long, thin kernel keeps only strokes that are at least
    # that long in the kernel's direction. max(1, ...) keeps the kernel
    # size valid for images narrower/shorter than SCALE pixels.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, image_width // SCALE), 1)
    )
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, image_height // SCALE))
    )
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    # Stretch the detected lines along their own direction before combining.
    horizontally_dilated = cv2.dilate(
        horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    )
    vertically_dilated = cv2.dilate(
        vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))
    )
    mask = horizontally_dilated + vertically_dilated
    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; indexing from the end works on both.
    contours = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # Approximate each contour with a tolerance of 5% of its perimeter.
    approx_polys = [
        cv2.approxPolyDP(c, 0.05 * cv2.arcLength(c, True), True) for c in contours
    ]
    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]
    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]
    if not bounding_rects:
        # Nothing table-like was found; max() below would raise on an empty list.
        return []
    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    cells = [b for b in bounding_rects if b is not largest_rect]

    def vertical_center(rect):
        """Return the y coordinate of the middle of an (x, y, w, h) rectangle."""
        _, y, _, h = rect
        return y + h / 2

    def cell_in_same_row(c1, c2):
        """Return True when c1's vertical center lies strictly inside c2's vertical span."""
        return c2[1] < vertical_center(c1) < c2[1] + c2[3]

    # Repeatedly take the first remaining cell, pull every cell that shares
    # its row out of the remainder, and continue with what is left.
    rows = []
    while cells:
        first, rest = cells[0], cells[1:]
        same_row = [c for c in rest if cell_in_same_row(c, first)]
        cells = [c for c in rest if not cell_in_same_row(c, first)]
        rows.append(sorted([first] + same_row, key=lambda c: c[0]))

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        return sum(vertical_center(c) for c in row) / len(row)

    rows.sort(key=avg_height_of_center)
    return [[image[y:y + h, x:x + w] for x, y, w, h in row] for row in rows]
--- a/table_ocr/extract_cells/main.py
+++ b/table_ocr/extract_cells/main.py
@ -0,0 +1,120 @@
 import os
 import sys
 import cv2
 from table_ocr.extract_cells import extract_cell_images_from_table
 def main(f):
    """Extract the cells of the table image at path ``f`` and save each as a PNG.

    The image is read as grayscale, split into cells with
    ``extract_cell_images_from_table``, and every cell is written to a
    ``cells`` directory next to the input file as ``RRR-CCC.png`` (zero-padded
    row and column index). The path of each written file is printed on its
    own line.

    Args:
        f: Path of the table image.

    Raises:
        OSError: If the image at ``f`` cannot be read.
    """
    directory = os.path.dirname(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    if table is None:
        # cv2.imread signals an unreadable file by returning None instead of
        # raising; fail here with a clear message rather than deep inside OpenCV.
        raise OSError("Could not read image: {}".format(f))
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            print(path)
 def extract_cell_images_from_table(image):
    """Cut an image of a table into one sub-image per cell.

    The image is blurred, inverted and adaptively thresholded, then
    morphologically opened with long horizontal and vertical kernels to
    build a mask of the grid lines. Contours of that mask are approximated
    as polygons; four-cornered ones that are large enough are treated as
    cells, grouped into rows and ordered left to right.

    Args:
        image: 2-D (single-channel) image array containing one table.

    Returns:
        A list of rows ordered top to bottom. Each row is a list of slices
        of ``image`` (one per cell) ordered left to right. The list is
        empty when no cells are found.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    # sigmaY has to be passed by keyword: the fourth positional parameter of
    # cv2.GaussianBlur is the output array, not the Y standard deviation.
    blurred = cv2.GaussianBlur(
        image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, sigmaY=STD_DEV_Y_DIRECTION
    )
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # The image is inverted before thresholding, presumably so that dark ink
    # on a light page ends up as the non-zero foreground.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    SCALE = 5
    # Array shapes are (rows, columns), i.e. (height, width).
    image_height, image_width = img_bin.shape
    # Opening with a long, thin kernel keeps only strokes that are at least
    # that long in the kernel's direction. max(1, ...) keeps the kernel
    # size valid for images narrower/shorter than SCALE pixels.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, image_width // SCALE), 1)
    )
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, image_height // SCALE))
    )
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    # Stretch the detected lines along their own direction before combining.
    horizontally_dilated = cv2.dilate(
        horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    )
    vertically_dilated = cv2.dilate(
        vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))
    )
    mask = horizontally_dilated + vertically_dilated
    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; indexing from the end works on both.
    contours = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # Approximate each contour with a tolerance of 5% of its perimeter.
    approx_polys = [
        cv2.approxPolyDP(c, 0.05 * cv2.arcLength(c, True), True) for c in contours
    ]
    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]
    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]
    if not bounding_rects:
        # Nothing table-like was found; max() below would raise on an empty list.
        return []
    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    cells = [b for b in bounding_rects if b is not largest_rect]

    def vertical_center(rect):
        """Return the y coordinate of the middle of an (x, y, w, h) rectangle."""
        _, y, _, h = rect
        return y + h / 2

    def cell_in_same_row(c1, c2):
        """Return True when c1's vertical center lies strictly inside c2's vertical span."""
        return c2[1] < vertical_center(c1) < c2[1] + c2[3]

    # Repeatedly take the first remaining cell, pull every cell that shares
    # its row out of the remainder, and continue with what is left.
    rows = []
    while cells:
        first, rest = cells[0], cells[1:]
        same_row = [c for c in rest if cell_in_same_row(c, first)]
        cells = [c for c in rest if not cell_in_same_row(c, first)]
        rows.append(sorted([first] + same_row, key=lambda c: c[0]))

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        return sum(vertical_center(c) for c in row) / len(row)

    rows.sort(key=avg_height_of_center)
    return [[image[y:y + h, x:x + w] for x, y, w, h in row] for row in rows]
 if __name__ == "__main__":
    # Script entry point: the first command-line argument is the path of the
    # table image to split into cells.
    table_path = sys.argv[1]
    main(table_path)
--- a/table_ocr/extract_cells_from_table.py
+++ b/table_ocr/extract_cells_from_table.py
@ -2,7 +2,8 @@ import os
 import sys
 import cv2
-import pytesseract
+
 from table_ocr.extract_cells import extract_cells_from_table
 def main(f):
    results = []