@@ -22,6 +22,34 @@ output~ to a code block will minimize that noise.
* Overview

This Python package provides utilities for extracting tabular data from PDF
files and images of tables.

Given an image that contains a table...

#+ATTR_HTML: :width 25%
[[file:resources/examples/example-page.png]]

Extract the text into a CSV format...

#+BEGIN_EXAMPLE
PRIZE,ODDS 1 IN:,# OF WINNERS*
$3,9.09,"282,447"
$5,16.66,"154,097"
$7,40.01,"64,169"
$10,26.67,"96,283"
$20,100.00,"25,677"
$30,290.83,"8,829"
$50,239.66,"10,714"
$100,919.66,"2,792"
$500,"6,652.07",386
"$40,000","855,899.99",3
1,i223,
Toa,,
,,
,,"* Based upon 2,567,700"
#+END_EXAMPLE
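
Given an image of just the table, the flow this document builds up is roughly:
cut the table into per-cell images, crop each cell down to its text, OCR each
cell, and join the results into CSV rows. The next section shows the actual
shell commands; the snippet below is only a rough sketch of that flow in
Python, assuming ~extract_cell_images_from_table~ (defined later in this
document) returns the cell images grouped by row and that ~ocr_image~ returns
the recognized text. The input path here is purely illustrative.

#+BEGIN_SRC python :eval no
# Rough sketch only; see the shell pipeline in the next section for real usage.
import csv
import sys

import cv2

# extract_cell_images_from_table, crop_to_text, and ocr_image are defined later
# in this document. "--psm 7" asks Tesseract to treat each cell as a single line.
# Illustrative input; extract_cell_images_from_table expects an image of just the table.
table = cv2.imread("resources/examples/example-page.png", cv2.IMREAD_GRAYSCALE)
writer = csv.writer(sys.stdout)
for row in extract_cell_images_from_table(table):
    writer.writerow(ocr_image(crop_to_text(cell), "--psm 7").strip() for cell in row)
#+END_SRC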

** To get CSV data from a table in a scanned pdf document:

#+BEGIN_SRC shell :results none :session *Shell*
@@ -367,7 +395,8 @@ header bar or something. If we know our cells are all within a certain size (by
area of pixels) then we can filter out the junk cells by removing cells
above/below certain sizes.
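
For instance, treating each bounding rectangle as an ~(x, y, w, h)~ tuple, the
size filter can be a simple comprehension over width and height bounds. This is
just a sketch; ~filter_cell_sized_rects~ and its bounds are invented here for
illustration and would need tuning to the resolution of the scanned tables.

#+BEGIN_SRC python :eval no
# Hypothetical helper, not part of the tangled package code.
def filter_cell_sized_rects(bounding_rects, min_wh=(20, 10), max_wh=(500, 120)):
    (min_w, min_h), (max_w, max_h) = min_wh, max_wh
    return [
        (x, y, w, h)
        for x, y, w, h in bounding_rects
        if min_w <= w <= max_w and min_h <= h <= max_h
    ]
#+END_SRC
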
#+NAME: bounding-rects
#+BEGIN_SRC python :results none
contours, hierarchy = cv2.findContours(
    mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
)
@@ -409,7 +438,8 @@ of the rectangles that have a center that is within the top-y and bottom-y
values of that top-left rectangle. Then we'll sort those rectangles by the x
value of their center. We'll remove those rectangles from the list and repeat.
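
In code, that row-grouping loop looks roughly like the sketch below. It is a
self-contained illustration of the procedure just described, not the tangled
~sort-contours~ block, which builds the same idea out of helpers such as
~cell_in_same_row~.

#+BEGIN_SRC python :eval no
# Sketch of the described procedure; group_rects_into_rows is a hypothetical helper.
def group_rects_into_rows(rects):
    # rects are (x, y, w, h) tuples; start from the top-most rectangle each pass.
    rects = sorted(rects, key=lambda r: (r[1], r[0]))
    rows = []
    while rects:
        top = rects[0]
        top_y, bottom_y = top[1], top[1] + top[3]
        # Same row: the rectangle's vertical center falls within the top rect's extent.
        row = [r for r in rects if top_y <= r[1] + r[3] / 2 <= bottom_y]
        row.sort(key=lambda r: r[0] + r[2] / 2)  # order cells left-to-right by center x
        rows.append(row)
        rects = [r for r in rects if r not in row]
    return rows
#+END_SRC
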
#+NAME: sort-contours
#+BEGIN_SRC python :results none
def cell_in_same_row(c1, c2):
    c1_center = c1[1] + c1[3] - c1[3] / 2
    c2_bottom = c2[1] + c2[3]
@@ -479,7 +509,8 @@ cv2.imwrite("resources/examples/example-table-cells-numbered.png", image)

#+ATTR_HTML: :width 500px :height 100%
[[file:resources/examples/example-table-cells-numbered.png]]

#+NAME: extract-cells-from-table
#+BEGIN_SRC python :noweb yes :eval no
def extract_cell_images_from_table(image):
    <<blur>>
    <<threshold>>
@@ -547,14 +578,16 @@ def crop_to_text(image):
    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    opened = cv2.dilate(opened, kernel)

    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    # A few extra pixels below the text so descenders (e.g. commas) aren't cropped off.
    NUM_PX_COMMA = 6
    MIN_CHAR_AREA = 5 * 9
    # Ignore contours too small to plausibly be characters.
    char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
    if char_sized_bounding_rects:
        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
        for x, y, w, h in char_sized_bounding_rects:
            minx = min(minx, x)
            miny = min(miny, y)
            maxx = max(maxx, x + w)
@@ -562,8 +595,8 @@ def crop_to_text(image):
        x, y, w, h = minx, miny, maxx - minx, maxy - miny
        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
    else:
        # If we morphed out all of the text, assume an empty image.
        cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
#+END_SRC
@@ -571,6 +604,7 @@ def crop_to_text(image):
#+HEADER: :post html-image-size(text=*this*, width="200px")
#+BEGIN_SRC python :noweb no-export :results raw :exports both
import cv2
import numpy as np
<<crop-to-text>>
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
image = crop_to_text(image)
@@ -606,6 +640,7 @@ def ocr_image(image, config):
#+BEGIN_SRC python :noweb no-export :exports both
import pytesseract
import cv2
import numpy as np
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
<<crop-to-text>>
<<ocr-image>>
@@ -884,6 +919,8 @@ if __name__ == "__main__":
import math

import cv2
import numpy as np
import pytesseract

<<crop-to-text>>
<<ocr-image>>
@@ -902,7 +939,8 @@ import os
import sys

import cv2
import pytesseract

from table_ocr.ocr_image import crop_to_text, ocr_image

description="""Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract.
@@ -913,9 +951,6 @@ Filenames are of the format for training with tesstrain."""
parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR")

def main(image_file, tess_args):
    directory, filename = os.path.split(image_file)
    filename_sans_ext, ext = os.path.splitext(filename)