Add example image and csv output

Give more code blocks names
main
Eric Ihli 4 years ago
parent 4eca593944
commit 6891fc9990

@@ -22,6 +22,34 @@ output~ to a code block will minimize that noise.
* Overview
This Python package provides utilities for extracting tabular data from PDF
files and images of tables.
Given an image that contains a table...
#+ATTR_HTML: :width 25%
[[file:resources/examples/example-page.png]]
Extract the text into a CSV format...
#+BEGIN_EXAMPLE
PRIZE,ODDS 1 IN:,# OF WINNERS*
$3,9.09,"282,447"
$5,16.66,"154,097"
$7,40.01,"64,169"
$10,26.67,"96,283"
$20,100.00,"25,677"
$30,290.83,"8,829"
$50,239.66,"10,714"
$100,919.66,"2,792"
$500,"6,652.07",386
"$40,000","855,899.99",3
1,i223,
Toa,,
,,
,,"* Based upon 2,567,700"
#+END_EXAMPLE
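For a quick sanity check from Python, the functions defined later in this
document can be used directly on a single extracted cell image. This is a
minimal sketch; the ~--psm 7~ Tesseract config is an assumption.
#+BEGIN_SRC python :eval no
import cv2

from table_ocr.ocr_image import crop_to_text, ocr_image

# One of the cell images produced by the extraction steps below.
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
# Tighten the crop around the text, then hand the cell to Tesseract.
print(ocr_image(crop_to_text(image), "--psm 7"))
#+END_SRC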
** To get CSV data from a table in a scanned pdf document:
#+BEGIN_SRC shell :results none :session *Shell*
@@ -367,7 +395,8 @@ header bar or something. If we know our cells are all within a certain size (by
area of pixels) then we can filter out the junk cells by removing cells
above/below certain sizes.
#+BEGIN_SRC python :noweb-ref bounding-rects :results none
#+NAME: bounding-rects
#+BEGIN_SRC python :results none
contours, hierarchy = cv2.findContours(
mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
)
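# Hedged sketch of the size filter described above; not part of the original
# block, and the MIN/MAX area bounds are assumptions. The idea is to keep only
# contour bounding boxes that are plausibly cell-sized and drop specks, header
# bars, and the outer table border.
bounding_rects = [cv2.boundingRect(c) for c in contours]
MIN_CELL_AREA = 20 * 20
MAX_CELL_AREA = 0.5 * mask.shape[0] * mask.shape[1]
cell_rects = [
    (x, y, w, h)
    for x, y, w, h in bounding_rects
    if MIN_CELL_AREA < w * h < MAX_CELL_AREA
]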
@@ -409,7 +438,8 @@ of the rectangles that have a center that is within the top-y and bottom-y
values of that top-left rectangle. Then we'll sort those rectangles by the x
value of their center. We'll remove those rectangles from the list and repeat.
#+BEGIN_SRC python :noweb-ref sort-contours :results none
#+NAME: sort-contours
#+BEGIN_SRC python :results none
def cell_in_same_row(c1, c2):
c1_center = c1[1] + c1[3] - c1[3] / 2
c2_bottom = c2[1] + c2[3]
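# Hedged sketch of the row-grouping loop described above; not part of the
# original block. It assumes cell_in_same_row(c, anchor) returns True when
# c's vertical center lies between anchor's top and bottom edges.
def group_rects_into_rows(rects):
    rows = []
    remaining = sorted(rects, key=lambda r: (r[1], r[0]))  # roughly top-left first
    while remaining:
        anchor = remaining[0]
        row = [r for r in remaining if cell_in_same_row(r, anchor)]
        rows.append(sorted(row, key=lambda r: r[0] + r[2] / 2))  # left to right
        remaining = [r for r in remaining if not cell_in_same_row(r, anchor)]
    return rows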
@@ -479,7 +509,8 @@ cv2.imwrite("resources/examples/example-table-cells-numbered.png", image)
#+ATTR_HTML: :width 500px :height 100%
[[file:resources/examples/example-table-cells-numbered.png]]
#+BEGIN_SRC python :noweb-ref extract-cells-from-table :noweb yes :eval no
#+NAME: extract-cells-from-table
#+BEGIN_SRC python :noweb yes :eval no
def extract_cell_images_from_table(image):
<<blur>>
<<threshold>>
@@ -547,14 +578,16 @@ def crop_to_text(image):
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
opened = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9
if bounding_rects:
char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
if char_sized_bounding_rects:
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
for x, y, w, h in char_sized_bounding_rects:
minx = min(minx, x)
miny = min(miny, y)
maxx = max(maxx, x + w)
@@ -562,8 +595,8 @@ def crop_to_text(image):
x, y, w, h = minx, miny, maxx - minx, maxy - miny
cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else:
# If we morphed out all of the text, fallback to using the unmorphed image.
cropped = image
# If we morphed out all of the text, assume an empty image.
cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered
#+END_SRC
@@ -571,6 +604,7 @@ def crop_to_text(image):
#+HEADER: :post html-image-size(text=*this*, width="200px")
#+BEGIN_SRC python :noweb no-export :results raw :exports both
import cv2
import numpy as np
<<crop-to-text>>
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
image = crop_to_text(image)
@@ -606,6 +640,7 @@ def ocr_image(image, config):
#+BEGIN_SRC python :noweb no-export :exports both
import pytesseract
import cv2
import numpy as np
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
<<crop-to-text>>
<<ocr-image>>
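# Hedged continuation: crop the cell to its text and OCR it. The "--psm 7"
# (single text line) Tesseract config is an assumption, not part of this hunk.
print(ocr_image(crop_to_text(image), "--psm 7"))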
@@ -884,6 +919,8 @@ if __name__ == "__main__":
import math
import cv2
import numpy as np
import pytesseract
<<crop-to-text>>
<<ocr-image>>
@@ -902,7 +939,8 @@ import os
import sys
import cv2
import pytesseract
from table_ocr.ocr_image import crop_to_text, ocr_image
description="""Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract.
@@ -913,9 +951,6 @@ Filenames are of the format for training with tesstrain."""
parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR")
<<crop-to-text>>
<<ocr-image>>
def main(image_file, tess_args):
directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename)

@@ -1,6 +1,8 @@
import math
import cv2
import numpy as np
import pytesseract
def crop_to_text(image):
MAX_COLOR_VAL = 255
@@ -27,14 +29,16 @@ def crop_to_text(image):
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
opened = cv2.dilate(opened, kernel)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9
if bounding_rects:
char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
if char_sized_bounding_rects:
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
for x, y, w, h in char_sized_bounding_rects:
minx = min(minx, x)
miny = min(miny, y)
maxx = max(maxx, x + w)
@@ -42,8 +46,8 @@ def crop_to_text(image):
x, y, w, h = minx, miny, maxx - minx, maxy - miny
cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else:
# If we morphed out all of the text, fallback to using the unmorphed image.
cropped = image
# If we morphed out all of the text, assume an empty image.
cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered
def ocr_image(image, config):

@@ -4,7 +4,8 @@ import os
import sys
import cv2
import pytesseract
from table_ocr.ocr_image import crop_to_text, ocr_image
description="""Takes a single argument that is the image to OCR.
Remaining arguments are passed directly to Tesseract.
@@ -15,56 +16,6 @@ Filenames are of the format for training with tesstrain."""
parser = argparse.ArgumentParser(description=description)
parser.add_argument("image", help="filepath of image to perform OCR")
def crop_to_text(image):
MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2
img_bin = cv2.adaptiveThreshold(
~image,
MAX_COLOR_VAL,
cv2.ADAPTIVE_THRESH_MEAN_C,
cv2.THRESH_BINARY,
BLOCK_SIZE,
SUBTRACT_FROM_MEAN,
)
img_h, img_w = image.shape
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
both = horizontal_lines + vertical_lines
cleaned = img_bin - both
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9
if bounding_rects:
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
minx = min(minx, x)
miny = min(miny, y)
maxx = max(maxx, x + w)
maxy = max(maxy, y + h)
x, y, w, h = minx, miny, maxx - minx, maxy - miny
cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else:
# If we morphed out all of the text, fallback to using the unmorphed image.
cropped = image
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
return bordered
def ocr_image(image, config):
return pytesseract.image_to_string(
image,
config=config
)
def main(image_file, tess_args):
directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename)
