diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index 812df8b..c95f1e5 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -22,6 +22,34 @@ output~ to a code block will minimize that noise. * Overview +This Python package provides utilities for extracting tabular data from PDF +files and images of tables. + +Given an image that contains a table... + +#+ATTR_HTML: :width 25% +[[file:resources/examples/example-page.png]] + +Extract the text into a CSV format... + +#+BEGIN_EXAMPLE +PRIZE,ODDS 1 IN:,# OF WINNERS* +$3,9.09,"282,447" +$5,16.66,"154,097" +$7,40.01,"64,169" +$10,26.67,"96,283" +$20,100.00,"25,677" +$30,290.83,"8,829" +$50,239.66,"10,714" +$100,919.66,"2,792" +$500,"6,652.07",386 +"$40,000","855,899.99",3 +1,i223, +Toa,, +,, +,,"* Based upon 2,567,700" +#+END_EXAMPLE + ** To get CSV data from a table in a scanned pdf document: #+BEGIN_SRC shell :results none :session *Shell* @@ -367,7 +395,8 @@ header bar or something. If we know our cells are all within a certain size (by area of pixels) then we can filter out the junk cells by removing cells above/below certain sizes. -#+BEGIN_SRC python :noweb-ref bounding-rects :results none +#+NAME: bounding-rects +#+BEGIN_SRC python :results none contours, heirarchy = cv2.findContours( mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE, ) @@ -409,7 +438,8 @@ of the rectangles that have a center that is within the top-y and bottom-y values of that top-left rectangle. Then we'll sort those rectangles by the x value of their center. We'll remove those rectangles from the list and repeat. 
-#+BEGIN_SRC python :noweb-ref sort-contours :results none +#+NAME: sort-contours +#+BEGIN_SRC python :results none def cell_in_same_row(c1, c2): c1_center = c1[1] + c1[3] - c1[3] / 2 c2_bottom = c2[1] + c2[3] @@ -479,7 +509,8 @@ cv2.imwrite("resources/examples/example-table-cells-numbered.png", image) #+ATTR_HTML: :width 500px :height 100% [[file:resources/examples/example-table-cells-numbered.png]] -#+BEGIN_SRC python :noweb-ref extract-cells-from-table :noweb yes :eval no +#+NAME: extract-cells-from-table +#+BEGIN_SRC python :noweb yes :eval no def extract_cell_images_from_table(image): <> <> @@ -547,14 +578,16 @@ def crop_to_text(image): # Get rid of little noise. kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel) + opened = cv2.dilate(opened, kernel) contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) bounding_rects = [cv2.boundingRect(c) for c in contours] NUM_PX_COMMA = 6 MIN_CHAR_AREA = 5 * 9 - if bounding_rects: + char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA] + if char_sized_bounding_rects: minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 - for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]: + for x, y, w, h in char_sized_bounding_rects: minx = min(minx, x) miny = min(miny, y) maxx = max(maxx, x + w) @@ -562,8 +595,8 @@ def crop_to_text(image): x, y, w, h = minx, miny, maxx - minx, maxy - miny cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)] else: - # If we morphed out all of the text, fallback to using the unmorphed image. - cropped = image + # If we morphed out all of the text, assume an empty image. 
+ cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) return bordered #+END_SRC @@ -571,6 +604,7 @@ def crop_to_text(image): #+HEADER: :post html-image-size(text=*this*, width="200px") #+BEGIN_SRC python :noweb no-export :results raw :exports both import cv2 +import numpy as np <> image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) image = crop_to_text(image) @@ -606,6 +640,7 @@ def ocr_image(image, config): #+BEGIN_SRC python :noweb no-export :exports both import pytesseract import cv2 +import numpy as np image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) <> <> @@ -884,6 +919,8 @@ if __name__ == "__main__": import math import cv2 +import numpy as np +import pytesseract <> <> @@ -902,7 +939,8 @@ import os import sys import cv2 -import pytesseract + +from table_ocr.ocr_image import crop_to_text, ocr_image description="""Takes a single argument that is the image to OCR. Remaining arguments are passed directly to Tesseract. @@ -913,9 +951,6 @@ Filenames are of the format for training with tesstrain.""" parser = argparse.ArgumentParser(description=description) parser.add_argument("image", help="filepath of image to perform OCR") -<> -<> - def main(image_file, tess_args): directory, filename = os.path.split(image_file) filename_sans_ext, ext = os.path.splitext(filename) diff --git a/table_ocr/ocr_image/__init__.py b/table_ocr/ocr_image/__init__.py index 1a13d1f..526202c 100644 --- a/table_ocr/ocr_image/__init__.py +++ b/table_ocr/ocr_image/__init__.py @@ -1,6 +1,8 @@ import math import cv2 +import numpy as np +import pytesseract def crop_to_text(image): MAX_COLOR_VAL = 255 @@ -27,14 +29,16 @@ def crop_to_text(image): # Get rid of little noise. 
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel) + opened = cv2.dilate(opened, kernel) contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) bounding_rects = [cv2.boundingRect(c) for c in contours] NUM_PX_COMMA = 6 MIN_CHAR_AREA = 5 * 9 - if bounding_rects: + char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA] + if char_sized_bounding_rects: minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 - for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]: + for x, y, w, h in char_sized_bounding_rects: minx = min(minx, x) miny = min(miny, y) maxx = max(maxx, x + w) @@ -42,8 +46,8 @@ def crop_to_text(image): x, y, w, h = minx, miny, maxx - minx, maxy - miny cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)] else: - # If we morphed out all of the text, fallback to using the unmorphed image. - cropped = image + # If we morphed out all of the text, assume an empty image. + cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8) bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) return bordered def ocr_image(image, config): diff --git a/table_ocr/ocr_image/__main__.py b/table_ocr/ocr_image/__main__.py index 387d37a..f849b53 100644 --- a/table_ocr/ocr_image/__main__.py +++ b/table_ocr/ocr_image/__main__.py @@ -4,7 +4,8 @@ import os import sys import cv2 -import pytesseract + +from table_ocr.ocr_image import crop_to_text, ocr_image description="""Takes a single argument that is the image to OCR. Remaining arguments are passed directly to Tesseract. 
@@ -15,56 +16,6 @@ Filenames are of the format for training with tesstrain.""" parser = argparse.ArgumentParser(description=description) parser.add_argument("image", help="filepath of image to perform OCR") -def crop_to_text(image): - MAX_COLOR_VAL = 255 - BLOCK_SIZE = 15 - SUBTRACT_FROM_MEAN = -2 - - img_bin = cv2.adaptiveThreshold( - ~image, - MAX_COLOR_VAL, - cv2.ADAPTIVE_THRESH_MEAN_C, - cv2.THRESH_BINARY, - BLOCK_SIZE, - SUBTRACT_FROM_MEAN, - ) - - img_h, img_w = image.shape - horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1)) - vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7))) - horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel) - vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel) - both = horizontal_lines + vertical_lines - cleaned = img_bin - both - - # Get rid of little noise. - kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) - opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel) - - contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) - bounding_rects = [cv2.boundingRect(c) for c in contours] - NUM_PX_COMMA = 6 - MIN_CHAR_AREA = 5 * 9 - if bounding_rects: - minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 - for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]: - minx = min(minx, x) - miny = min(miny, y) - maxx = max(maxx, x + w) - maxy = max(maxy, y + h) - x, y, w, h = minx, miny, maxx - minx, maxy - miny - cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)] - else: - # If we morphed out all of the text, fallback to using the unmorphed image. 
- cropped = image - bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) - return bordered -def ocr_image(image, config): - return pytesseract.image_to_string( - image, - config=config - ) - def main(image_file, tess_args): directory, filename = os.path.split(image_file) filename_sans_ext, ext = os.path.splitext(filename)