diff --git a/pdf_table_extraction_and_ocr.html b/pdf_table_extraction_and_ocr.html index 83dcee6..703f770 100644 --- a/pdf_table_extraction_and_ocr.html +++ b/pdf_table_extraction_and_ocr.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - + PDF Parsing @@ -225,53 +225,53 @@

Table of Contents

-
-

1 Preparing our data

+
+

1 Preparing our data

-
-

1.1 Converting PDFs to images

+
+

1.1 Converting PDFs to images

Not all pdfs need to be sent through OCR to extract the text content. If you can @@ -284,7 +284,7 @@ This code calls out to -

def pdf_to_images(pdf_filepath):
+
def pdf_to_images(pdf_filepath):
     """
     Turn a pdf into images
     """
@@ -323,8 +323,8 @@ This code calls out to 
-

1.2 Detecting image orientation and applying rotation.

+
-
def preprocess_img(filepath):
+
def preprocess_img(filepath):
     """
     Processing that involves running shell executables,
     like mogrify to rotate.
@@ -376,8 +376,8 @@ Script confidence: 2.44
 
-
-

2 Detecting tables

+
+

2 Detecting tables

This answer from opencv.org was heavily referenced while writing the code around @@ -435,11 +435,16 @@ cv2.imwrite("resources/examples/example-table.png" "resources/examples/example-table.png"

+ +
+

example-table.png +

+
-
-

3 OCR tables

+
+

3 OCR tables

Find the bounding box of each cell in the table. Run tesseract on each cell. @@ -451,8 +456,8 @@ We’ll start with an image shown at the end of the previous section.

-
-

3.0.1 Blur

+
+

3.0.1 Blur

Blurring helps to make noise less noisy so that the overall structure of an @@ -493,8 +498,8 @@ cv2.imwrite("resources/examples/example-table-blur

-
-

3.0.2 Threshold

+
+

3.0.2 Threshold

We’ve got a bunch of pixels that are gray. Thresholding will turn them all @@ -533,8 +538,8 @@ cv2.imwrite("resources/examples/example-table-thre

-
-

3.0.3 Finding the vertical and horizontal lines of the table

+
+

3.0.3 Finding the vertical and horizontal lines of the table

Note: There’s a wierd issue with the results of the example below when it’s @@ -574,8 +579,8 @@ cv2.imwrite("resources/examples/example-table-line

-
-

3.0.4 Finding the contours

+
+

3.0.4 Finding the contours

Blurring and thresholding allow us to find the lines. Opening the lines allows @@ -656,8 +661,8 @@ above/below certain sizes.

-
-

3.0.5 Sorting the bounding rectangles

+
+

3.0.5 Sorting the bounding rectangles

We want to process these from left-to-right, top-to-bottom. @@ -857,11 +862,17 @@ cv2.imwrite("resources/examples/example-table-cell "resources/examples/example-table-cell-1-1.png"

+ + +
+

example-table-cell-1-1.png +

+
-
-

3.0.6 Cropping each cell to the text

+
+

3.0.6 Cropping each cell to the text

OCR with Tesseract works best when there is about 10 pixels of white border @@ -914,8 +925,8 @@ cv2.imwrite("resources/examples/example-table-cell

-
-

3.0.7 OCR each cell

+
+

3.0.7 OCR each cell

If we cleaned up the images well enough, we might get some accurate OCR! @@ -961,6 +972,7 @@ period into a comma, then you might need to do some custom Tesseract training.

import pytesseract
+import cv2
 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
 <<ocr-image>>
 ocr_image(image, "--psm 7")
@@ -974,8 +986,8 @@ ocr_image(image, "--psm 7")
 
-
-

4 Files

+
+

4 Files

@@ -983,8 +995,8 @@ ocr_image(image, "--psm 7")
 
-
-

4.1 setup.py

+
+

4.1 setup.py

import setuptools
@@ -1014,12 +1026,12 @@ setuptools.setup(
 
-
-

4.2 table_image_ocr

+
+

4.2 table_image_ocr

-
-

4.2.1 table_image_ocr/__init__.py

+
+

4.2.1 table_image_ocr/__init__.py

@@ -1028,8 +1040,8 @@ setuptools.setup(
 
-
-

4.2.2 table_image_ocr/util.py

+
+

4.2.2 table_image_ocr/util.py

from contextlib import contextmanager
@@ -1073,8 +1085,8 @@ setuptools.setup(
 
-
-

4.2.3 table_image_ocr/prepare_pdfs.py

+
+

4.2.3 table_image_ocr/prepare_pdfs.py

Takes a variable number of pdf files and creates images out of each page of the @@ -1190,8 +1202,8 @@ parser.add_argument("files", nargs=

-
-

4.2.4 table_image_ocr/extract_tables.py

+
+

4.2.4 table_image_ocr/extract_tables.py

. ~/.virtualenvs/lotto_odds/bin/activate
@@ -1284,8 +1296,8 @@ parser.add_argument("files", nargs=
 
-
-

4.2.5 table_image_ocr/extract_cells_from_table.py

+
+

4.2.5 table_image_ocr/extract_cells_from_table.py

. ~/.virtualenvs/lotto_odds/bin/activate
@@ -1420,8 +1432,8 @@ python -m pdf.extract_cells_from_table "resources/
 
-
-

5 Utils

+
+

5 Utils

The following code lets us specify a size for images when they are exported to @@ -1448,9 +1460,12 @@ with advice-add.

-
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
+
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
 
+

+ +

(defun remove-attributes-from-src-block-result (&rest args)
@@ -1473,7 +1488,7 @@ with advice-add.
 

Author: Eric Ihli

-

Created: 2020-04-10 Fri 13:49

+

Created: 2020-04-10 Fri 14:10

diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index b5997bb..b8e1f79 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -164,6 +164,10 @@ cv2.imwrite("resources/examples/example-table.png", image) "resources/examples/example-table.png" #+END_SRC +#+RESULTS: +#+ATTR_HTML: :width 500px :height 100% +[[file:resources/examples/example-table.png]] + * OCR tables Find the bounding box of each cell in the table. Run tesseract on each cell. @@ -437,6 +441,10 @@ cv2.imwrite("resources/examples/example-table-cell-1-1.png", cell_images_rows[1] "resources/examples/example-table-cell-1-1.png" #+END_SRC +#+RESULTS: +#+ATTR_HTML: :width 200px :height 100% +[[file:resources/examples/example-table-cell-1-1.png]] + *** Cropping each cell to the text OCR with Tesseract works best when there is about 10 pixels of white border @@ -505,6 +513,7 @@ def ocr_image(image, config): #+BEGIN_SRC python :noweb no-export :exports both import pytesseract +import cv2 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) <> ocr_image(image, "--psm 7") @@ -522,8 +531,6 @@ ocr_image(image, "--psm 7") #+END_SRC -#+RESULTS: - ** setup.py #+BEGIN_SRC python :tangle setup.py :results none import setuptools @@ -753,10 +760,14 @@ we can define a function to do some cleanup and then add it as a before hook with ~advice-add~. #+NAME: html-image-size -#+BEGIN_SRC emacs-lisp :var text="" :var width="100%" :var height="100%" :results none +#+BEGIN_SRC emacs-lisp :var text="" :var width="100%" :var height="100%" :results raw :export code (concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]") #+END_SRC +#+RESULTS: html-image-size +#+ATTR_HTML: :width 100% :height 100% +[[file:]] + #+BEGIN_SRC emacs-lisp :results none (defun remove-attributes-from-src-block-result (&rest args) (let ((location (org-babel-where-is-src-block-result)) diff --git a/resources/examples/cells/000-002.png b/resources/examples/cells/000-002.png index ae1793e..b1f414c 100644 Binary files a/resources/examples/cells/000-002.png and b/resources/examples/cells/000-002.png differ diff --git a/resources/examples/cells/001-002.png b/resources/examples/cells/001-002.png index 9e15547..8cc2d9b 100644 Binary files a/resources/examples/cells/001-002.png and b/resources/examples/cells/001-002.png differ diff --git a/resources/examples/cells/002-002.png b/resources/examples/cells/002-002.png index 7b8c2ae..e5ace35 100644 Binary files a/resources/examples/cells/002-002.png and b/resources/examples/cells/002-002.png differ diff --git a/resources/examples/cells/003-002.png b/resources/examples/cells/003-002.png index 8c944c2..98f9661 100644 Binary files a/resources/examples/cells/003-002.png and b/resources/examples/cells/003-002.png differ diff --git a/resources/examples/cells/004-001.png b/resources/examples/cells/004-001.png index 15baa30..3faf3a1 100644 Binary files a/resources/examples/cells/004-001.png and b/resources/examples/cells/004-001.png differ diff --git a/resources/examples/cells/004-002.png b/resources/examples/cells/004-002.png index f35d077..a5475fc 100644 Binary files a/resources/examples/cells/004-002.png and b/resources/examples/cells/004-002.png differ diff --git a/resources/examples/cells/005-002.png b/resources/examples/cells/005-002.png index 9342660..b39d58a 100644 Binary files a/resources/examples/cells/005-002.png and b/resources/examples/cells/005-002.png differ diff --git a/resources/examples/cells/006-002.png b/resources/examples/cells/006-002.png index 7457910..6af7bfd 100644 Binary files a/resources/examples/cells/006-002.png and b/resources/examples/cells/006-002.png differ diff --git a/resources/examples/cells/007-002.png b/resources/examples/cells/007-002.png index 6f484b6..7b94d5f 100644 Binary files a/resources/examples/cells/007-002.png and b/resources/examples/cells/007-002.png differ diff --git a/resources/examples/cells/008-002.png b/resources/examples/cells/008-002.png index e4d4535..cee114d 100644 Binary files a/resources/examples/cells/008-002.png and b/resources/examples/cells/008-002.png differ diff --git a/resources/examples/cells/009-002.png b/resources/examples/cells/009-002.png index ec2921f..285d88c 100644 Binary files a/resources/examples/cells/009-002.png and b/resources/examples/cells/009-002.png differ diff --git a/resources/examples/cells/012-000.png b/resources/examples/cells/012-000.png index 43d7b35..40138c2 100644 Binary files a/resources/examples/cells/012-000.png and b/resources/examples/cells/012-000.png differ diff --git a/resources/examples/cells/013-000.png b/resources/examples/cells/013-000.png index 1738f61..4f90a23 100644 Binary files a/resources/examples/cells/013-000.png and b/resources/examples/cells/013-000.png differ diff --git a/resources/examples/cells/014-000.png b/resources/examples/cells/014-000.png index 3c0f622..c1882cd 100644 Binary files a/resources/examples/cells/014-000.png and b/resources/examples/cells/014-000.png differ diff --git a/resources/examples/cells/014-001.png b/resources/examples/cells/014-001.png index b1be395..4c3ace7 100644 Binary files a/resources/examples/cells/014-001.png and b/resources/examples/cells/014-001.png differ diff --git a/resources/examples/example-page-table-000.png b/resources/examples/example-page-table-000.png index 2830fbc..7b63856 100644 Binary files a/resources/examples/example-page-table-000.png and b/resources/examples/example-page-table-000.png differ diff --git a/resources/examples/example-table-cells-numbered.png b/resources/examples/example-table-cells-numbered.png index 9607f75..982a15b 100644 Binary files a/resources/examples/example-table-cells-numbered.png and b/resources/examples/example-table-cells-numbered.png differ diff --git a/resources/examples/example-table-lines.png b/resources/examples/example-table-lines.png index e331b81..15b2d2d 100644 Binary files a/resources/examples/example-table-lines.png and b/resources/examples/example-table-lines.png differ diff --git a/resources/examples/example-table-thresholded.png b/resources/examples/example-table-thresholded.png index 7ccb27b..f0829d6 100644 Binary files a/resources/examples/example-table-thresholded.png and b/resources/examples/example-table-thresholded.png differ