diff --git a/pdf_table_extraction_and_ocr.html b/pdf_table_extraction_and_ocr.html index 83dcee6..703f770 100644 --- a/pdf_table_extraction_and_ocr.html +++ b/pdf_table_extraction_and_ocr.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
- +
Not all pdfs need to be sent through OCR to extract the text content. If you can
@@ -284,7 +284,7 @@ This code calls out to
-
Tesseract can detect orientation and we can then use ImageMagick’s mogrify to
@@ -347,7 +347,7 @@ Script confidence: 2.44
This answer from opencv.org was heavily referenced while writing the code around
@@ -435,11 +435,16 @@ cv2.imwrite("resources/examples/example-table.png"
"resources/examples/example-table.png"
+
Find the bounding box of each cell in the table. Run tesseract on each cell.
@@ -451,8 +456,8 @@ We’ll start with an image shown at the end of the previous section.
Blurring helps to make noise less noisy so that the overall structure of an
@@ -493,8 +498,8 @@ cv2.imwrite("resources/examples/example-table-blur
We’ve got a bunch of pixels that are gray. Thresholding will turn them all
@@ -533,8 +538,8 @@ cv2.imwrite("resources/examples/example-table-thre
Note: There’s a weird issue with the results of the example below when it’s
@@ -574,8 +579,8 @@ cv2.imwrite("resources/examples/example-table-line
Blurring and thresholding allow us to find the lines. Opening the lines allows
@@ -656,8 +661,8 @@ above/below certain sizes.
We want to process these from left-to-right, top-to-bottom.
@@ -857,11 +862,17 @@ cv2.imwrite("resources/examples/example-table-cell
"resources/examples/example-table-cell-1-1.png"
+
OCR with Tesseract works best when there are about 10 pixels of white border
@@ -914,8 +925,8 @@ cv2.imwrite("resources/examples/example-table-cell
If we cleaned up the images well enough, we might get some accurate OCR!
@@ -961,6 +972,7 @@ period into a comma, then you might need to do some custom Tesseract training.
Takes a variable number of pdf files and creates images out of each page of the
@@ -1190,8 +1202,8 @@ parser.add_argument("files", nargs=
The following code lets us specify a size for images when they are exported to
@@ -1448,9 +1460,12 @@ with Created: 2020-04-10 Fri 13:49 Created: 2020-04-10 Fri 14:10def pdf_to_images(pdf_filepath):
+
def pdf_to_images(pdf_filepath):
"""
Turn a pdf into images
"""
@@ -323,8 +323,8 @@ This code calls out to
-
1.2 Detecting image orientation and applying rotation.
+1.2 Detecting image orientation and applying rotation.
def preprocess_img(filepath):
+
def preprocess_img(filepath):
"""
Processing that involves running shell executables,
like mogrify to rotate.
@@ -376,8 +376,8 @@ Script confidence: 2.44
2 Detecting tables
+2 Detecting tables
3 OCR tables
+3 OCR tables
3.0.1 Blur
+3.0.1 Blur
3.0.2 Threshold
+3.0.2 Threshold
3.0.3 Finding the vertical and horizontal lines of the table
+3.0.3 Finding the vertical and horizontal lines of the table
3.0.4 Finding the contours
+3.0.4 Finding the contours
3.0.5 Sorting the bounding rectangles
+3.0.5 Sorting the bounding rectangles
3.0.6 Cropping each cell to the text
+3.0.6 Cropping each cell to the text
3.0.7 OCR each cell
+3.0.7 OCR each cell
import pytesseract
+import cv2
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
<<ocr-image>>
ocr_image(image, "--psm 7")
@@ -974,8 +986,8 @@ ocr_image(image, "--psm 7")
4 Files
+4 Files
@@ -983,8 +995,8 @@ ocr_image(image, "--psm 7")
4.1 setup.py
+4.1 setup.py
import setuptools
@@ -1014,12 +1026,12 @@ setuptools.setup(
4.2 table_image_ocr
+4.2 table_image_ocr
4.2.1 table_image_ocr/__init__.py
+4.2.1 table_image_ocr/__init__.py
@@ -1028,8 +1040,8 @@ setuptools.setup(
4.2.2 table_image_ocr/util.py
+4.2.2 table_image_ocr/util.py
from contextlib import contextmanager
@@ -1073,8 +1085,8 @@ setuptools.setup(
4.2.3 table_image_ocr/prepare_pdfs.py
+4.2.3 table_image_ocr/prepare_pdfs.py
4.2.4 table_image_ocr/extract_tables.py
+4.2.4 table_image_ocr/extract_tables.py
. ~/.virtualenvs/lotto_odds/bin/activate
@@ -1284,8 +1296,8 @@ parser.add_argument("files", nargs=
4.2.5 table_image_ocr/extract_cells_from_table.py
+4.2.5 table_image_ocr/extract_cells_from_table.py
. ~/.virtualenvs/lotto_odds/bin/activate
@@ -1420,8 +1432,8 @@ python -m pdf.extract_cells_from_table "resources/
5 Utils
+5 Utils
advice-add
.
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
+
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
(defun remove-attributes-from-src-block-result (&rest args)
@@ -1473,7 +1488,7 @@ with
advice-add
.