commit 28bcdbd4f7e3dbb363e7819c0c03d63d2ee24feb Author: Eric Ihli Date: Fri Apr 10 13:52:29 2020 -0700 Initial commit diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8aa2645 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/pdf_table_extraction_and_ocr.html b/pdf_table_extraction_and_ocr.html new file mode 100644 index 0000000..83dcee6 --- /dev/null +++ b/pdf_table_extraction_and_ocr.html @@ -0,0 +1,1479 @@ + + + + + + + +PDF Parsing + + + + + + +

PDF Parsing

+ + +

1 Preparing our data


1.1 Converting PDFs to images


+Not all pdfs need to be sent through OCR to extract the text content. If you can +click and drag to highlight text in the pdf, then the tools in this library +probably aren’t necessary. +

+ +

+This code calls out to pdfimages from Poppler. +

+ +
def pdf_to_images(pdf_filepath):
+    """
+    Turn a pdf into images
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    with working_dir(directory):
+        image_filenames = pdfimages(pdf_filepath)
+    # Since pdfimages creates a number of files named each for there page number
+    # and doesn't return us the list that it created
+    return [os.path.join(directory, f) for f in image_filenames]
+def pdfimages(pdf_filepath):
+    """
+    Uses the `pdfimages` utility from Poppler
+    ( Creates images out of each page. Images
+    are prefixed by their name sans extension and suffixed by their page number.
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    filename_sans_ext = filename.split(".pdf")[0]
+["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
+    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
+    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    return image_filenames
+def find_matching_files_in_dir(file_prefix, directory):
+    files = [
+        filename
+        for filename in os.listdir(directory)
+        if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename)
+    ]
+    return files
+ +

1.2 Detecting image orientation and applying rotation.


+Tesseract can detect orientation and we can then use ImageMagick’s mogrify to +rotate the image. +

+ +

+Here’s an example of the output we get from orientation detection with +Tesseract. +

+ +
+➜  example/ tesseract --psm 0 example-000.png -
+Page number: 0
+Orientation in degrees: 90
+Rotate: 270
+Orientation confidence: 26.86
+Script: Latin
+Script confidence: 2.44
+ +
def preprocess_img(filepath):
+    """
+    Processing that involves running shell executables,
+    like mogrify to rotate.
+    """
+    rotate = get_rotate(filepath)
+    logger.debug("Rotating {} by {}.".format(filepath, rotate))
+    mogrify(filepath, rotate)
+def get_rotate(image_filepath):
+    output = (
+        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
+        .decode("utf-8")
+        .split("\n")
+    )
+    output = next(l for l in output if "Rotate: " in l)
+    output = output.split(": ")[1]
+    return output
+def mogrify(image_filepath, rotate):
+["mogrify", "-rotate", rotate, image_filepath])
+ +

2 Detecting tables


+An external answer (its link did not survive in this export) was heavily referenced while writing the code around +table detection. +

+ +

+It’s much easier to OCR a table when the table is the only thing in the image. +This code detects tables in an image and returns a list of images of just the +tables, no surrounding text or noise. +

+ +

+The blurring, thresholding, and line detection are used here as well as later on +for cell extraction. They are good techniques for cleaning an image up in a way +that makes things like shape detection more accurate. +

+ +
def find_tables(image):
+    <<blur>>
+    <<threshold>>
+    <<lines-of-table>>
+    contours, heirarchy = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    MIN_TABLE_AREA = 1e5
+    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.1 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+    # The link where a lot of this code was borrowed from recommends an
+    # additional step to check the number of "joints" inside this bounding rectangle.
+    # A table should have a lot of intersections. We might have a rectangular image
+    # here though which would only have 4 intersections, 1 at each corner.
+    # Leaving that step as a future TODO if it is ever necessary.
+    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
+    return images
+ +
import cv2
+image_filename = "resources/examples/example-page.png"
+image = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE)
+image = find_tables(image)[0]
+cv2.imwrite("resources/examples/example-table.png", image)
+ +

3 OCR tables


+Find the bounding box of each cell in the table. Run tesseract on each cell. +Print a comma-separated output. +

+ +

+We’ll start with an image shown at the end of the previous section. +

+ +

3.0.1 Blur


+Blurring helps to make noise less noisy so that the overall structure of an +image is more detectable. +

+ +

+That gray row at the bottom is kind of noisy. If we don’t somehow clean it up, +OpenCV will detect all sorts of odd shapes in there and it will throw off our +cell detection. +

+ +

+Cleanup can be accomplished with a blur followed by some thresholding. +

+ +
+ +
image = ~cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE)
+cv2.imwrite("resources/examples/example-table-blurred.png", blurred)
+ + +

example-table-blurred.png +

+ +

3.0.2 Threshold


+We’ve got a bunch of pixels that are gray. Thresholding will turn them all +either black or white. Having all black or white pixels lets us do morphological +transformations like erosion and dilation. +

+ +
+img_bin = cv2.adaptiveThreshold(
+    ~blurred,
+ +
+cv2.imwrite("resources/examples/example-table-thresholded.png", img_bin)
+ + +

example-table-thresholded.png +

+ +

3.0.3 Finding the vertical and horizontal lines of the table


+Note: There’s a weird issue with the results of the example below when it’s +evaluated as part of an export or a full-buffer evaluation. If you evaluate the +example by itself, it looks the way it’s intended. If you evaluate it as part of +an entire buffer evaluation, it’s distorted. +

+ +
vertical = horizontal = img_bin.copy()
+SCALE = 5
+image_width, image_height = horizontal.shape
+horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+mask = horizontally_dilated + vertically_dilated
+ +
+cv2.imwrite("resources/examples/example-table-lines.png", mask)
+ + +

example-table-lines.png +

+ +

3.0.4 Finding the contours


+Blurring and thresholding allow us to find the lines. Opening the lines allows +us to find the contours. +

+ +

+An “Opening” is an erosion followed by a dilation. Great examples and +descriptions of each morphological operation can be found at + +

+ +

+Contours can be explained simply as a curve joining all the continuous points +(along the boundary), having same color or intensity. The contours are a useful +tool for shape analysis and object detection and recognition. +

+ +

+We can search those contours to find rectangles of certain size. +

+ +

+To do that, we can use OpenCV’s approxPolyDP function. It takes as arguments +the contour (list of contiguous points), and a number representing how different +the polygon perimeter length can be from the true perimeter length of the +contour. 0.1 (10%) seems to be a good value. The difference in perimeter +length between a 4-sided polygon and a 3-sided polygon is greater than 10% and +the difference between a 5+ sided polygon and a 4-sided polygon is less than +10%. So a 4-sided polygon is the polygon with the fewest sides that leaves the +difference in perimeter length within our 10% threshold. +

+ +

+Then we just get the bounding rectangle of that polygon and we have our cells! +

+ +

+We might need to do a little more filtering of those rectangles though. We might +have accidentally found some noise such as another image on the page or a title +header bar or something. If we know our cells are all within a certain size (by +area of pixels) then we can filter out the junk cells by removing cells +above/below certain sizes. +

+ +
contours, heirarchy = cv2.findContours(
+    mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
+perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+epsilons = [0.05 * p for p in perimeter_lengths]
+approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+# Filter out contours that aren't rectangular. Those that aren't rectangular
+# are probably noise.
+approx_rects = [p for p in approx_polys if len(p) == 4]
+bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+# Filter out rectangles that are too narrow or too short.
+bounding_rects = [
+    r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
+# The largest bounding rectangle is assumed to be the entire table.
+# Remove it from the list. We don't want to accidentally try to OCR
+# the entire table.
+largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+bounding_rects = [b for b in bounding_rects if b is not largest_rect]
+cells = [c for c in bounding_rects]
+ +

3.0.5 Sorting the bounding rectangles


+We want to process these from left-to-right, top-to-bottom. +

+ +

+I’ve thought of a straightforward algorithm for it, but it could probably be +made more efficient. +

+ +

+We’ll find the rectangle with the most top-left corner. Then we’ll find all +of the rectangles that have a center that is within the top-y and bottom-y +values of that top-left rectangle. Then we’ll sort those rectangles by the x +value of their center. We’ll remove those rectangles from the list and repeat. +

+ +
def cell_in_same_row(c1, c2):
+    c1_center = c1[1] + c1[3] - c1[3] / 2
+    c2_bottom = c2[1] + c2[3]
+    c2_top = c2[1]
+    return c2_top < c1_center < c2_bottom
+orig_cells = [c for c in cells]
+rows = []
+while cells:
+    first = cells[0]
+    rest = cells[1:]
+    cells_in_same_row = sorted(
+        [
+            c for c in rest
+            if cell_in_same_row(c, first)
+        ],
+        key=lambda c: c[0]
+    )
+    row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
+    rows.append(row_cells)
+    cells = [
+        c for c in rest
+        if not cell_in_same_row(c, first)
+    ]
+# Sort rows by average height of their center.
+def avg_height_of_center(row):
+    centers = [y + h - h / 2 for x, y, w, h in row]
+    return sum(centers) / len(centers)
+ +

+To test if this code works, let’s try sorting the bounding rectangles and +numbering them from left to right, top to bottom. +

+ +
import cv2
+image = cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE)
+FONT_COLOR = (127, 127, 127)
+for i, row in enumerate(rows):
+    for j, cell in enumerate(row):
+        x, y, w, h = cell
+        cv2.putText(
+            image,
+            "{},{}".format(i, j),
+            (int(x + w - w / 2), int(y + h - h / 2)),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            FONT_SCALE,
+            FONT_COLOR,
+            2,
+        )
+cv2.imwrite("resources/examples/example-table-cells-numbered.png", image)
+ + +

example-table-cells-numbered.png +

+ +
def extract_cell_images_from_table(image):
+    BLUR_KERNEL_SIZE = (17, 17)
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+    )
+    vertical = horizontal = img_bin.copy()
+    SCALE = 5
+    image_width, image_height = horizontal.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+    mask = horizontally_dilated + vertically_dilated
+    contours, heirarchy = cv2.findContours(
+        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.05 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    # Filter out contours that aren't rectangular. Those that aren't rectangular
+    # are probably noise.
+    approx_rects = [p for p in approx_polys if len(p) == 4]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+    # Filter out rectangles that are too narrow or too short.
+    MIN_RECT_WIDTH = 40
+    bounding_rects = [
+        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
+    ]
+    # The largest bounding rectangle is assumed to be the entire table.
+    # Remove it from the list. We don't want to accidentally try to OCR
+    # the entire table.
+    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+    bounding_rects = [b for b in bounding_rects if b is not largest_rect]
+    cells = [c for c in bounding_rects]
+    def cell_in_same_row(c1, c2):
+        c1_center = c1[1] + c1[3] - c1[3] / 2
+        c2_bottom = c2[1] + c2[3]
+        c2_top = c2[1]
+        return c2_top < c1_center < c2_bottom
+    orig_cells = [c for c in cells]
+    rows = []
+    while cells:
+        first = cells[0]
+        rest = cells[1:]
+        cells_in_same_row = sorted(
+            [
+                c for c in rest
+                if cell_in_same_row(c, first)
+            ],
+            key=lambda c: c[0]
+        )
+        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
+        rows.append(row_cells)
+        cells = [
+            c for c in rest
+            if not cell_in_same_row(c, first)
+        ]
+    # Sort rows by average height of their center.
+    def avg_height_of_center(row):
+        centers = [y + h - h / 2 for x, y, w, h in row]
+        return sum(centers) / len(centers)
+    rows.sort(key=avg_height_of_center)
+    cell_images_rows = []
+    for row in rows:
+        cell_images_row = []
+        for x, y, w, h in row:
+            cell_images_row.append(image[y:y+h, x:x+w])
+        cell_images_rows.append(cell_images_row)
+    return cell_images_rows
+ +
+image = cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE)
+cell_images_rows = extract_cell_images_from_table(image)
+cv2.imwrite("resources/examples/example-table-cell-1-1.png", cell_images_rows[1][1])
+ +

3.0.6 Cropping each cell to the text


+OCR with Tesseract works best when there are about 10 pixels of white border +around the text. +

+ +

+Our bounding rectangles may have picked up some stray pixels from the horizontal +and vertical lines of the cells in the table. It’s probably just a few pixels, +much fewer than the width of the text. If that’s the case, then we can remove +that noise with a simple open morph. +

+ +

+Once the stray border pixels have been removed, we can expand our border using +copyMakeBorder. +

+ +
def crop_to_text(image):
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
+    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    bounding_rects = [cv2.boundingRect(c) for c in contours]
+    # The largest contour is certainly the text that we're looking for.
+    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+    x, y, w, h = largest_rect
+    cropped = image[y:y+h, x:x+w]
+    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
+    return bordered
+ +
import cv2
+image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
+image = crop_to_text(image)
+cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
+ + +

example-table-cell-1-1-cropped.png +

+ +

3.0.7 OCR each cell


+If we cleaned up the images well enough, we might get some accurate OCR! +

+ +

+There is plenty that could have gone wrong along the way. +

+ +

+The first step to troubleshooting is to view the intermediate images and see if +there’s something about your image that is obviously abnormal, like some really +thick noise or a wrongly detected table. +

+ +

+If everything looks reasonable but the OCR is doing something like turning a +period into a comma, then you might need to do some custom Tesseract training. +

+ +
def crop_to_text(image):
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
+    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    bounding_rects = [cv2.boundingRect(c) for c in contours]
+    # The largest contour is certainly the text that we're looking for.
+    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+    x, y, w, h = largest_rect
+    cropped = image[y:y+h, x:x+w]
+    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
+    return bordered
+def ocr_image(image, config):
+    cropped = crop_to_text(image)
+    return pytesseract.image_to_string(
+        ~cropped,
+        config=config
+    )
+ +
import pytesseract
+image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
+ocr_image(image, "--psm 7")
+ +
+ +

4 Files

+ +


import setuptools
+with open("", "r") as fh:
+    long_description =
+    name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
+    version="0.0.1",
+    author="Example Author",
+    author_email="",
+    description="A small example package",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="",
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.6',
+ +

4.2 table_image_ocr


4.2.1 table_image_ocr/

+ +

4.2.2 table_image_ocr/

from contextlib import contextmanager
+import functools
+import logging
+import os
+import tempfile
+from bs4 import BeautifulSoup as bs
+import requests
+logger = get_logger()
+def working_dir(directory):
+    original_working_dir = os.getcwd()
+    try:
+        os.chdir(directory)
+        yield directory
+    finally:
+        os.chdir(original_working_dir)
+def download(url, filepath):
+    response = request_get(url)
+    data = response.content
+    with open(filepath, "wb") as f:
+        f.write(data)
+def make_tempdir(identifier):
+    return tempfile.mkdtemp(prefix="{}_".format(identifier))
+ +

4.2.3 table_image_ocr/


+Takes a variable number of pdf files and creates images out of each page of the +file using pdfimages from Poppler. Images are created in the same directory +that contains the pdf. +

+ +

+Prints each pdf followed by the images extracted from that pdf followed by a +blank line. +

+ +
python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
+ + +
import argparse
+import logging
+import os
+import re
+import subprocess
+import sys
+from pdf.util import request_get, working_dir, download, make_tempdir
+logger = get_logger()
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+def main(files):
+    pdf_images = []
+    for f in files:
+        pdf_images.append((f, pdf_to_images(f)))
+    for pdf, images in pdf_images:
+        for image in images:
+            preprocess_img(image)
+    for pdf, images in pdf_images:
+        print("{}\n{}\n".format(pdf, "\n".join(images)))
+def pdf_to_images(pdf_filepath):
+    """
+    Turn a pdf into images
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    with working_dir(directory):
+        image_filenames = pdfimages(pdf_filepath)
+    # Since pdfimages creates a number of files named each for there page number
+    # and doesn't return us the list that it created
+    return [os.path.join(directory, f) for f in image_filenames]
+def pdfimages(pdf_filepath):
+    """
+    Uses the `pdfimages` utility from Poppler
+    ( Creates images out of each page. Images
+    are prefixed by their name sans extension and suffixed by their page number.
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    filename_sans_ext = filename.split(".pdf")[0]
+["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
+    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
+    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    return image_filenames
+def find_matching_files_in_dir(file_prefix, directory):
+    files = [
+        filename
+        for filename in os.listdir(directory)
+        if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename)
+    ]
+    return files
+def preprocess_img(filepath):
+    """
+    Processing that involves running shell executables,
+    like mogrify to rotate.
+    """
+    rotate = get_rotate(filepath)
+    logger.debug("Rotating {} by {}.".format(filepath, rotate))
+    mogrify(filepath, rotate)
+def get_rotate(image_filepath):
+    output = (
+        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
+        .decode("utf-8")
+        .split("\n")
+    )
+    output = next(l for l in output if "Rotate: " in l)
+    output = output.split(": ")[1]
+    return output
+def mogrify(image_filepath, rotate):
+["mogrify", "-rotate", rotate, image_filepath])
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
+ +

4.2.4 table_image_ocr/

. ~/.virtualenvs/lotto_odds/bin/activate
+python -m pdf.extract_tables "resources/examples/example-page.png"
+ +
import argparse
+import os
+import cv2
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+def main(files):
+    results = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+        tables = find_tables(image)
+        files = []
+        for i, table in enumerate(tables):
+            filename_sans_extension = os.path.splitext(filename)[0]
+            table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i)
+            table_filepath = os.path.join(directory, table_filename)
+            files.append(table_filepath)
+            cv2.imwrite(table_filepath, table)
+        results.append((f, files))
+    for image_filename, table_filenames in results:
+        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))
+def find_tables(image):
+    BLUR_KERNEL_SIZE = (17, 17)
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+    )
+    vertical = horizontal = img_bin.copy()
+    SCALE = 5
+    image_width, image_height = horizontal.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+    mask = horizontally_dilated + vertically_dilated
+    contours, heirarchy = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    MIN_TABLE_AREA = 1e5
+    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.1 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+    # The link where a lot of this code was borrowed from recommends an
+    # additional step to check the number of "joints" inside this bounding rectangle.
+    # A table should have a lot of intersections. We might have a rectangular image
+    # here though which would only have 4 intersections, 1 at each corner.
+    # Leaving that step as a future TODO if it is ever necessary.
+    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
+    return images
+if __name__ == "__main__":
+    args = parser.parse_args()
+    files = args.files
+    main(files)
+ +

4.2.5 table_image_ocr/

. ~/.virtualenvs/lotto_odds/bin/activate
+python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
+ +
import os
+import sys
+import cv2
+import pytesseract
+def main(f):
+    results = []
+    directory, filename = os.path.split(f)
+    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+    rows = extract_cell_images_from_table(table)
+    cell_img_dir = os.path.join(directory, "cells")
+    os.makedirs(cell_img_dir, exist_ok=True)
+    for i, row in enumerate(rows):
+        for j, cell in enumerate(row):
+            cell_filename = "{:03d}-{:03d}.png".format(i, j)
+            path = os.path.join(cell_img_dir, cell_filename)
+            cv2.imwrite(path, cell)
+            print(cell_filename)
+def extract_cell_images_from_table(image):
+    BLUR_KERNEL_SIZE = (17, 17)
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+    )
+    vertical = horizontal = img_bin.copy()
+    SCALE = 5
+    image_width, image_height = horizontal.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+    mask = horizontally_dilated + vertically_dilated
+    contours, heirarchy = cv2.findContours(
+        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.05 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    # Filter out contours that aren't rectangular. Those that aren't rectangular
+    # are probably noise.
+    approx_rects = [p for p in approx_polys if len(p) == 4]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+    # Filter out rectangles that are too narrow or too short.
+    MIN_RECT_WIDTH = 40
+    bounding_rects = [
+        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
+    ]
+    # The largest bounding rectangle is assumed to be the entire table.
+    # Remove it from the list. We don't want to accidentally try to OCR
+    # the entire table.
+    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+    bounding_rects = [b for b in bounding_rects if b is not largest_rect]
+    cells = [c for c in bounding_rects]
+    def cell_in_same_row(c1, c2):
+        c1_center = c1[1] + c1[3] - c1[3] / 2
+        c2_bottom = c2[1] + c2[3]
+        c2_top = c2[1]
+        return c2_top < c1_center < c2_bottom
+    orig_cells = [c for c in cells]
+    rows = []
+    while cells:
+        first = cells[0]
+        rest = cells[1:]
+        cells_in_same_row = sorted(
+            [
+                c for c in rest
+                if cell_in_same_row(c, first)
+            ],
+            key=lambda c: c[0]
+        )
+        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
+        rows.append(row_cells)
+        cells = [
+            c for c in rest
+            if not cell_in_same_row(c, first)
+        ]
+    # Sort rows by average height of their center.
+    def avg_height_of_center(row):
+        centers = [y + h - h / 2 for x, y, w, h in row]
+        return sum(centers) / len(centers)
+    rows.sort(key=avg_height_of_center)
+    cell_images_rows = []
+    for row in rows:
+        cell_images_row = []
+        for x, y, w, h in row:
+            cell_images_row.append(image[y:y+h, x:x+w])
+        cell_images_rows.append(cell_images_row)
+    return cell_images_rows
+if __name__ == "__main__":
+    main(sys.argv[1])
+ +

5 Utils


+The following code lets us specify a size for images when they are exported to +html. +

+ +

+Org supports specifying an export size for an image by putting the #+ATTR_HTML: +:width 100px before the image. But since our images are in a results drawer, we +need a way for our results drawer to do that for us automatically. +

+ +

+Adding #+ATTR_HTML after the beginning of the result block introduces a new +problem. Org-babel no longer recognizes the result as a result block and doesn’t +remove it when a src block is re-evaluated, so we end up just appending new +results on each evaluation. +

+ +

+There is nothing configurable that will tell org-babel to remove our line. But +we can define a function to do some cleanup and then add it as a before hook +with advice-add. +

+ +
;; Build the Org markup for a sized image: an #+ATTR_HTML line carrying `width' and `height', a newline, then a file link to `text'.
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
+ +
(defun remove-attributes-from-src-block-result (&rest _args)
+  "Remove #+ATTR lines directly below the current src block's result header.
+Intended as `:before' advice: whatever arguments the advised function
+receives are accepted as _ARGS and ignored.  Does nothing when
+`org-babel-where-is-src-block-result' returns nil or when the text at
+that position does not match `org-babel-result-regexp'."
+  (let ((location (org-babel-where-is-src-block-result))
+        ;; Optional indentation (spaces or tabs) followed by an #+ATTR keyword.
+        (attr-regexp "[ \t]*#\\+ATTR.*$"))
+    (when location
+      (save-excursion
+        (goto-char location)
+        (when (looking-at (concat org-babel-result-regexp ".*$"))
+          ;; `forward-line' always lands at the beginning of the next line;
+          ;; `next-line' is an interactive command that tracks a goal column
+          ;; and visual lines.
+          (forward-line 1)
+          (while (looking-at attr-regexp)
+            ;; Delete rather than kill so the user's kill ring is untouched.
+            (delete-region (line-beginning-position)
+                           (line-beginning-position 2))))))))
+(dolist (advised-fn '(org-babel-remove-result org-babel-execute-src-block))
+  ;; Run the cleanup before a result is removed and before a block is executed.
+  (advice-add advised-fn :before #'remove-attributes-from-src-block-result))

Author: Eric Ihli


Created: 2020-04-10 Fri 13:49

+ + diff --git a/ b/ new file mode 100644 index 0000000..b5997bb --- /dev/null +++ b/ @@ -0,0 +1,775 @@ +# -*- org-image-actual-width: 500; -*- + +#+TITLE: PDF Parsing +#+PROPERTY: header-args :session *Python* +#+STARTUP: inlineimages +#+OPTIONS: ^:nil + +#+BEGIN_COMMENT +Some notes about the header for those not familiar with Org Mode: + +The property `header-args` with ~:session \*Python\*~ will cause all evaluated +source code blocks to be evaluated in the buffer named "\*Python\*", which is the +default buffer name for the buffer connected to an inferior python process. This +is useful for interactive development. It gives you a REPL to work with rather +than having to constantly evaluate source code blocks and view the results +output to try any change. + +Another note along those lines is that when source code blocks are evaluated, +some unnecessary output is printed in the ~*Python*~ buffer. Adding ~:results +output~ to a code block will minimize that noise. +#+END_COMMENT + +* Preparing our data +** Converting PDFs to images + +Not all pdfs need to be sent through OCR to extract the text content. If you can +click and drag to highlight text in the pdf, then the tools in this library +probably aren't necessary. + +This code calls out to [[][pdfimages]] from [[][Poppler]]. + +#+NAME: pdf-to-images +#+BEGIN_SRC python :results none +def pdf_to_images(pdf_filepath): + """ + Turn a pdf into images + """ + directory, filename = os.path.split(pdf_filepath) + with working_dir(directory): + image_filenames = pdfimages(pdf_filepath) + + # Since pdfimages creates a number of files named each for there page number + # and doesn't return us the list that it created + return [os.path.join(directory, f) for f in image_filenames] + + +def pdfimages(pdf_filepath): + """ + Uses the `pdfimages` utility from Poppler + ( Creates images out of each page. Images + are prefixed by their name sans extension and suffixed by their page number. 
+ """ + directory, filename = os.path.split(pdf_filepath) + filename_sans_ext = filename.split(".pdf")[0] +["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]]) + image_filenames = find_matching_files_in_dir(filename_sans_ext, directory) + logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))) + return image_filenames + + +def find_matching_files_in_dir(file_prefix, directory): + files = [ + filename + for filename in os.listdir(directory) + if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename) + ] + return files +#+END_SRC + +** Detecting image orientation and applying rotation. + +Tesseract can detect orientation and we can then use [[][ImageMagick's mogrify]] to +rotate the image. + +Here's an example of the output we get from orientation detection with +Tesseract. + +#+BEGIN_EXAMPLE +➜ example/ tesseract --psm 0 example-000.png - +Page number: 0 +Orientation in degrees: 90 +Rotate: 270 +Orientation confidence: 26.86 +Script: Latin +Script confidence: 2.44 +#+END_EXAMPLE + +#+NAME: fix-orientation +#+BEGIN_SRC python :results none +def preprocess_img(filepath): + """ + Processing that involves running shell executables, + like mogrify to rotate. + """ + rotate = get_rotate(filepath) + logger.debug("Rotating {} by {}.".format(filepath, rotate)) + mogrify(filepath, rotate) + + +def get_rotate(image_filepath): + output = ( + subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"]) + .decode("utf-8") + .split("\n") + ) + output = next(l for l in output if "Rotate: " in l) + output = output.split(": ")[1] + return output + + +def mogrify(image_filepath, rotate): +["mogrify", "-rotate", rotate, image_filepath]) +#+END_SRC + +* Detecting tables + +This answer from was heavily referenced while writing the code around +table detection: + + +It's much easier to OCR a table when the table is the only thing in the image. 
+This code detects tables in an image and returns a list of images of just the +tables, no surrounding text or noise. + +The blurring, thresholding, and line detection is used here as well as later on +for cell extraction. They are good techniques for cleaning an image up in a way +that makes things like shape detection more accurate. + +#+BEGIN_SRC python :noweb-ref detect-table :results none :noweb no-export +def find_tables(image): + <> + <> + <> + contours, heirarchy = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, + ) + + MIN_TABLE_AREA = 1e5 + contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA] + perimeter_lengths = [cv2.arcLength(c, True) for c in contours] + epsilons = [0.1 * p for p in perimeter_lengths] + approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)] + bounding_rects = [cv2.boundingRect(a) for a in approx_polys] + + # The link where a lot of this code was borrowed from recommends an + # additional step to check the number of "joints" inside this bounding rectangle. + # A table should have a lot of intersections. We might have a rectangular image + # here though which would only have 4 intersections, 1 at each corner. + # Leaving that step as a future TODO if it is ever necessary. + images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects] + return images +#+END_SRC + +#+HEADER: :post html-image-size(text=*this*, width="500px") +#+BEGIN_SRC python :noweb-ref test-detect-table :noweb no-export :results raw +import cv2 + +<> + +image_filename = "resources/examples/example-page.png" +image = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE) +image = find_tables(image)[0] +cv2.imwrite("resources/examples/example-table.png", image) +"resources/examples/example-table.png" +#+END_SRC + +* OCR tables + +Find the bounding box of each cell in the table. Run tesseract on each cell. +Print a comma seperated output. + +We'll start with an image shown at the end of the previous section. 
+ +*** Blur + +Blurring helps to make noise less noisy so that the overall structure of an +image is more detectable. + +That gray row at the bottom is kind of noisy. If we don't somehow clean it up, +OpenCV will detect all sorts of odd shapes in there and it will throw off our +cell detection. + +Cleanup can be accomplished with a blur followed by some thresholding. + +#+BEGIN_SRC python :noweb-ref blur :results none +BLUR_KERNEL_SIZE = (17, 17) +STD_DEV_X_DIRECTION = 0 +STD_DEV_Y_DIRECTION = 0 +blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION) +#+END_SRC + +#+HEADER: :post html-image-size(text=*this*, width="500px") +#+BEGIN_SRC python :noweb no-export :results raw :exports both +image = ~cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE) +<> +cv2.imwrite("resources/examples/example-table-blurred.png", blurred) +"resources/examples/example-table-blurred.png" +#+END_SRC + +#+RESULTS: +#+ATTR_HTML: :width 500px :height 100% +[[file:resources/examples/example-table-blurred.png]] + +*** Threshold + +We've got a bunch of pixels that are gray. Thresholding will turn them all +either black or white. Having all black or white pixels lets us do morphological +transformations like erosion and dilation. 
+ +#+BEGIN_SRC python :noweb-ref threshold :results none +MAX_COLOR_VAL = 255 +BLOCK_SIZE = 15 +SUBTRACT_FROM_MEAN = -2 + +img_bin = cv2.adaptiveThreshold( + ~blurred, + MAX_COLOR_VAL, + cv2.ADAPTIVE_THRESH_MEAN_C, + cv2.THRESH_BINARY, + BLOCK_SIZE, + SUBTRACT_FROM_MEAN, +) +#+END_SRC + +#+HEADER: :post html-image-size(text=*this*, width="500px") +#+BEGIN_SRC python :noweb no-export :results raw :exports both +<> +cv2.imwrite("resources/examples/example-table-thresholded.png", img_bin) +"resources/examples/example-table-thresholded.png" +#+END_SRC + +#+RESULTS: +#+ATTR_HTML: :width 500px :height 100% +[[file:resources/examples/example-table-thresholded.png]] + +*** Finding the vertical and horizontal lines of the table + +Note: There's a weird issue with the results of the example below when it's +evaluated as part of an export or a full-buffer evaluation. If you evaluate the +example by itself, it looks the way it's intended. If you evaluate it as part of +an entire buffer evaluation, it's distorted. 
+ +#+BEGIN_SRC python :noweb-ref lines-of-table :results none +vertical = horizontal = img_bin.copy() +SCALE = 5 +image_width, image_height = horizontal.shape +horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1)) +horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel) +vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE))) +vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel) + +horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))) +vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))) + +mask = horizontally_dilated + vertically_dilated +#+END_SRC + +#+HEADER: :post html-image-size(text=*this*, width="500px") +#+BEGIN_SRC python :noweb no-export :results raw :exports both +<> +cv2.imwrite("resources/examples/example-table-lines.png", mask) +"resources/examples/example-table-lines.png" +#+END_SRC + +#+RESULTS: +#+ATTR_HTML: :width 500px :height 100% +[[file:resources/examples/example-table-lines.png]] + +*** Finding the contours + +Blurring and thresholding allow us to find the lines. Opening the lines allows +us to find the contours. + +An "Opening" is an erosion followed by a dilation. Great examples and +descriptions of each morphological operation can be found at +[[][]]. + +#+BEGIN_QUOTE +Contours can be explained simply as a curve joining all the continuous points +(along the boundary), having same color or intensity. The contours are a useful +tool for shape analysis and object detection and recognition. +#+END_QUOTE + +We can search those contours to find rectangles of a certain size. + +To do that, we can use OpenCV's ~approxPolyDP~ function. It takes as arguments +the contour (list of contiguous points), and a number representing how different +the polygon perimeter length can be from the true perimeter length of the +contour. 
~0.1~ (10%) seems to be a good value. The difference in perimeter +length between a 4-sided polygon and a 3-sided polygon is greater than 10% and +the difference between a 5+ sided polygon and a 4-sided polygon is less than +10%. So a 4-sided polygon is the polygon with the fewest sides that leaves the +difference in perimeter length within our 10% threshold. + +Then we just get the bounding rectangle of that polygon and we have our cells! + +We might need to do a little more filtering of those rectangles though. We might +have accidentally found some noise such as another image on the page or a title +header bar or something. If we know our cells are all within a certain size (by +area of pixels) then we can filter out the junk cells by removing cells +above/below certain sizes. + +#+BEGIN_SRC python :noweb-ref bounding-rects :results none +contours, heirarchy = cv2.findContours( + mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE, +) + +perimeter_lengths = [cv2.arcLength(c, True) for c in contours] +epsilons = [0.05 * p for p in perimeter_lengths] +approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)] + +# Filter out contours that aren't rectangular. Those that aren't rectangular +# are probably noise. +approx_rects = [p for p in approx_polys if len(p) == 4] +bounding_rects = [cv2.boundingRect(a) for a in approx_polys] + +# Filter out rectangles that are too narrow or too short. +MIN_RECT_WIDTH = 40 +MIN_RECT_HEIGHT = 10 +bounding_rects = [ + r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3] +] + +# The largest bounding rectangle is assumed to be the entire table. +# Remove it from the list. We don't want to accidentally try to OCR +# the entire table. 
+largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) +bounding_rects = [b for b in bounding_rects if b is not largest_rect] + +cells = [c for c in bounding_rects] +#+END_SRC + +*** Sorting the bounding rectangles + +We want to process these from left-to-right, top-to-bottom. + +I've thought of a straightforward algorithm for it, but it could probably be +made more efficient. + +We'll find the rectangle with the most top-left corner. Then we'll find all +of the rectangles that have a center that is within the top-y and bottom-y +values of that top-left rectangle. Then we'll sort those rectangles by the x +value of their center. We'll remove those rectangles from the list and repeat. + +#+BEGIN_SRC python :noweb-ref sort-contours :results none +def cell_in_same_row(c1, c2): + c1_center = c1[1] + c1[3] - c1[3] / 2 + c2_bottom = c2[1] + c2[3] + c2_top = c2[1] + return c2_top < c1_center < c2_bottom + +orig_cells = [c for c in cells] +rows = [] +while cells: + first = cells[0] + rest = cells[1:] + cells_in_same_row = sorted( + [ + c for c in rest + if cell_in_same_row(c, first) + ], + key=lambda c: c[0] + ) + + row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0]) + rows.append(row_cells) + cells = [ + c for c in rest + if not cell_in_same_row(c, first) + ] + +# Sort rows by average height of their center. +def avg_height_of_center(row): + centers = [y + h - h / 2 for x, y, w, h in row] + return sum(centers) / len(centers) + +rows.sort(key=avg_height_of_center) +#+END_SRC + +To test if this code works, let's try sorting the bounding rectangles and +numbering them from left to right, top to bottom. 
+ +#+HEADER: :post html-image-size(text=*this*, width="500px") +#+BEGIN_SRC python :noweb no-export :results raw :exports both +import cv2 +image = cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE) +<> +<> +<> +<> +<> + +FONT_SCALE = 0.7 +FONT_COLOR = (127, 127, 127) +for i, row in enumerate(rows): + for j, cell in enumerate(row): + x, y, w, h = cell + cv2.putText( + image, + "{},{}".format(i, j), + (int(x + w - w / 2), int(y + h - h / 2)), + cv2.FONT_HERSHEY_SIMPLEX, + FONT_SCALE, + FONT_COLOR, + 2, + ) +cv2.imwrite("resources/examples/example-table-cells-numbered.png", image) +"resources/examples/example-table-cells-numbered.png" +#+END_SRC + +#+RESULTS: +#+ATTR_HTML: :width 500px :height 100% +[[file:resources/examples/example-table-cells-numbered.png]] + +#+BEGIN_SRC python :noweb-ref extract-cells-from-table :noweb yes :eval no +def extract_cell_images_from_table(image): + <> + <> + <> + <> + <> + cell_images_rows = [] + for row in rows: + cell_images_row = [] + for x, y, w, h in row: + cell_images_row.append(image[y:y+h, x:x+w]) + cell_images_rows.append(cell_images_row) + return cell_images_rows +#+END_SRC + +#+HEADER: :post html-image-size(text=*this*, width="200px") +#+BEGIN_SRC python :noweb no-export :results raw :exports both +<> +image = cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE) +cell_images_rows = extract_cell_images_from_table(image) +cv2.imwrite("resources/examples/example-table-cell-1-1.png", cell_images_rows[1][1]) +"resources/examples/example-table-cell-1-1.png" +#+END_SRC + +*** Cropping each cell to the text + +OCR with Tesseract works best when there are about 10 pixels of white border +around the text. + +Our bounding rectangles may have picked up some stray pixels from the horizontal +and vertical lines of the cells in the table. It's probably just a few pixels, +much fewer than the width of the text. If that's the case, then we can remove +that noise with a simple open morph. 
+ +Once the stray border pixels have been removed, we can expand our border using +~copyMakeBorder~. + +#+BEGIN_SRC python :eval no :noweb-ref crop-to-text +def crop_to_text(image): + kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4)) + opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel) + + contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + bounding_rects = [cv2.boundingRect(c) for c in contours] + # The largest contour is certainly the text that we're looking for. + largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) + x, y, w, h = largest_rect + cropped = image[y:y+h, x:x+w] + bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) + return bordered +#+END_SRC + +#+HEADER: :post html-image-size(text=*this*, width="200px") +#+BEGIN_SRC python :noweb no-export :results raw :exports both +import cv2 +<> +image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) +image = crop_to_text(image) +cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image) +"resources/examples/example-table-cell-1-1-cropped.png" +#+END_SRC + +#+RESULTS: +#+ATTR_HTML: :width 200px :height 100% +[[file:resources/examples/example-table-cell-1-1-cropped.png]] + +*** OCR each cell + +If we cleaned up the images well enough, we might get some accurate OCR! + +There is plenty that could have gone wrong along the way. + +The first step to troubleshooting is to view the intermediate images and see if +there's something about your image that is obviously abnormal, like some really +thick noise or a wrongly detected table. + +If everything looks reasonable but the OCR is doing something like turning a +period into a comma, then you might need to do some custom Tesseract training. 
+ +#+BEGIN_SRC python :noweb-ref ocr-image :eval no :noweb yes +<> + +def ocr_image(image, config): + cropped = crop_to_text(image) + return pytesseract.image_to_string( + ~cropped, + config=config + ) +#+END_SRC + +#+BEGIN_SRC python :noweb no-export :exports both +import pytesseract +image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) +<> +ocr_image(image, "--psm 7") +#+END_SRC + +#+RESULTS: +: 9.09 + +* Files +:PROPERTIES: +:header-args: :mkdirp yes :noweb yes +:END: + +#+BEGIN_SRC python :tangle pdf/ :mkdirp yes :results none + +#+END_SRC + +#+RESULTS: + +** +#+BEGIN_SRC python :tangle :results none +import setuptools + +with open("", "r") as fh: + long_description = + +setuptools.setup( + name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username + version="0.0.1", + author="Example Author", + author_email="", + description="A small example package", + long_description=long_description, + long_description_content_type="text/markdown", + url="", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) +#+END_SRC + +** table_image_ocr +*** table_image_ocr/ +#+BEGIN_SRC python :tangle table_image_ocr/ :mkdirp yes :results none + +#+END_SRC + +*** table_image_ocr/ + +#+BEGIN_SRC python :tangle table_image_ocr/ :mkdirp yes :results none +from contextlib import contextmanager +import functools +import logging +import os +import tempfile + +from bs4 import BeautifulSoup as bs +import requests + + +<> + +logger = get_logger() + + +<> + +@contextmanager +def working_dir(directory): + original_working_dir = os.getcwd() + try: + os.chdir(directory) + yield directory + finally: + os.chdir(original_working_dir) + + +def download(url, filepath): + response = request_get(url) + data = response.content + with open(filepath, "wb") as f: + f.write(data) + + +def 
make_tempdir(identifier): + return tempfile.mkdtemp(prefix="{}_".format(identifier)) +#+END_SRC + +*** table_image_ocr/ + +Takes a variable number of pdf files and creates images out of each page of the +file using ~pdfimages~ from Poppler. Images are created in the same directory +that contains the pdf. + +Prints each pdf followed by the images extracted from that pdf followed by a +blank line. + +#+BEGIN_SRC shell :eval no :exports code +python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ... +#+END_SRC + + +#+BEGIN_SRC python :tangle pdf/ :noweb yes +import argparse +import logging +import os +import re +import subprocess +import sys + +from pdf.util import request_get, working_dir, download, make_tempdir + + +<> + +logger = get_logger() + +parser = argparse.ArgumentParser() +parser.add_argument("files", nargs="+") + +def main(files): + pdf_images = [] + for f in files: + pdf_images.append((f, pdf_to_images(f))) + + for pdf, images in pdf_images: + for image in images: + preprocess_img(image) + + for pdf, images in pdf_images: + print("{}\n{}\n".format(pdf, "\n".join(images))) + + +<> +<> + +if __name__ == "__main__": + args = parser.parse_args() + main(args.files) +#+END_SRC + +#+RESULTS: + +*** table_image_ocr/ + +#+BEGIN_SRC shell +. 
~/.virtualenvs/lotto_odds/bin/activate +python -m pdf.extract_tables "resources/examples/example-page.png" +#+END_SRC + +#+RESULTS: +| resources/examples/example-page.png | +| resources/examples/example-page-table-000.png | + +#+BEGIN_SRC python :noweb yes :tangle pdf/ :results none +import argparse +import os + +import cv2 + +parser = argparse.ArgumentParser() +parser.add_argument("files", nargs="+") + + +def main(files): + results = [] + for f in files: + directory, filename = os.path.split(f) + image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) + tables = find_tables(image) + files = [] + for i, table in enumerate(tables): + filename_sans_extension = os.path.splitext(filename)[0] + table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i) + table_filepath = os.path.join(directory, table_filename) + files.append(table_filepath) + cv2.imwrite(table_filepath, table) + results.append((f, files)) + + for image_filename, table_filenames in results: + print("{}\n{}\n".format(image_filename, "\n".join(table_filenames))) + +<> + +if __name__ == "__main__": + args = parser.parse_args() + files = args.files + main(files) +#+END_SRC + +*** table_image_ocr/ + +#+BEGIN_SRC shell :results none +. 
~/.virtualenvs/lotto_odds/bin/activate +python -m pdf.extract_cells_from_table "resources/examples/example-table.png" +#+END_SRC + +#+BEGIN_SRC python :noweb yes :tangle pdf/ :results none +import os +import sys + +import cv2 +import pytesseract + +def main(f): + results = [] + directory, filename = os.path.split(f) + table = cv2.imread(f, cv2.IMREAD_GRAYSCALE) + rows = extract_cell_images_from_table(table) + cell_img_dir = os.path.join(directory, "cells") + os.makedirs(cell_img_dir, exist_ok=True) + for i, row in enumerate(rows): + for j, cell in enumerate(row): + cell_filename = "{:03d}-{:03d}.png".format(i, j) + path = os.path.join(cell_img_dir, cell_filename) + cv2.imwrite(path, cell) + print(cell_filename) + + +<> + +if __name__ == "__main__": + main(sys.argv[1]) +#+END_SRC + +* Utils + +The following code lets us specify a size for images when they are exported to +html. + +Org supports specifying an export size for an image by putting the ~#+ATTR_HTML: +:width 100px~ before the image. But since our images are in a results drawer, we +need a way for our results drawer to do that for us automatically. + +Adding ~#+ATTR_HTML~ after the beginning of the result block introduces a new +problem. Org-babel no longer recognizes the result as a result block and doesn't +remove it when a src block is re-evaluated, so we end up just appending new +results on each evaluation. + +There is nothing configurable that will tell org-babel to remove our line. But +we can define a function to do some cleanup and then add it as a before hook +with ~advice-add~. 
+ +#+NAME: html-image-size +#+BEGIN_SRC emacs-lisp :var text="" :var width="100%" :var height="100%" :results none +(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]") +#+END_SRC + +#+BEGIN_SRC emacs-lisp :results none +(defun remove-attributes-from-src-block-result (&rest args) + (let ((location (org-babel-where-is-src-block-result)) + (attr-regexp "[ ]*#\\+ATTR.*$")) + (when location + (save-excursion + (goto-char location) + (when (looking-at (concat org-babel-result-regexp ".*$")) + (next-line) + (while (looking-at attr-regexp) + (kill-whole-line))))))) + +(advice-add 'org-babel-remove-result :before #'remove-attributes-from-src-block-result) +(advice-add 'org-babel-execute-src-block :before #'remove-attributes-from-src-block-result) +#+END_SRC + diff --git a/resources/examples/cells/000-000.png b/resources/examples/cells/000-000.png new file mode 100644 index 0000000..73edbf2 Binary files /dev/null and b/resources/examples/cells/000-000.png differ diff --git a/resources/examples/cells/000-001.png b/resources/examples/cells/000-001.png new file mode 100644 index 0000000..0f440a8 Binary files /dev/null and b/resources/examples/cells/000-001.png differ diff --git a/resources/examples/cells/000-002.png b/resources/examples/cells/000-002.png new file mode 100644 index 0000000..ae1793e Binary files /dev/null and b/resources/examples/cells/000-002.png differ diff --git a/resources/examples/cells/001-000.png b/resources/examples/cells/001-000.png new file mode 100644 index 0000000..3ab59e9 Binary files /dev/null and b/resources/examples/cells/001-000.png differ diff --git a/resources/examples/cells/001-001.png b/resources/examples/cells/001-001.png new file mode 100644 index 0000000..2d14904 Binary files /dev/null and b/resources/examples/cells/001-001.png differ diff --git a/resources/examples/cells/001-002.png b/resources/examples/cells/001-002.png new file mode 100644 index 0000000..9e15547 Binary files /dev/null and 
b/resources/examples/cells/001-002.png differ diff --git a/resources/examples/cells/002-000.png b/resources/examples/cells/002-000.png new file mode 100644 index 0000000..45d8256 Binary files /dev/null and b/resources/examples/cells/002-000.png differ diff --git a/resources/examples/cells/002-001.png b/resources/examples/cells/002-001.png new file mode 100644 index 0000000..497cddf Binary files /dev/null and b/resources/examples/cells/002-001.png differ diff --git a/resources/examples/cells/002-002.png b/resources/examples/cells/002-002.png new file mode 100644 index 0000000..7b8c2ae Binary files /dev/null and b/resources/examples/cells/002-002.png differ diff --git a/resources/examples/cells/003-000.png b/resources/examples/cells/003-000.png new file mode 100644 index 0000000..ea9c320 Binary files /dev/null and b/resources/examples/cells/003-000.png differ diff --git a/resources/examples/cells/003-001.png b/resources/examples/cells/003-001.png new file mode 100644 index 0000000..c2ef24b Binary files /dev/null and b/resources/examples/cells/003-001.png differ diff --git a/resources/examples/cells/003-002.png b/resources/examples/cells/003-002.png new file mode 100644 index 0000000..8c944c2 Binary files /dev/null and b/resources/examples/cells/003-002.png differ diff --git a/resources/examples/cells/004-000.png b/resources/examples/cells/004-000.png new file mode 100644 index 0000000..d9c596d Binary files /dev/null and b/resources/examples/cells/004-000.png differ diff --git a/resources/examples/cells/004-001.png b/resources/examples/cells/004-001.png new file mode 100644 index 0000000..15baa30 Binary files /dev/null and b/resources/examples/cells/004-001.png differ diff --git a/resources/examples/cells/004-002.png b/resources/examples/cells/004-002.png new file mode 100644 index 0000000..f35d077 Binary files /dev/null and b/resources/examples/cells/004-002.png differ diff --git a/resources/examples/cells/005-000.png b/resources/examples/cells/005-000.png new file 
mode 100644 index 0000000..15d0622 Binary files /dev/null and b/resources/examples/cells/005-000.png differ diff --git a/resources/examples/cells/005-001.png b/resources/examples/cells/005-001.png new file mode 100644 index 0000000..48ab914 Binary files /dev/null and b/resources/examples/cells/005-001.png differ diff --git a/resources/examples/cells/005-002.png b/resources/examples/cells/005-002.png new file mode 100644 index 0000000..9342660 Binary files /dev/null and b/resources/examples/cells/005-002.png differ diff --git a/resources/examples/cells/006-000.png b/resources/examples/cells/006-000.png new file mode 100644 index 0000000..e0ec9ff Binary files /dev/null and b/resources/examples/cells/006-000.png differ diff --git a/resources/examples/cells/006-001.png b/resources/examples/cells/006-001.png new file mode 100644 index 0000000..2b76e44 Binary files /dev/null and b/resources/examples/cells/006-001.png differ diff --git a/resources/examples/cells/006-002.png b/resources/examples/cells/006-002.png new file mode 100644 index 0000000..7457910 Binary files /dev/null and b/resources/examples/cells/006-002.png differ diff --git a/resources/examples/cells/007-000.png b/resources/examples/cells/007-000.png new file mode 100644 index 0000000..9cfc8de Binary files /dev/null and b/resources/examples/cells/007-000.png differ diff --git a/resources/examples/cells/007-001.png b/resources/examples/cells/007-001.png new file mode 100644 index 0000000..317c289 Binary files /dev/null and b/resources/examples/cells/007-001.png differ diff --git a/resources/examples/cells/007-002.png b/resources/examples/cells/007-002.png new file mode 100644 index 0000000..6f484b6 Binary files /dev/null and b/resources/examples/cells/007-002.png differ diff --git a/resources/examples/cells/008-000.png b/resources/examples/cells/008-000.png new file mode 100644 index 0000000..7067b03 Binary files /dev/null and b/resources/examples/cells/008-000.png differ diff --git 
a/resources/examples/cells/008-001.png b/resources/examples/cells/008-001.png new file mode 100644 index 0000000..868b41c Binary files /dev/null and b/resources/examples/cells/008-001.png differ diff --git a/resources/examples/cells/008-002.png b/resources/examples/cells/008-002.png new file mode 100644 index 0000000..e4d4535 Binary files /dev/null and b/resources/examples/cells/008-002.png differ diff --git a/resources/examples/cells/009-000.png b/resources/examples/cells/009-000.png new file mode 100644 index 0000000..2153e0c Binary files /dev/null and b/resources/examples/cells/009-000.png differ diff --git a/resources/examples/cells/009-001.png b/resources/examples/cells/009-001.png new file mode 100644 index 0000000..31f28c5 Binary files /dev/null and b/resources/examples/cells/009-001.png differ diff --git a/resources/examples/cells/009-002.png b/resources/examples/cells/009-002.png new file mode 100644 index 0000000..ec2921f Binary files /dev/null and b/resources/examples/cells/009-002.png differ diff --git a/resources/examples/cells/010-000.png b/resources/examples/cells/010-000.png new file mode 100644 index 0000000..46b1bc6 Binary files /dev/null and b/resources/examples/cells/010-000.png differ diff --git a/resources/examples/cells/010-001.png b/resources/examples/cells/010-001.png new file mode 100644 index 0000000..7da6b54 Binary files /dev/null and b/resources/examples/cells/010-001.png differ diff --git a/resources/examples/cells/010-002.png b/resources/examples/cells/010-002.png new file mode 100644 index 0000000..08daab0 Binary files /dev/null and b/resources/examples/cells/010-002.png differ diff --git a/resources/examples/cells/011-000.png b/resources/examples/cells/011-000.png new file mode 100644 index 0000000..d2df24d Binary files /dev/null and b/resources/examples/cells/011-000.png differ diff --git a/resources/examples/cells/011-001.png b/resources/examples/cells/011-001.png new file mode 100644 index 0000000..aefbf1a Binary files /dev/null 
and b/resources/examples/cells/011-001.png differ diff --git a/resources/examples/cells/011-002.png b/resources/examples/cells/011-002.png new file mode 100644 index 0000000..0003c83 Binary files /dev/null and b/resources/examples/cells/011-002.png differ diff --git a/resources/examples/cells/012-000.png b/resources/examples/cells/012-000.png new file mode 100644 index 0000000..43d7b35 Binary files /dev/null and b/resources/examples/cells/012-000.png differ diff --git a/resources/examples/cells/012-001.png b/resources/examples/cells/012-001.png new file mode 100644 index 0000000..34086cb Binary files /dev/null and b/resources/examples/cells/012-001.png differ diff --git a/resources/examples/cells/012-002.png b/resources/examples/cells/012-002.png new file mode 100644 index 0000000..896b6d8 Binary files /dev/null and b/resources/examples/cells/012-002.png differ diff --git a/resources/examples/cells/013-000.png b/resources/examples/cells/013-000.png new file mode 100644 index 0000000..1738f61 Binary files /dev/null and b/resources/examples/cells/013-000.png differ diff --git a/resources/examples/cells/013-001.png b/resources/examples/cells/013-001.png new file mode 100644 index 0000000..c2a9b14 Binary files /dev/null and b/resources/examples/cells/013-001.png differ diff --git a/resources/examples/cells/013-002.png b/resources/examples/cells/013-002.png new file mode 100644 index 0000000..2efc059 Binary files /dev/null and b/resources/examples/cells/013-002.png differ diff --git a/resources/examples/cells/014-000.png b/resources/examples/cells/014-000.png new file mode 100644 index 0000000..3c0f622 Binary files /dev/null and b/resources/examples/cells/014-000.png differ diff --git a/resources/examples/cells/014-001.png b/resources/examples/cells/014-001.png new file mode 100644 index 0000000..b1be395 Binary files /dev/null and b/resources/examples/cells/014-001.png differ diff --git a/resources/examples/cells/014-002.png b/resources/examples/cells/014-002.png new 
file mode 100644 index 0000000..df4a293 Binary files /dev/null and b/resources/examples/cells/014-002.png differ diff --git a/resources/examples/example-page-table-000.png b/resources/examples/example-page-table-000.png new file mode 100644 index 0000000..2830fbc Binary files /dev/null and b/resources/examples/example-page-table-000.png differ diff --git a/resources/examples/example-page.png b/resources/examples/example-page.png new file mode 100644 index 0000000..d6e5121 Binary files /dev/null and b/resources/examples/example-page.png differ diff --git a/resources/examples/example-table-blurred.png b/resources/examples/example-table-blurred.png new file mode 100644 index 0000000..d853a4b Binary files /dev/null and b/resources/examples/example-table-blurred.png differ diff --git a/resources/examples/example-table-cell-1-1-cropped.png b/resources/examples/example-table-cell-1-1-cropped.png new file mode 100644 index 0000000..2ba2327 Binary files /dev/null and b/resources/examples/example-table-cell-1-1-cropped.png differ diff --git a/resources/examples/example-table-cell-1-1.png b/resources/examples/example-table-cell-1-1.png new file mode 100644 index 0000000..2d14904 Binary files /dev/null and b/resources/examples/example-table-cell-1-1.png differ diff --git a/resources/examples/example-table-cells-numbered.png b/resources/examples/example-table-cells-numbered.png new file mode 100644 index 0000000..9607f75 Binary files /dev/null and b/resources/examples/example-table-cells-numbered.png differ diff --git a/resources/examples/example-table-lines.png b/resources/examples/example-table-lines.png new file mode 100644 index 0000000..e331b81 Binary files /dev/null and b/resources/examples/example-table-lines.png differ diff --git a/resources/examples/example-table-thresholded.png b/resources/examples/example-table-thresholded.png new file mode 100644 index 0000000..7ccb27b Binary files /dev/null and b/resources/examples/example-table-thresholded.png differ diff --git 
a/resources/examples/example-table.png b/resources/examples/example-table.png new file mode 100644 index 0000000..7b63856 Binary files /dev/null and b/resources/examples/example-table.png differ diff --git a/resources/examples/example.pdf b/resources/examples/example.pdf new file mode 100644 index 0000000..f68e801 Binary files /dev/null and b/resources/examples/example.pdf differ diff --git a/ b/ new file mode 100644 index 0000000..1b90591 --- /dev/null +++ b/ @@ -0,0 +1,22 @@ +import setuptools + +with open("", "r") as fh: + long_description = + +setuptools.setup( + name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username + version="0.0.1", + author="Example Author", + author_email="", + description="A small example package", + long_description=long_description, + long_description_content_type="text/markdown", + url="", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) diff --git a/table_image_ocr/ b/table_image_ocr/ new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/table_image_ocr/ @@ -0,0 +1 @@ + diff --git a/table_image_ocr/ b/table_image_ocr/ new file mode 100644 index 0000000..c35c689 --- /dev/null +++ b/table_image_ocr/ @@ -0,0 +1,36 @@ +from contextlib import contextmanager +import functools +import logging +import os +import tempfile + +from bs4 import BeautifulSoup as bs +import requests + + + + +logger = get_logger() + + + + +@contextmanager +def working_dir(directory): + original_working_dir = os.getcwd() + try: + os.chdir(directory) + yield directory + finally: + os.chdir(original_working_dir) + + +def download(url, filepath): + response = request_get(url) + data = response.content + with open(filepath, "wb") as f: + f.write(data) + + +def make_tempdir(identifier): + return tempfile.mkdtemp(prefix="{}_".format(identifier))