Add gitignore, rename modules, remove unused code

Branch: main
Author: Eric Ihli (5 years ago)
Parent: 8546902e64
Commit: 78e9cdb3f5

.gitignore

@@ -0,0 +1,10 @@
.DS_Store
.idea
*.log
tmp/
*.py[cod]
*.egg
build
htmlcov
dist

@@ -527,7 +527,7 @@ ocr_image(image, "--psm 7")
    :header-args: :mkdirp yes :noweb yes
    :END:
-#+BEGIN_SRC python :tangle pdf/__init__.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
 #+END_SRC
@@ -535,54 +535,54 @@ ocr_image(image, "--psm 7")
 #+BEGIN_SRC python :tangle setup.py :results none
 import setuptools
-with open("README.md", "r") as fh:
-    long_description = fh.read()
+long_description = """
+Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
+Requires binaries for tesseract and pdfimages (from Poppler).
+"""
 setuptools.setup(
-    name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
+    name="table_ocr",
     version="0.0.1",
-    author="Example Author",
-    author_email="author@example.com",
-    description="A small example package",
+    author="Eric Ihli",
+    author_email="eihli@owoga.com",
+    description="Turn images of tables into CSV data.",
     long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/pypa/sampleproject",
+    long_description_content_type="text/plain",
+    url="https://github.com/eihli/image-table-ocr",
     packages=setuptools.find_packages(),
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
+    install_requires=[
+        "pytesseract~=0.3",
+        "opencv-python~=4.2",
+    ],
     python_requires='>=3.6',
 )
 #+END_SRC
-** table_image_ocr
-*** table_image_ocr/__init__.py
-#+BEGIN_SRC python :tangle table_image_ocr/__init__.py :mkdirp yes :results none
+** table_ocr
+*** table_ocr/__init__.py
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
 #+END_SRC
-*** table_image_ocr/util.py
-#+BEGIN_SRC python :tangle table_image_ocr/util.py :mkdirp yes :results none
+*** table_ocr/util.py
+#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none
 from contextlib import contextmanager
 import functools
 import logging
 import os
 import tempfile
-from bs4 import BeautifulSoup as bs
-import requests
-<<get_logger>>
+<<get-logger>>
 logger = get_logger()
-<<request_cacheing>>
 @contextmanager
 def working_dir(directory):
     original_working_dir = os.getcwd()
@@ -593,18 +593,11 @@ def working_dir(directory):
         os.chdir(original_working_dir)
-def download(url, filepath):
-    response = request_get(url)
-    data = response.content
-    with open(filepath, "wb") as f:
-        f.write(data)
 def make_tempdir(identifier):
     return tempfile.mkdtemp(prefix="{}_".format(identifier))
 #+END_SRC
-*** table_image_ocr/prepare_pdfs.py
+*** table_ocr/prepare_pdfs.py
 Takes a variable number of pdf files and creates images out of each page of the
 file using ~pdfimages~ from Poppler. Images are created in the same directory
@@ -614,11 +607,11 @@ Prints each pdf followed by the images extracted from that pdf followed by a
 blank line.
 #+BEGIN_SRC shell :eval no :exports code
-python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
+python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
 #+END_SRC
-#+BEGIN_SRC python :tangle pdf/prepare_pdfs.py :noweb yes
+#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes
 import argparse
 import logging
 import os
@@ -626,7 +619,7 @@ import re
 import subprocess
 import sys
-from pdf.util import request_get, working_dir, download, make_tempdir
+from table_ocr.util import working_dir, make_tempdir
 <<get-logger>>
@@ -657,9 +650,7 @@ if __name__ == "__main__":
     main(args.files)
 #+END_SRC
-#+RESULTS:
-*** table_image_ocr/extract_tables.py
+*** table_ocr/extract_tables.py
 #+BEGIN_SRC shell
 . ~/.virtualenvs/lotto_odds/bin/activate
@@ -670,7 +661,7 @@ python -m pdf.extract_tables "resources/examples/example-page.png"
 | resources/examples/example-page.png           |
 | resources/examples/example-page-table-000.png |
-#+BEGIN_SRC python :noweb yes :tangle pdf/extract_tables.py :results none
+#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_tables.py :results none
 import argparse
 import os
@@ -684,6 +675,7 @@ def main(files):
     results = []
     for f in files:
         directory, filename = os.path.split(f)
         image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+        print("Reading {}".format(f))
         tables = find_tables(image)
         files = []
@@ -706,14 +698,14 @@ if __name__ == "__main__":
     main(files)
 #+END_SRC
-*** table_image_ocr/extract_cells_from_table.py
+*** table_ocr/extract_cells_from_table.py
 #+BEGIN_SRC shell :results none
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
 #+END_SRC
-#+BEGIN_SRC python :noweb yes :tangle pdf/extract_cells_from_table.py :results none
+#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none
 import os
 import sys
@@ -784,3 +776,17 @@ with ~advice-add~.
 (advice-add 'org-babel-execute-src-block :before #'remove-attributes-from-src-block-result)
 #+END_SRC
+** Logging
+#+BEGIN_SRC python :eval query :noweb-ref get-logger
+def get_logger():
+    logger = logging.getLogger(__name__)
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
+#+END_SRC

@ -1,22 +1,28 @@
import setuptools import setuptools
with open("README.md", "r") as fh: long_description = """
long_description = fh.read() Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
Requires binaries for tesseract and pdfimages (from Poppler).
"""
setuptools.setup( setuptools.setup(
name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username name="table_ocr",
version="0.0.1", version="0.0.1",
author="Example Author", author="Eric Ihli",
author_email="author@example.com", author_email="eihli@owoga.com",
description="A small example package", description="Turn images of tables into CSV data.",
long_description=long_description, long_description=long_description,
long_description_content_type="text/markdown", long_description_content_type="text/plain",
url="https://github.com/pypa/sampleproject", url="https://github.com/eihli/image-table-ocr",
packages=setuptools.find_packages(), packages=setuptools.find_packages(),
classifiers=[ classifiers=[
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License", "License :: OSI Approved :: MIT License",
"Operating System :: OS Independent", "Operating System :: OS Independent",
], ],
install_requires=[
"pytesseract~=0.3",
"opencv-python~=4.2",
],
python_requires='>=3.6', python_requires='>=3.6',
) )
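
Since the distribution is now named ~table_ocr~ and declares its pytesseract/OpenCV dependencies, a quick smoke test after installing it (for example with a local ~pip install .~) might look like the sketch below. Nothing here is part of the commit; the import targets are the ones listed in SOURCES.txt.

#+BEGIN_SRC python :eval no
# Hedged smoke test of the renamed package after installation.
import table_ocr
from table_ocr.util import make_tempdir, working_dir
from table_ocr.extract_tables import find_tables

print(table_ocr.__name__)           # expected: "table_ocr"
print(make_tempdir("smoke_test"))   # a fresh temp directory prefixed "smoke_test_"
#+END_SRC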

table_ocr.egg-info/PKG-INFO
@@ -0,0 +1,19 @@
Metadata-Version: 2.1
Name: table-ocr
Version: 0.0.1
Summary: Turn images of tables into CSV data.
Home-page: https://github.com/eihli/image-table-ocr
Author: Eric Ihli
Author-email: eihli@owoga.com
License: UNKNOWN
Description:
Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
Requires binaries for tesseract and pdfimages (from Poppler).
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/plain

table_ocr.egg-info/SOURCES.txt
@@ -0,0 +1,11 @@
setup.py
table_ocr/__init__.py
table_ocr/extract_cells_from_table.py
table_ocr/extract_tables.py
table_ocr/prepare_pdfs.py
table_ocr/util.py
table_ocr.egg-info/PKG-INFO
table_ocr.egg-info/SOURCES.txt
table_ocr.egg-info/dependency_links.txt
table_ocr.egg-info/requires.txt
table_ocr.egg-info/top_level.txt

table_ocr.egg-info/requires.txt
@@ -0,0 +1,2 @@
pytesseract~=0.3
opencv-python~=4.2

table_ocr/extract_cells_from_table.py
@@ -0,0 +1,119 @@
import os
import sys

import cv2
import pytesseract


def main(f):
    results = []
    directory, filename = os.path.split(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            print(cell_filename)


def extract_cell_images_from_table(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    vertical = horizontal = img_bin.copy()
    SCALE = 5
    image_width, image_height = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )

    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.05 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]

    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    bounding_rects = [b for b in bounding_rects if b is not largest_rect]

    cells = [c for c in bounding_rects]

    def cell_in_same_row(c1, c2):
        c1_center = c1[1] + c1[3] - c1[3] / 2
        c2_bottom = c2[1] + c2[3]
        c2_top = c2[1]
        return c2_top < c1_center < c2_bottom

    orig_cells = [c for c in cells]
    rows = []
    while cells:
        first = cells[0]
        rest = cells[1:]
        cells_in_same_row = sorted(
            [
                c for c in rest
                if cell_in_same_row(c, first)
            ],
            key=lambda c: c[0]
        )
        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
        rows.append(row_cells)
        cells = [
            c for c in rest
            if not cell_in_same_row(c, first)
        ]

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        centers = [y + h - h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)

    rows.sort(key=avg_height_of_center)

    cell_images_rows = []
    for row in rows:
        cell_images_row = []
        for x, y, w, h in row:
            cell_images_row.append(image[y:y+h, x:x+w])
        cell_images_rows.append(cell_images_row)
    return cell_images_rows


if __name__ == "__main__":
    main(sys.argv[1])
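
When run as a script, the module writes each cell into a ~cells/~ subdirectory next to the input image and prints the cell filenames. A minimal sketch of using it as a library instead; the example image path is the one already used elsewhere in the document, and the output filenames are placeholders:

#+BEGIN_SRC python :eval no
import cv2
from table_ocr.extract_cells_from_table import extract_cell_images_from_table

# Read the example table image in grayscale and split it into per-cell crops.
table = cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE)
rows = extract_cell_images_from_table(table)
print("Found {} rows".format(len(rows)))
for i, row in enumerate(rows):
    for j, cell in enumerate(row):
        cv2.imwrite("cell-{:03d}-{:03d}.png".format(i, j), cell)
#+END_SRC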

table_ocr/extract_tables.py
@@ -0,0 +1,81 @@
import argparse
import os

import cv2

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    results = []
    for f in files:
        directory, filename = os.path.split(f)
        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        print("Reading {}".format(f))
        tables = find_tables(image)
        files = []
        for i, table in enumerate(tables):
            filename_sans_extension = os.path.splitext(filename)[0]
            table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i)
            table_filepath = os.path.join(directory, table_filename)
            files.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        results.append((f, files))
    for image_filename, table_filenames in results:
        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))


def find_tables(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    vertical = horizontal = img_bin.copy()
    SCALE = 5
    image_width, image_height = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside this bounding rectangle.
    # A table should have a lot of intersections. We might have a rectangular image
    # here though which would only have 4 intersections, 1 at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
    return images


if __name__ == "__main__":
    args = parser.parse_args()
    files = args.files
    main(files)
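
For reference, a sketch of invoking the renamed module on the example page image and inspecting its output. The module prints a "Reading ..." progress line per input, then each input filename followed by the table crops it wrote and a blank line; the image path is the example used earlier in the document.

#+BEGIN_SRC python :eval no
import subprocess

# Run the module as a script and capture what it prints.
completed = subprocess.run(
    ["python", "-m", "table_ocr.extract_tables", "resources/examples/example-page.png"],
    capture_output=True,
    text=True,
    check=True,
)
print(completed.stdout)
# Expected shape of the listing, per the example results table above:
#   resources/examples/example-page.png
#   resources/examples/example-page-table-000.png
#+END_SRC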

table_ocr/prepare_pdfs.py
@@ -0,0 +1,100 @@
import argparse
import logging
import os
import re
import subprocess
import sys

from table_ocr.util import working_dir, make_tempdir


def get_logger():
    logger = logging.getLogger(__name__)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    handler.setLevel(lvl)
    logger.setLevel(lvl)
    return logger

logger = get_logger()

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    pdf_images = []
    for f in files:
        pdf_images.append((f, pdf_to_images(f)))

    for pdf, images in pdf_images:
        for image in images:
            preprocess_img(image)

    for pdf, images in pdf_images:
        print("{}\n{}\n".format(pdf, "\n".join(images)))


def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images
    """
    directory, filename = os.path.split(pdf_filepath)
    with working_dir(directory):
        image_filenames = pdfimages(pdf_filepath)

    # pdfimages creates a number of files, each named for its page number, and
    # doesn't return the list that it created.
    return [os.path.join(directory, f) for f in image_filenames]


def pdfimages(pdf_filepath):
    """
    Uses the `pdfimages` utility from Poppler
    (https://poppler.freedesktop.org/). Creates images out of each page. Images
    are prefixed by their name sans extension and suffixed by their page number.
    """
    directory, filename = os.path.split(pdf_filepath)
    filename_sans_ext = filename.split(".pdf")[0]
    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
    return image_filenames


def find_matching_files_in_dir(file_prefix, directory):
    files = [
        filename
        for filename in os.listdir(directory)
        if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename)
    ]
    return files


def preprocess_img(filepath):
    """
    Processing that involves running shell executables,
    like mogrify to rotate.
    """
    rotate = get_rotate(filepath)
    logger.debug("Rotating {} by {}.".format(filepath, rotate))
    mogrify(filepath, rotate)


def get_rotate(image_filepath):
    output = (
        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
        .decode("utf-8")
        .split("\n")
    )
    output = next(l for l in output if "Rotate: " in l)
    output = output.split(": ")[1]
    return output


def mogrify(image_filepath, rotate):
    subprocess.run(["mogrify", "-rotate", rotate, image_filepath])


if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)
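
Taken together, the three renamed modules form a pipeline: pdf to page images, page images to table crops, table crops to cell images. A hedged sketch of driving them as library calls rather than via ~python -m~; the pdf path is the placeholder from the usage example, and the ~pdfimages~, ~tesseract~, and ~mogrify~ binaries are assumed to be on PATH:

#+BEGIN_SRC python :eval no
import cv2

from table_ocr.prepare_pdfs import pdf_to_images, preprocess_img
from table_ocr.extract_tables import find_tables
from table_ocr.extract_cells_from_table import extract_cell_images_from_table

# Convert the pdf to page images, deskew each page, then pull out tables and cells.
images = pdf_to_images("/tmp/file1/file1.pdf")
for image_path in images:
    preprocess_img(image_path)  # rotates the file in place via tesseract + mogrify
    page = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    for table in find_tables(page):
        rows = extract_cell_images_from_table(table)
        print("{}: table with {} rows".format(image_path, len(rows)))
#+END_SRC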

table_ocr/util.py
@@ -4,17 +4,20 @@ import logging
 import os
 import tempfile
-from bs4 import BeautifulSoup as bs
-import requests
+def get_logger():
+    logger = logging.getLogger(__name__)
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
 logger = get_logger()
 @contextmanager
 def working_dir(directory):
     original_working_dir = os.getcwd()
@@ -25,12 +28,5 @@ def working_dir(directory):
         os.chdir(original_working_dir)
-def download(url, filepath):
-    response = request_get(url)
-    data = response.content
-    with open(filepath, "wb") as f:
-        f.write(data)
 def make_tempdir(identifier):
     return tempfile.mkdtemp(prefix="{}_".format(identifier))
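
After this commit, ~table_ocr.util~ keeps only the logging helper and two small filesystem helpers. A minimal sketch of the two helpers in use; the identifier and filename below are hypothetical:

#+BEGIN_SRC python :eval no
import os
from table_ocr.util import make_tempdir, working_dir

tmp = make_tempdir("example")     # e.g. /tmp/example_ab12cd34
with working_dir(tmp):            # chdir into tmp, restore the old cwd on exit
    with open("notes.txt", "w") as f:
        f.write("written inside the temp dir")
print(os.listdir(tmp))            # ["notes.txt"]
#+END_SRC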