Add gitignore, rename modules, remove unused code

5 years ago · 78e9cdb3f5
parent 8546902e64
commit 78e9cdb3f5
13 changed files with 414 additions and 60 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,10 @@
+.DS_Store
+.idea
+*.log
+tmp/
+
+*.py[cod]
+*.egg
+build
+htmlcov
+dist
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@ -527,7 +527,7 @@ ocr_image(image, "--psm 7")
 :header-args: :mkdirp yes :noweb yes
 :END:

-#+BEGIN_SRC python :tangle pdf/__init__.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none

 #+END_SRC

@ -535,54 +535,54 @@ ocr_image(image, "--psm 7")
 #+BEGIN_SRC python :tangle setup.py :results none
 import setuptools

-with open("README.md", "r") as fh:
-    long_description = fh.read()
+long_description = """
+Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.

+Requires binaries for tesseract and pdfimages (from Poppler).
+"""
 setuptools.setup(
-    name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
+    name="table_ocr",
    version="0.0.1",
-    author="Example Author",
-    author_email="author@example.com",
-    description="A small example package",
+    author="Eric Ihli",
+    author_email="eihli@owoga.com",
+    description="Turn images of tables into CSV data.",
    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/pypa/sampleproject",
+    long_description_content_type="text/plain",
+    url="https://github.com/eihli/image-table-ocr",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
+    install_requires=[
+        "pytesseract~=0.3",
+        "opencv-python~=4.2",
+    ],
    python_requires='>=3.6',
 )
 #+END_SRC

-** table_image_ocr
-*** table_image_ocr/__init__.py
-#+BEGIN_SRC python :tangle table_image_ocr/__init__.py :mkdirp yes :results none
+** table_ocr
+*** table_ocr/__init__.py
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none

 #+END_SRC

-*** table_image_ocr/util.py
+*** table_ocr/util.py

-#+BEGIN_SRC python :tangle table_image_ocr/util.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none
 from contextlib import contextmanager
 import functools
 import logging
 import os
 import tempfile

-from bs4 import BeautifulSoup as bs
-import requests
-
-
-<<get_logger>>
+<<get-logger>>

 logger = get_logger()


-<<request_cacheing>>
-
@contextmanager
 def working_dir(directory):
    original_working_dir = os.getcwd()
@ -593,18 +593,11 @@ def working_dir(directory):
        os.chdir(original_working_dir)


-def download(url, filepath):
-    response = request_get(url)
-    data = response.content
-    with open(filepath, "wb") as f:
-        f.write(data)
-
-
 def make_tempdir(identifier):
    return tempfile.mkdtemp(prefix="{}_".format(identifier))
 #+END_SRC

-*** table_image_ocr/prepare_pdfs.py
+*** table_ocr/prepare_pdfs.py

 Takes a variable number of pdf files and creates images out of each page of the
 file using ~pdfimages~ from Poppler. Images are created in the same directory
@ -614,11 +607,11 @@ Prints each pdf followed by the images extracted from that pdf followed by a
 blank line.

 #+BEGIN_SRC shell :eval no :exports code
-python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
+python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
 #+END_SRC


-#+BEGIN_SRC python :tangle pdf/prepare_pdfs.py :noweb yes
+#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes
 import argparse
 import logging
 import os
@ -626,7 +619,7 @@ import re
 import subprocess
 import sys

-from pdf.util import request_get, working_dir, download, make_tempdir
+from table_ocr.util import working_dir, make_tempdir


 <<get-logger>>
@ -657,9 +650,7 @@ if __name__ == "__main__":
    main(args.files)
 #+END_SRC

-#+RESULTS:
-
-*** table_image_ocr/extract_tables.py
+*** table_ocr/extract_tables.py

 #+BEGIN_SRC shell
 . ~/.virtualenvs/lotto_odds/bin/activate
@ -670,7 +661,7 @@ python -m pdf.extract_tables "resources/examples/example-page.png"
 | resources/examples/example-page.png           |
 | resources/examples/example-page-table-000.png |

-#+BEGIN_SRC python :noweb yes :tangle pdf/extract_tables.py :results none
+#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_tables.py :results none
 import argparse
 import os

@ -684,6 +675,7 @@ def main(files):
    results = []
    for f in files:
        directory, filename = os.path.split(f)
+
        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        tables = find_tables(image)
        files = []
@ -706,14 +698,14 @@ if __name__ == "__main__":
    main(files)
 #+END_SRC

-*** table_image_ocr/extract_cells_from_table.py
+*** table_ocr/extract_cells_from_table.py

 #+BEGIN_SRC shell :results none
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
 #+END_SRC

-#+BEGIN_SRC python :noweb yes :tangle pdf/extract_cells_from_table.py :results none
+#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none
 import os
 import sys

@ -784,3 +776,17 @@ with ~advice-add~.
 (advice-add 'org-babel-execute-src-block :before #'remove-attributes-from-src-block-result)
 #+END_SRC

+** Logging
+
+#+BEGIN_SRC python :eval query :noweb-ref get-logger
+def get_logger():
+    """
+    Create a logger named after the current module with its own StreamHandler.
+
+    The level is read from the PY_LOG_LVL environment variable (default
+    "info") and applied to both the handler and the logger.
+    """
+    logger = logging.getLogger(__name__)
+    # Upper-cased so that e.g. "debug" matches the logging level name "DEBUG".
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
+#+END_SRC
--- a/setup.py
+++ b/setup.py
@ -1,22 +1,28 @@
 import setuptools

-with open("README.md", "r") as fh:
-    long_description = fh.read()
+long_description = """
+Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.

+Requires binaries for tesseract and pdfimages (from Poppler).
+"""
 setuptools.setup(
-    name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
+    name="table_ocr",
    version="0.0.1",
-    author="Example Author",
-    author_email="author@example.com",
-    description="A small example package",
+    author="Eric Ihli",
+    author_email="eihli@owoga.com",
+    description="Turn images of tables into CSV data.",
    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/pypa/sampleproject",
+    long_description_content_type="text/plain",
+    url="https://github.com/eihli/image-table-ocr",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
+    install_requires=[
+        "pytesseract~=0.3",
+        "opencv-python~=4.2",
+    ],
    python_requires='>=3.6',
 )
--- a/table_ocr.egg-info/PKG-INFO
+++ b/table_ocr.egg-info/PKG-INFO
@ -0,0 +1,19 @@
+Metadata-Version: 2.1
+Name: table-ocr
+Version: 0.0.1
+Summary: Turn images of tables into CSV data.
+Home-page: https://github.com/eihli/image-table-ocr
+Author: Eric Ihli
+Author-email: eihli@owoga.com
+License: UNKNOWN
+Description: 
+        Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
+        
+        Requires binaries for tesseract and pdfimages (from Poppler).
+        
+Platform: UNKNOWN
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/plain
--- a/table_ocr.egg-info/SOURCES.txt
+++ b/table_ocr.egg-info/SOURCES.txt
@ -0,0 +1,11 @@
+setup.py
+table_ocr/__init__.py
+table_ocr/extract_cells_from_table.py
+table_ocr/extract_tables.py
+table_ocr/prepare_pdfs.py
+table_ocr/util.py
+table_ocr.egg-info/PKG-INFO
+table_ocr.egg-info/SOURCES.txt
+table_ocr.egg-info/dependency_links.txt
+table_ocr.egg-info/requires.txt
+table_ocr.egg-info/top_level.txt
--- a/table_ocr.egg-info/dependency_links.txt
+++ b/table_ocr.egg-info/dependency_links.txt
--- a/table_ocr.egg-info/requires.txt
+++ b/table_ocr.egg-info/requires.txt
@ -0,0 +1,2 @@
+pytesseract~=0.3
+opencv-python~=4.2
--- a/table_ocr.egg-info/top_level.txt
+++ b/table_ocr.egg-info/top_level.txt
@ -0,0 +1 @@
+table_ocr
--- a/table_ocr/__init__.py
+++ b/table_ocr/__init__.py
@ -0,0 +1,3 @@
+
+
+
--- a/table_ocr/extract_cells_from_table.py
+++ b/table_ocr/extract_cells_from_table.py
@ -0,0 +1,119 @@
+import os
+import sys
+
+import cv2
+import pytesseract
+
+def main(f):
+    """
+    Split the table image at path `f` into one image per cell.
+
+    Cell images are written to a "cells" directory next to the table image,
+    named "<row>-<column>.png" with zero-padded indices. Each cell filename
+    is printed as it is written.
+
+    Raises ValueError if the image cannot be read.
+    """
+    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+    if table is None:
+        # cv2.imread reports an unreadable file by returning None instead of
+        # raising; fail here rather than deep inside the OpenCV calls.
+        raise ValueError("Could not read image: {}".format(f))
+    rows = extract_cell_images_from_table(table)
+    cell_img_dir = os.path.join(os.path.dirname(f), "cells")
+    os.makedirs(cell_img_dir, exist_ok=True)
+    for i, row in enumerate(rows):
+        for j, cell in enumerate(row):
+            cell_filename = "{:03d}-{:03d}.png".format(i, j)
+            path = os.path.join(cell_img_dir, cell_filename)
+            cv2.imwrite(path, cell)
+            print(cell_filename)
+
+
+def extract_cell_images_from_table(image):
+    """
+    Cut an image of a table into one image per cell.
+
+    The ruling lines are isolated with morphological opening, the contours
+    of the resulting grid are approximated as polygons, and the bounding
+    rectangles of the four-cornered ones are treated as cells.
+
+    image: single-channel image of one table, e.g. as returned by
+        cv2.imread(path, cv2.IMREAD_GRAYSCALE).
+
+    Returns a list of rows ordered top to bottom. Each row is a list of
+    cell images (slices of `image`) ordered left to right. Returns an empty
+    list when no cell-sized rectangle is found.
+    """
+    BLUR_KERNEL_SIZE = (17, 17)
+    STD_DEV_X_DIRECTION = 0
+    STD_DEV_Y_DIRECTION = 0
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    # The image is inverted first so that dark ruling lines become the bright
+    # foreground the morphological operations below work on.
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+    SCALE = 5
+    # numpy shapes are (rows, columns), i.e. (height, width).
+    image_height, image_width = img_bin.shape
+    # Kernel dimensions must be at least 1 pixel, even for tiny images.
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, image_width // SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, image_height // SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+
+    # Stretch the detected lines so that horizontal and vertical ones meet.
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+
+    mask = horizontally_dilated + vertically_dilated
+    contours, _hierarchy = cv2.findContours(
+        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
+    )
+
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.05 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+
+    # Filter out contours that aren't rectangular. Those that aren't rectangular
+    # are probably noise.
+    approx_rects = [p for p in approx_polys if len(p) == 4]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]
+
+    # Filter out rectangles that are too narrow or too short.
+    MIN_RECT_WIDTH = 40
+    MIN_RECT_HEIGHT = 10
+    bounding_rects = [
+        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
+    ]
+    if not bounding_rects:
+        # Nothing cell-sized was found; max() below would raise on an empty list.
+        return []
+
+    # The largest bounding rectangle is assumed to be the entire table.
+    # Remove it from the list. We don't want to accidentally try to OCR
+    # the entire table. Identity comparison removes exactly that one
+    # rectangle, even if another has the same coordinates.
+    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+    cells = [b for b in bounding_rects if b is not largest_rect]
+
+    def cell_in_same_row(c1, c2):
+        # c1 is in c2's row when c1's vertical center lies strictly between
+        # c2's top and bottom edges. Rectangles are (x, y, width, height).
+        c1_center = c1[1] + c1[3] / 2
+        c2_top = c2[1]
+        c2_bottom = c2[1] + c2[3]
+        return c2_top < c1_center < c2_bottom
+
+    # Repeatedly take the first remaining cell, pull every cell in its row,
+    # and continue with whatever is left.
+    rows = []
+    while cells:
+        first = cells[0]
+        rest = cells[1:]
+        same_row = [c for c in rest if cell_in_same_row(c, first)]
+        rows.append(sorted([first] + same_row, key=lambda c: c[0]))
+        cells = [c for c in rest if not cell_in_same_row(c, first)]
+
+    # Sort rows by average height of their center.
+    def avg_height_of_center(row):
+        centers = [y + h / 2 for x, y, w, h in row]
+        return sum(centers) / len(centers)
+
+    rows.sort(key=avg_height_of_center)
+    return [
+        [image[y:y+h, x:x+w] for x, y, w, h in row]
+        for row in rows
+    ]
+
+# Script entry point: the first command-line argument is the path to the
+# table image to split into cells.
+if __name__ == "__main__":
+    main(sys.argv[1])
--- a/table_ocr/extract_tables.py
+++ b/table_ocr/extract_tables.py
@ -0,0 +1,81 @@
+import argparse
+import os
+
+import cv2
+
+# Command-line interface: one or more paths of images to search for tables.
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+
+def main(files):
+    """
+    Extract every table found in each image and save it next to its source.
+
+    Tables are written as "<image name>-table-NNN.png" in the source image's
+    directory. After all images are processed, prints each image path
+    followed by the table files created from it and a blank line.
+
+    Raises ValueError if an image cannot be read.
+    """
+    results = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        filename_sans_extension = os.path.splitext(filename)[0]
+
+        print("Reading {}".format(f))
+        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+        if image is None:
+            # cv2.imread reports an unreadable file by returning None instead
+            # of raising; fail here rather than deep inside find_tables.
+            raise ValueError("Could not read image: {}".format(f))
+        tables = find_tables(image)
+        # Named distinctly so it does not rebind the `files` being iterated.
+        table_filepaths = []
+        for i, table in enumerate(tables):
+            table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i)
+            table_filepath = os.path.join(directory, table_filename)
+            table_filepaths.append(table_filepath)
+            cv2.imwrite(table_filepath, table)
+        results.append((f, table_filepaths))
+
+    for image_filename, table_filenames in results:
+        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))
+
+
+def find_tables(image):
+    """
+    Find the tables in a page image and return a cropped image of each.
+
+    Ruling lines are isolated with morphological opening; every outer
+    contour of the resulting line mask with an area above MIN_TABLE_AREA is
+    treated as a table.
+
+    image: single-channel page image, e.g. as returned by
+        cv2.imread(path, cv2.IMREAD_GRAYSCALE).
+
+    Returns a list of images (slices of `image`), one per detected table.
+    """
+    BLUR_KERNEL_SIZE = (17, 17)
+    STD_DEV_X_DIRECTION = 0
+    STD_DEV_Y_DIRECTION = 0
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    # The image is inverted first so that dark ruling lines become the bright
+    # foreground the morphological operations below work on.
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+    SCALE = 5
+    # numpy shapes are (rows, columns), i.e. (height, width).
+    image_height, image_width = img_bin.shape
+    # Kernel dimensions must be at least 1 pixel, even for tiny images.
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, image_width // SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, image_height // SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+
+    # Stretch the detected lines so that horizontal and vertical ones meet.
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+
+    mask = horizontally_dilated + vertically_dilated
+    # Only outermost contours are requested: a table's outline, not its cells.
+    contours, _hierarchy = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+
+    MIN_TABLE_AREA = 1e5
+    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.1 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+
+    # The link where a lot of this code was borrowed from recommends an
+    # additional step to check the number of "joints" inside this bounding rectangle.
+    # A table should have a lot of intersections. We might have a rectangular image
+    # here though which would only have 4 intersections, 1 at each corner.
+    # Leaving that step as a future TODO if it is ever necessary.
+    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
+    return images
+
+# Script entry point: every positional command-line argument is an image path.
+if __name__ == "__main__":
+    args = parser.parse_args()
+    files = args.files
+    main(files)
--- a/table_ocr/prepare_pdfs.py
+++ b/table_ocr/prepare_pdfs.py
@ -0,0 +1,100 @@
+import argparse
+import logging
+import os
+import re
+import subprocess
+import sys
+
+from table_ocr.util import working_dir, make_tempdir
+
+
+def get_logger():
+    """
+    Create a logger named after the current module with its own StreamHandler.
+
+    The level is read from the PY_LOG_LVL environment variable (default
+    "info") and applied to both the handler and the logger.
+    """
+    logger = logging.getLogger(__name__)
+    # Upper-cased so that e.g. "debug" matches the logging level name "DEBUG".
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
+
+# Module-level logger; its level is controlled by the PY_LOG_LVL environment variable.
+logger = get_logger()
+
+# Command-line interface: one or more paths of PDF files to convert.
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    """
+    Convert each PDF to images, preprocess every image, then report the results.
+
+    All PDFs are converted before any image is preprocessed. The report
+    prints each PDF path followed by its image paths and a blank line.
+    """
+    pdf_images = [(f, pdf_to_images(f)) for f in files]
+
+    for _pdf, images in pdf_images:
+        for image in images:
+            preprocess_img(image)
+
+    for pdf, images in pdf_images:
+        listing = "\n".join(images)
+        print("{}\n{}\n".format(pdf, listing))
+
+
+
+
+def pdf_to_images(pdf_filepath):
+    """
+    Turn a pdf into images, one set of files written next to the pdf.
+
+    Returns the absolute paths of the images created for `pdf_filepath`.
+    """
+    # Resolve the path before changing directory: a relative path would no
+    # longer point at the pdf (or its directory) once we are inside
+    # `directory`, and a bare filename would give an empty directory name.
+    pdf_filepath = os.path.abspath(pdf_filepath)
+    directory = os.path.dirname(pdf_filepath)
+    with working_dir(directory):
+        image_filenames = pdfimages(pdf_filepath)
+
+    # pdfimages reports bare filenames (it creates the files in the working
+    # directory), so join them back onto the pdf's directory.
+    return [os.path.join(directory, f) for f in image_filenames]
+
+
+
+
+def pdfimages(pdf_filepath):
+    """
+    Uses the `pdfimages` utility from Poppler
+    (https://poppler.freedesktop.org/). Creates images out of each page. Images
+    are prefixed by their name sans extension and suffixed by their page number.
+
+    The images are created in the current working directory. Returns the
+    filenames found in the pdf's directory that match the pdf's name.
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    # splitext strips only the final extension; splitting on ".pdf" would cut
+    # a name such as "a.pdf.scan.pdf" down to "a".
+    filename_sans_ext = os.path.splitext(filename)[0]
+    subprocess.run(["pdfimages", "-png", pdf_filepath, filename_sans_ext])
+    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
+    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    return image_filenames
+
+
+
+
+def find_matching_files_in_dir(file_prefix, directory):
+    """
+    Return the sorted names of the files in `directory` that pdfimages
+    creates for the image root `file_prefix`, i.e. "<file_prefix>-<number>.png".
+    """
+    # Match the whole name so that the prefix "file1" does not also pick up
+    # "file10-000.png", backups such as "file1-000.png.bak", or images derived
+    # from earlier output such as "file1-000-table-000.png".
+    pattern = re.compile(r"{}-\d+\.png".format(re.escape(file_prefix)))
+    # os.listdir returns names in arbitrary order; sort to keep pages in order.
+    return sorted(
+        filename
+        for filename in os.listdir(directory)
+        if pattern.fullmatch(filename)
+    )
+
+
+def preprocess_img(filepath):
+    """
+    Preprocessing steps that shell out to external executables.
+
+    Currently: look up the rotation tesseract reports for the image and
+    apply it in place with mogrify.
+    """
+    degrees = get_rotate(filepath)
+    logger.debug("Rotating {} by {}.".format(filepath, degrees))
+    mogrify(filepath, degrees)
+
+
+
+
+def get_rotate(image_filepath):
+    """
+    Run tesseract with "--psm 0" on the image and return the value that
+    follows "Rotate: " on the first such line of its output, as a string.
+    """
+    raw = subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
+    lines = raw.decode("utf-8").split("\n")
+    rotate_line = next(line for line in lines if "Rotate: " in line)
+    return rotate_line.split(": ")[1]
+
+
+
+
+def mogrify(image_filepath, rotate):
+    """Rotate the image file in place by `rotate` using the mogrify executable."""
+    command = ["mogrify", "-rotate", rotate, image_filepath]
+    subprocess.run(command)
+
+# Script entry point: every positional command-line argument is a PDF path.
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
--- a/table_image_ocr/util.py
+++ b/table_image_ocr/util.py
@ -4,17 +4,20 @@ import logging
 import os
 import tempfile

-from bs4 import BeautifulSoup as bs
-import requests
-
-
-
+def get_logger():
+    """
+    Create a logger named after the current module with its own StreamHandler.
+
+    The level is read from the PY_LOG_LVL environment variable (default
+    "info") and applied to both the handler and the logger.
+    """
+    logger = logging.getLogger(__name__)
+    # Upper-cased so that e.g. "debug" matches the logging level name "DEBUG".
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger

 logger = get_logger()


-
-
@contextmanager
 def working_dir(directory):
    original_working_dir = os.getcwd()
@ -25,12 +28,5 @@ def working_dir(directory):
        os.chdir(original_working_dir)


-def download(url, filepath):
-    response = request_get(url)
-    data = response.content
-    with open(filepath, "wb") as f:
-        f.write(data)
-
-
 def make_tempdir(identifier):
    return tempfile.mkdtemp(prefix="{}_".format(identifier))