diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4277629 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +.DS_Store +.idea +*.log +tmp/ + +*.py[cod] +*.egg +build +htmlcov +dist diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index b8e1f79..0ab5127 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -527,7 +527,7 @@ ocr_image(image, "--psm 7") :header-args: :mkdirp yes :noweb yes :END: -#+BEGIN_SRC python :tangle pdf/__init__.py :mkdirp yes :results none +#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none #+END_SRC @@ -535,54 +535,54 @@ ocr_image(image, "--psm 7") #+BEGIN_SRC python :tangle setup.py :results none import setuptools -with open("README.md", "r") as fh: - long_description = fh.read() +long_description = """ +Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV. +Requires binaries for tesseract and pdfimages (from Poppler). +""" setuptools.setup( - name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username + name="table_ocr", version="0.0.1", - author="Example Author", - author_email="author@example.com", - description="A small example package", + author="Eric Ihli", + author_email="eihli@owoga.com", + description="Turn images of tables into CSV data.", long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/pypa/sampleproject", + long_description_content_type="text/plain", + url="https://github.com/eihli/image-table-ocr", packages=setuptools.find_packages(), classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], + install_requires=[ + "pytesseract~=0.3", + "opencv-python~=4.2", + ], python_requires='>=3.6', ) #+END_SRC -** table_image_ocr -*** table_image_ocr/__init__.py -#+BEGIN_SRC python :tangle table_image_ocr/__init__.py :mkdirp yes :results none +** table_ocr +*** table_ocr/__init__.py +#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none #+END_SRC -*** table_image_ocr/util.py +*** table_ocr/util.py -#+BEGIN_SRC python :tangle table_image_ocr/util.py :mkdirp yes :results none +#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none from contextlib import contextmanager import functools import logging import os import tempfile -from bs4 import BeautifulSoup as bs -import requests - - -<> +<> logger = get_logger() -<> - @contextmanager def working_dir(directory): original_working_dir = os.getcwd() @@ -593,18 +593,11 @@ def working_dir(directory): os.chdir(original_working_dir) -def download(url, filepath): - response = request_get(url) - data = response.content - with open(filepath, "wb") as f: - f.write(data) - - def make_tempdir(identifier): return tempfile.mkdtemp(prefix="{}_".format(identifier)) #+END_SRC -*** table_image_ocr/prepare_pdfs.py +*** table_ocr/prepare_pdfs.py Takes a variable number of pdf files and creates images out of each page of the file using ~pdfimages~ from Poppler. Images are created in the same directory @@ -614,11 +607,11 @@ Prints each pdf followed by the images extracted from that pdf followed by a blank line. #+BEGIN_SRC shell :eval no :exports code -python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ... +python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ... 
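+# The get_logger helper (see the Logging section below) reads the PY_LOG_LVL
+# environment variable, so verbosity can be raised per run, e.g.:
+PY_LOG_LVL=debug python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf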
#+END_SRC -#+BEGIN_SRC python :tangle pdf/prepare_pdfs.py :noweb yes +#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes import argparse import logging import os @@ -626,7 +619,7 @@ import re import subprocess import sys -from pdf.util import request_get, working_dir, download, make_tempdir +from table_ocr.util import working_dir, make_tempdir <> @@ -657,9 +650,7 @@ if __name__ == "__main__": main(args.files) #+END_SRC -#+RESULTS: - -*** table_image_ocr/extract_tables.py +*** table_ocr/extract_tables.py #+BEGIN_SRC shell . ~/.virtualenvs/lotto_odds/bin/activate @@ -670,7 +661,7 @@ python -m pdf.extract_tables "resources/examples/example-page.png" | resources/examples/example-page.png | | resources/examples/example-page-table-000.png | -#+BEGIN_SRC python :noweb yes :tangle pdf/extract_tables.py :results none +#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_tables.py :results none import argparse import os @@ -684,6 +675,7 @@ def main(files): results = [] for f in files: directory, filename = os.path.split(f) + image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) tables = find_tables(image) files = [] @@ -706,14 +698,14 @@ if __name__ == "__main__": main(files) #+END_SRC -*** table_image_ocr/extract_cells_from_table.py +*** table_ocr/extract_cells_from_table.py #+BEGIN_SRC shell :results none . ~/.virtualenvs/lotto_odds/bin/activate python -m pdf.extract_cells_from_table "resources/examples/example-table.png" #+END_SRC -#+BEGIN_SRC python :noweb yes :tangle pdf/extract_cells_from_table.py :results none +#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none import os import sys @@ -784,3 +776,17 @@ with ~advice-add~. (advice-add 'org-babel-execute-src-block :before #'remove-attributes-from-src-block-result) #+END_SRC +** Logging + +#+BEGIN_SRC python :eval query :noweb-ref get-logger +def get_logger(): + logger = logging.getLogger(__name__) + lvl = os.environ.get("PY_LOG_LVL", "info").upper() + handler = logging.StreamHandler() + formatter = logging.Formatter(logging.BASIC_FORMAT) + handler.setFormatter(formatter) + logger.addHandler(handler) + handler.setLevel(lvl) + logger.setLevel(lvl) + return logger +#+END_SRC diff --git a/setup.py b/setup.py index 1b90591..75f821b 100644 --- a/setup.py +++ b/setup.py @@ -1,22 +1,28 @@ import setuptools -with open("README.md", "r") as fh: - long_description = fh.read() +long_description = """ +Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV. +Requires binaries for tesseract and pdfimages (from Poppler). 
+""" setuptools.setup( - name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username + name="table_ocr", version="0.0.1", - author="Example Author", - author_email="author@example.com", - description="A small example package", + author="Eric Ihli", + author_email="eihli@owoga.com", + description="Turn images of tables into CSV data.", long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/pypa/sampleproject", + long_description_content_type="text/plain", + url="https://github.com/eihli/image-table-ocr", packages=setuptools.find_packages(), classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], + install_requires=[ + "pytesseract~=0.3", + "opencv-python~=4.2", + ], python_requires='>=3.6', ) diff --git a/table_ocr.egg-info/PKG-INFO b/table_ocr.egg-info/PKG-INFO new file mode 100644 index 0000000..b3c8fe5 --- /dev/null +++ b/table_ocr.egg-info/PKG-INFO @@ -0,0 +1,19 @@ +Metadata-Version: 2.1 +Name: table-ocr +Version: 0.0.1 +Summary: Turn images of tables into CSV data. +Home-page: https://github.com/eihli/image-table-ocr +Author: Eric Ihli +Author-email: eihli@owoga.com +License: UNKNOWN +Description: + Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV. + + Requires binaries for tesseract and pdfimages (from Poppler). + +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Requires-Python: >=3.6 +Description-Content-Type: text/plain diff --git a/table_ocr.egg-info/SOURCES.txt b/table_ocr.egg-info/SOURCES.txt new file mode 100644 index 0000000..1d59802 --- /dev/null +++ b/table_ocr.egg-info/SOURCES.txt @@ -0,0 +1,11 @@ +setup.py +table_ocr/__init__.py +table_ocr/extract_cells_from_table.py +table_ocr/extract_tables.py +table_ocr/prepare_pdfs.py +table_ocr/util.py +table_ocr.egg-info/PKG-INFO +table_ocr.egg-info/SOURCES.txt +table_ocr.egg-info/dependency_links.txt +table_ocr.egg-info/requires.txt +table_ocr.egg-info/top_level.txt \ No newline at end of file diff --git a/table_image_ocr/__init__.py b/table_ocr.egg-info/dependency_links.txt similarity index 100% rename from table_image_ocr/__init__.py rename to table_ocr.egg-info/dependency_links.txt diff --git a/table_ocr.egg-info/requires.txt b/table_ocr.egg-info/requires.txt new file mode 100644 index 0000000..f89f746 --- /dev/null +++ b/table_ocr.egg-info/requires.txt @@ -0,0 +1,2 @@ +pytesseract~=0.3 +opencv-python~=4.2 diff --git a/table_ocr.egg-info/top_level.txt b/table_ocr.egg-info/top_level.txt new file mode 100644 index 0000000..20f2e4f --- /dev/null +++ b/table_ocr.egg-info/top_level.txt @@ -0,0 +1 @@ +table_ocr diff --git a/table_ocr/__init__.py b/table_ocr/__init__.py new file mode 100644 index 0000000..b28b04f --- /dev/null +++ b/table_ocr/__init__.py @@ -0,0 +1,3 @@ + + + diff --git a/table_ocr/extract_cells_from_table.py b/table_ocr/extract_cells_from_table.py new file mode 100644 index 0000000..2dcbbd4 --- /dev/null +++ b/table_ocr/extract_cells_from_table.py @@ -0,0 +1,119 @@ +import os +import sys + +import cv2 +import pytesseract + +def main(f): + results = [] + directory, filename = os.path.split(f) + table = cv2.imread(f, cv2.IMREAD_GRAYSCALE) + rows = extract_cell_images_from_table(table) + cell_img_dir = os.path.join(directory, "cells") + os.makedirs(cell_img_dir, exist_ok=True) + for i, row in enumerate(rows): + for j, cell in 
enumerate(row):
+            cell_filename = "{:03d}-{:03d}.png".format(i, j)
+            path = os.path.join(cell_img_dir, cell_filename)
+            cv2.imwrite(path, cell)
+            print(cell_filename)
+
+
+def extract_cell_images_from_table(image):
+    BLUR_KERNEL_SIZE = (17, 17)
+    STD_DEV_X_DIRECTION = 0
+    STD_DEV_Y_DIRECTION = 0
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+    vertical = horizontal = img_bin.copy()
+    SCALE = 5
+    image_height, image_width = horizontal.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+
+    mask = horizontally_dilated + vertically_dilated
+    contours, hierarchy = cv2.findContours(
+        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
+    )
+
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.05 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+
+    # Filter out contours that aren't rectangular. Those that aren't rectangular
+    # are probably noise.
+    approx_rects = [p for p in approx_polys if len(p) == 4]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]
+
+    # Filter out rectangles that are too narrow or too short.
+    MIN_RECT_WIDTH = 40
+    MIN_RECT_HEIGHT = 10
+    bounding_rects = [
+        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
+    ]
+
+    # The largest bounding rectangle is assumed to be the entire table.
+    # Remove it from the list. We don't want to accidentally try to OCR
+    # the entire table.
+    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+    bounding_rects = [b for b in bounding_rects if b is not largest_rect]
+
+    cells = [c for c in bounding_rects]
+    def cell_in_same_row(c1, c2):
+        c1_center = c1[1] + c1[3] - c1[3] / 2
+        c2_bottom = c2[1] + c2[3]
+        c2_top = c2[1]
+        return c2_top < c1_center < c2_bottom
+
+    orig_cells = [c for c in cells]
+    rows = []
+    while cells:
+        first = cells[0]
+        rest = cells[1:]
+        cells_in_same_row = sorted(
+            [
+                c for c in rest
+                if cell_in_same_row(c, first)
+            ],
+            key=lambda c: c[0]
+        )
+
+        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
+        rows.append(row_cells)
+        cells = [
+            c for c in rest
+            if not cell_in_same_row(c, first)
+        ]
+
+    # Sort rows by average height of their center.
+    def avg_height_of_center(row):
+        centers = [y + h - h / 2 for x, y, w, h in row]
+        return sum(centers) / len(centers)
+
+    rows.sort(key=avg_height_of_center)
+    cell_images_rows = []
+    for row in rows:
+        cell_images_row = []
+        for x, y, w, h in row:
+            cell_images_row.append(image[y:y+h, x:x+w])
+        cell_images_rows.append(cell_images_row)
+    return cell_images_rows
+
+if __name__ == "__main__":
+    main(sys.argv[1])
diff --git a/table_ocr/extract_tables.py b/table_ocr/extract_tables.py
new file mode 100644
index 0000000..963e9f6
--- /dev/null
+++ b/table_ocr/extract_tables.py
@@ -0,0 +1,81 @@
+import argparse
+import os
+
+import cv2
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+
+def main(files):
+    results = []
+    for f in files:
+        directory, filename = os.path.split(f)
+
+        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+        print("Reading {}".format(f))
+        tables = find_tables(image)
+        files = []
+        for i, table in enumerate(tables):
+            filename_sans_extension = os.path.splitext(filename)[0]
+            table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i)
+            table_filepath = os.path.join(directory, table_filename)
+            files.append(table_filepath)
+            cv2.imwrite(table_filepath, table)
+        results.append((f, files))
+
+    for image_filename, table_filenames in results:
+        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))
+
+def find_tables(image):
+    BLUR_KERNEL_SIZE = (17, 17)
+    STD_DEV_X_DIRECTION = 0
+    STD_DEV_Y_DIRECTION = 0
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+    vertical = horizontal = img_bin.copy()
+    SCALE = 5
+    image_height, image_width = horizontal.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+
+    mask = horizontally_dilated + vertically_dilated
+    contours, hierarchy = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+
+    MIN_TABLE_AREA = 1e5
+    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.1 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+
+    # The link where a lot of this code was borrowed from recommends an
+    # additional step to check the number of "joints" inside this bounding rectangle.
+    # A table should have a lot of intersections. We might have a rectangular image
+    # here though which would only have 4 intersections, 1 at each corner.
+    # Leaving that step as a future TODO if it is ever necessary.
+    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
+    return images
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    files = args.files
+    main(files)
diff --git a/table_ocr/prepare_pdfs.py b/table_ocr/prepare_pdfs.py
new file mode 100644
index 0000000..53ad6b6
--- /dev/null
+++ b/table_ocr/prepare_pdfs.py
@@ -0,0 +1,100 @@
+import argparse
+import logging
+import os
+import re
+import subprocess
+import sys
+
+from table_ocr.util import working_dir, make_tempdir
+
+
+def get_logger():
+    logger = logging.getLogger(__name__)
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
+
+logger = get_logger()
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    pdf_images = []
+    for f in files:
+        pdf_images.append((f, pdf_to_images(f)))
+
+    for pdf, images in pdf_images:
+        for image in images:
+            preprocess_img(image)
+
+    for pdf, images in pdf_images:
+        print("{}\n{}\n".format(pdf, "\n".join(images)))
+
+
+def pdf_to_images(pdf_filepath):
+    """
+    Turn a pdf into images
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    with working_dir(directory):
+        image_filenames = pdfimages(pdf_filepath)
+
+    # pdfimages names the files it creates after their page number but doesn't
+    # return that list, so the full paths are rebuilt here.
+    return [os.path.join(directory, f) for f in image_filenames]
+
+
+def pdfimages(pdf_filepath):
+    """
+    Uses the `pdfimages` utility from Poppler (https://poppler.freedesktop.org/)
+    to create an image for each page. Output images are prefixed with the pdf's
+    filename sans extension and suffixed with their page number.
+    """
+    directory, filename = os.path.split(pdf_filepath)
+    filename_sans_ext = filename.split(".pdf")[0]
+    subprocess.run(["pdfimages", "-png", pdf_filepath, filename_sans_ext])
+    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
+    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    return image_filenames
+
+
+def find_matching_files_in_dir(file_prefix, directory):
+    files = [
+        filename
+        for filename in os.listdir(directory)
+        if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename)
+    ]
+    return files
+def preprocess_img(filepath):
+    """
+    Preprocessing that shells out to external executables, e.g. tesseract to
+    detect rotation and mogrify to apply it.
+ """ + rotate = get_rotate(filepath) + logger.debug("Rotating {} by {}.".format(filepath, rotate)) + mogrify(filepath, rotate) + + +def get_rotate(image_filepath): + output = ( + subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"]) + .decode("utf-8") + .split("\n") + ) + output = next(l for l in output if "Rotate: " in l) + output = output.split(": ")[1] + return output + + +def mogrify(image_filepath, rotate): + subprocess.run(["mogrify", "-rotate", rotate, image_filepath]) + +if __name__ == "__main__": + args = parser.parse_args() + main(args.files) diff --git a/table_image_ocr/util.py b/table_ocr/util.py similarity index 54% rename from table_image_ocr/util.py rename to table_ocr/util.py index c35c689..8c6d3bb 100644 --- a/table_image_ocr/util.py +++ b/table_ocr/util.py @@ -4,17 +4,20 @@ import logging import os import tempfile -from bs4 import BeautifulSoup as bs -import requests - - - +def get_logger(): + logger = logging.getLogger(__name__) + lvl = os.environ.get("PY_LOG_LVL", "info").upper() + handler = logging.StreamHandler() + formatter = logging.Formatter(logging.BASIC_FORMAT) + handler.setFormatter(formatter) + logger.addHandler(handler) + handler.setLevel(lvl) + logger.setLevel(lvl) + return logger logger = get_logger() - - @contextmanager def working_dir(directory): original_working_dir = os.getcwd() @@ -25,12 +28,5 @@ def working_dir(directory): os.chdir(original_working_dir) -def download(url, filepath): - response = request_get(url) - data = response.content - with open(filepath, "wb") as f: - f.write(data) - - def make_tempdir(identifier): return tempfile.mkdtemp(prefix="{}_".format(identifier))
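
Taken together, the modules added in this diff form a pipeline: ~prepare_pdfs~ turns a PDF into deskewed page images, ~extract_tables~ crops the table regions out of each page, and ~extract_cells_from_table~ splits a table image into rows of cell images. Below is a minimal sketch of driving that pipeline from Python. It assumes the package has been installed (e.g. ~pip install .~) and that the ~tesseract~, ~pdfimages~, and ~mogrify~ binaries are on the PATH; the final per-cell OCR/CSV step uses pytesseract with ~--psm 7~ purely as an illustration and is not defined anywhere in this diff.

#+BEGIN_SRC python :eval no
import csv
import sys

import cv2
import pytesseract

from table_ocr.extract_cells_from_table import extract_cell_images_from_table
from table_ocr.extract_tables import find_tables
from table_ocr.prepare_pdfs import pdf_to_images, preprocess_img


def pdf_to_csv(pdf_filepath):
    # PDF -> page images (pdfimages), rotated in place (tesseract + mogrify).
    page_image_paths = pdf_to_images(pdf_filepath)
    for path in page_image_paths:
        preprocess_img(path)

    writer = csv.writer(sys.stdout)
    for path in page_image_paths:
        page = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        # Page image -> cropped table images -> rows of cell images.
        for table in find_tables(page):
            for row in extract_cell_images_from_table(table):
                # Cell image -> text. "--psm 7" treats each cell as a single
                # line of text; this OCR step is illustrative only.
                writer.writerow(
                    pytesseract.image_to_string(cell, config="--psm 7").strip()
                    for cell in row
                )


if __name__ == "__main__":
    pdf_to_csv(sys.argv[1])
#+END_SRC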
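The row-grouping step in ~extract_cell_images_from_table~ is easier to see with toy numbers: a candidate cell belongs to the row seeded by ~first~ when the candidate's vertical center falls between the seed's top and bottom edges. The rectangles below are made up purely for illustration; the predicate is copied from the code above (note that ~y + h - h / 2~ is just ~y + h / 2~).

#+BEGIN_SRC python :eval no
# Same predicate as in table_ocr.extract_cells_from_table; the rectangles are
# hypothetical (x, y, w, h) tuples with y growing downward.
def cell_in_same_row(c1, c2):
    c1_center = c1[1] + c1[3] - c1[3] / 2  # vertical center of the candidate
    c2_bottom = c2[1] + c2[3]
    c2_top = c2[1]
    return c2_top < c1_center < c2_bottom

seed = (10, 100, 120, 40)      # row seed; its row spans y in (100, 140)
neighbor = (140, 105, 80, 38)  # center y = 124, inside (100, 140) -> same row
below = (10, 150, 120, 40)     # center y = 170, outside (100, 140) -> new row

assert cell_in_same_row(neighbor, seed)
assert not cell_in_same_row(below, seed)
#+END_SRC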