Add gitignore, rename modules, remove unused code

main
Eric Ihli 5 years ago
parent 8546902e64
commit 78e9cdb3f5

10
.gitignore vendored

@ -0,0 +1,10 @@
.DS_Store
.idea
*.log
tmp/
*.py[cod]
*.egg
build
htmlcov
dist

@ -527,7 +527,7 @@ ocr_image(image, "--psm 7")
:header-args: :mkdirp yes :noweb yes
:END:
#+BEGIN_SRC python :tangle pdf/__init__.py :mkdirp yes :results none
#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
#+END_SRC
@ -535,54 +535,54 @@ ocr_image(image, "--psm 7")
#+BEGIN_SRC python :tangle setup.py :results none
import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
long_description = """
Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
Requires binaries for tesseract and pdfimages (from Poppler).
"""
setuptools.setup(
name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
name="table_ocr",
version="0.0.1",
author="Example Author",
author_email="author@example.com",
description="A small example package",
author="Eric Ihli",
author_email="eihli@owoga.com",
description="Turn images of tables into CSV data.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/pypa/sampleproject",
long_description_content_type="text/plain",
url="https://github.com/eihli/image-table-ocr",
packages=setuptools.find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
install_requires=[
"pytesseract~=0.3",
"opencv-python~=4.2",
],
python_requires='>=3.6',
)
#+END_SRC
** table_image_ocr
*** table_image_ocr/__init__.py
#+BEGIN_SRC python :tangle table_image_ocr/__init__.py :mkdirp yes :results none
** table_ocr
*** table_ocr/__init__.py
#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
#+END_SRC
*** table_image_ocr/util.py
*** table_ocr/util.py
#+BEGIN_SRC python :tangle table_image_ocr/util.py :mkdirp yes :results none
#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none
from contextlib import contextmanager
import functools
import logging
import os
import tempfile
from bs4 import BeautifulSoup as bs
import requests
<<get_logger>>
<<get-logger>>
logger = get_logger()
<<request_cacheing>>
@contextmanager
def working_dir(directory):
original_working_dir = os.getcwd()
@ -593,18 +593,11 @@ def working_dir(directory):
os.chdir(original_working_dir)
def download(url, filepath):
    """Fetch ``url`` with ``request_get`` and save the response body to ``filepath``.

    The body (``response.content``) is written in binary mode, replacing any
    existing file at ``filepath``.
    """
    payload = request_get(url).content
    with open(filepath, "wb") as out_file:
        out_file.write(payload)
def make_tempdir(identifier):
    """Create a new temporary directory whose name starts with ``<identifier>_``.

    Returns the path of the created directory as given by ``tempfile.mkdtemp``.
    """
    prefix = f"{identifier}_"
    return tempfile.mkdtemp(prefix=prefix)
#+END_SRC
*** table_image_ocr/prepare_pdfs.py
*** table_ocr/prepare_pdfs.py
Takes a variable number of pdf files and creates images out of each page of the
file using ~pdfimages~ from Poppler. Images are created in the same directory
@ -614,11 +607,11 @@ Prints each pdf followed by the images extracted from that pdf followed by a
blank line.
#+BEGIN_SRC shell :eval no :exports code
python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
#+END_SRC
#+BEGIN_SRC python :tangle pdf/prepare_pdfs.py :noweb yes
#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes
import argparse
import logging
import os
@ -626,7 +619,7 @@ import re
import subprocess
import sys
from pdf.util import request_get, working_dir, download, make_tempdir
from table_ocr.util import working_dir, make_tempdir
<<get-logger>>
@ -657,9 +650,7 @@ if __name__ == "__main__":
main(args.files)
#+END_SRC
#+RESULTS:
*** table_image_ocr/extract_tables.py
*** table_ocr/extract_tables.py
#+BEGIN_SRC shell
. ~/.virtualenvs/lotto_odds/bin/activate
@ -670,7 +661,7 @@ python -m pdf.extract_tables "resources/examples/example-page.png"
| resources/examples/example-page.png |
| resources/examples/example-page-table-000.png |
#+BEGIN_SRC python :noweb yes :tangle pdf/extract_tables.py :results none
#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_tables.py :results none
import argparse
import os
@ -684,6 +675,7 @@ def main(files):
results = []
for f in files:
directory, filename = os.path.split(f)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
tables = find_tables(image)
files = []
@ -706,14 +698,14 @@ if __name__ == "__main__":
main(files)
#+END_SRC
*** table_image_ocr/extract_cells_from_table.py
*** table_ocr/extract_cells_from_table.py
#+BEGIN_SRC shell :results none
. ~/.virtualenvs/lotto_odds/bin/activate
python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
#+END_SRC
#+BEGIN_SRC python :noweb yes :tangle pdf/extract_cells_from_table.py :results none
#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none
import os
import sys
@ -784,3 +776,17 @@ with ~advice-add~.
(advice-add 'org-babel-execute-src-block :before #'remove-attributes-from-src-block-result)
#+END_SRC
** Logging
#+BEGIN_SRC python :eval query :noweb-ref get-logger
def get_logger():
    """Return this module's logger, configuring it on first use.

    The level name is read from the ``PY_LOG_LVL`` environment variable
    (case-insensitive, default ``"info"``). On the first call a
    ``StreamHandler`` using ``logging.BASIC_FORMAT`` is attached and both the
    handler and the logger are set to that level.

    ``logging.getLogger`` hands back the same object for the same name, so
    later calls return the already-configured logger untouched instead of
    stacking another handler on it (which would emit every record twice).

    Raises:
        ValueError: if ``PY_LOG_LVL`` is not a level name ``logging`` knows.
    """
    logger = logging.getLogger(__name__)
    if logger.handlers:
        # Already configured; adding another handler would duplicate output.
        return logger
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    # Validate the level on the handler before attaching it, so a bad
    # PY_LOG_LVL does not leave a half-configured logger behind.
    handler.setLevel(lvl)
    logger.setLevel(lvl)
    logger.addHandler(handler)
    return logger
#+END_SRC

@ -1,22 +1,28 @@
import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
long_description = """
Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
Requires binaries for tesseract and pdfimages (from Poppler).
"""
setuptools.setup(
name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
name="table_ocr",
version="0.0.1",
author="Example Author",
author_email="author@example.com",
description="A small example package",
author="Eric Ihli",
author_email="eihli@owoga.com",
description="Turn images of tables into CSV data.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/pypa/sampleproject",
long_description_content_type="text/plain",
url="https://github.com/eihli/image-table-ocr",
packages=setuptools.find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
install_requires=[
"pytesseract~=0.3",
"opencv-python~=4.2",
],
python_requires='>=3.6',
)

@ -0,0 +1,19 @@
Metadata-Version: 2.1
Name: table-ocr
Version: 0.0.1
Summary: Turn images of tables into CSV data.
Home-page: https://github.com/eihli/image-table-ocr
Author: Eric Ihli
Author-email: eihli@owoga.com
License: UNKNOWN
Description:
Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
Requires binaries for tesseract and pdfimages (from Poppler).
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/plain

@ -0,0 +1,11 @@
setup.py
table_ocr/__init__.py
table_ocr/extract_cells_from_table.py
table_ocr/extract_tables.py
table_ocr/prepare_pdfs.py
table_ocr/util.py
table_ocr.egg-info/PKG-INFO
table_ocr.egg-info/SOURCES.txt
table_ocr.egg-info/dependency_links.txt
table_ocr.egg-info/requires.txt
table_ocr.egg-info/top_level.txt

@ -0,0 +1,2 @@
pytesseract~=0.3
opencv-python~=4.2

@ -0,0 +1,119 @@
import os
import sys
import cv2
import pytesseract
def main(f):
    """Cut the table image at path ``f`` into cell images and save them.

    The image is read as grayscale and handed to
    ``extract_cell_images_from_table``. Every returned cell is written to a
    ``cells`` directory created alongside ``f`` (existing directories are
    reused) under the name ``<row>-<column>.png`` with zero-padded,
    three-digit indices. Each file name is printed after it is written.
    """
    table_dir = os.path.dirname(f)
    table_image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    cell_rows = extract_cell_images_from_table(table_image)

    out_dir = os.path.join(table_dir, "cells")
    os.makedirs(out_dir, exist_ok=True)

    for row_index, cell_row in enumerate(cell_rows):
        for col_index, cell_image in enumerate(cell_row):
            cell_filename = "{:03d}-{:03d}.png".format(row_index, col_index)
            cv2.imwrite(os.path.join(out_dir, cell_filename), cell_image)
            print(cell_filename)
def extract_cell_images_from_table(image):
    """Split an image of a table into per-cell sub-images.

    The image is blurred, inverted and adaptively thresholded, then long
    horizontal and vertical strokes are isolated with morphological opening
    and dilated into a mask of the table's ruling lines. Contours of that mask
    are reduced to bounding rectangles, small rectangles are dropped, and the
    largest remaining rectangle (assumed to be the whole table) is discarded.
    What is left is treated as the cells.

    Args:
        image: the table image as a 2-D array (``main`` reads it with
            ``cv2.IMREAD_GRAYSCALE``).

    Returns:
        A list of rows ordered top to bottom; each row is a list of slices of
        ``image`` ordered left to right. Returns an empty list when no
        rectangle survives the size filter.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)

    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # Threshold the inverted image so dark lines become the bright foreground.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    SCALE = 5
    # NOTE(review): ndarray.shape of a 2-D image is (rows, columns), i.e.
    # (height, width), so these two names look swapped: the horizontal kernel
    # ends up sized from the image height and the vertical one from the width.
    # Left unchanged because the detection thresholds were presumably tuned
    # against this behavior -- confirm the intent before "fixing" it.
    image_width, image_height = img_bin.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    # Stretch the detected strokes so that lines which nearly touch join up.
    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, _hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )

    # Simplify each contour with a tolerance of 5% of its perimeter.
    approx_polys = [
        cv2.approxPolyDP(contour, 0.05 * cv2.arcLength(contour, True), True)
        for contour in contours
    ]
    # NOTE(review): earlier code also built a list of only the 4-vertex
    # polygons ("probably rectangles") but never used it; every polygon has
    # always been turned into a bounding rectangle. That behavior is kept.
    bounding_rects = [cv2.boundingRect(poly) for poly in approx_polys]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]
    if not bounding_rects:
        # Nothing table-like was found; max() below would raise on an empty list.
        return []

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    cells = [b for b in bounding_rects if b is not largest_rect]

    rows = _group_rects_into_rows(cells)
    return [[image[y:y + h, x:x + w] for x, y, w, h in row] for row in rows]


def _group_rects_into_rows(rects):
    """Group ``(x, y, w, h)`` rectangles into rows, top to bottom, each left to right.

    A rectangle joins the row seeded by the first not-yet-grouped rectangle
    when its vertical center lies strictly between that seed's top and bottom
    edges. Rows are then ordered by the mean vertical center of their members.
    """
    def in_same_row(candidate, seed):
        center = candidate[1] + candidate[3] / 2
        return seed[1] < center < seed[1] + seed[3]

    rows = []
    remaining = list(rects)
    while remaining:
        seed, rest = remaining[0], remaining[1:]
        row = [seed] + [r for r in rest if in_same_row(r, seed)]
        rows.append(sorted(row, key=lambda r: r[0]))
        remaining = [r for r in rest if not in_same_row(r, seed)]

    rows.sort(key=lambda row: sum(y + h / 2 for _, y, _, h in row) / len(row))
    return rows
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the path of the
    # table image to split into cells.
    main(sys.argv[1])

@ -0,0 +1,81 @@
import argparse
import os
import cv2
# Command-line interface: accepts one or more positional "files" arguments,
# which main() reads as images.
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
    """Extract every table found in each image and save it next to the image.

    For each path in ``files`` the image is read as grayscale, a
    ``Reading <path>`` line is printed, and each region returned by
    ``find_tables`` is written to ``<name>-table-NNN.png`` in the image's own
    directory (``NNN`` is the zero-padded table index). After all images are
    processed, each input path is printed followed by the table files created
    from it and a blank line.
    """
    extracted = []
    for image_path in files:
        directory, filename = os.path.split(image_path)
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        print("Reading {}".format(image_path))
        tables = find_tables(image)

        stem = os.path.splitext(filename)[0]
        table_paths = []
        for index, table in enumerate(tables):
            table_path = os.path.join(
                directory, "{}-table-{:03d}.png".format(stem, index)
            )
            table_paths.append(table_path)
            cv2.imwrite(table_path, table)
        extracted.append((image_path, table_paths))

    for image_filename, table_filenames in extracted:
        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))
def find_tables(image):
    """Return crops of ``image`` for each large ruled region found in it.

    The image is blurred, inverted and adaptively thresholded; long horizontal
    and vertical strokes are isolated by morphological opening, dilated, and
    combined into a mask. Outer contours of that mask whose area exceeds
    ``MIN_TABLE_AREA`` are simplified and their bounding rectangles are used
    to slice the original image.

    Returns:
        A list of slices of ``image``, one per detected region.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    SCALE = 5
    MIN_TABLE_AREA = 1e5

    def rect_kernel(size):
        return cv2.getStructuringElement(cv2.MORPH_RECT, size)

    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    # Threshold the inverted image so dark lines become the bright foreground.
    binary = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    # NOTE(review): ndarray.shape of a 2-D image is (rows, columns), so the
    # horizontal kernel below is sized from the first axis (height) and the
    # vertical kernel from the second (width). This mirrors the existing
    # behavior exactly -- confirm whether the two were meant to be swapped.
    first_axis, second_axis = binary.shape
    horizontal_lines = cv2.dilate(
        cv2.morphologyEx(binary, cv2.MORPH_OPEN, rect_kernel((int(first_axis / SCALE), 1))),
        rect_kernel((40, 1)),
    )
    vertical_lines = cv2.dilate(
        cv2.morphologyEx(binary, cv2.MORPH_OPEN, rect_kernel((1, int(second_axis / SCALE)))),
        rect_kernel((1, 60)),
    )

    mask = horizontal_lines + vertical_lines
    contours, _hierarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside each bounding
    # rectangle. A table should have a lot of intersections; a plain
    # rectangular image would only have 4, one at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = []
    for contour in contours:
        if not cv2.contourArea(contour) > MIN_TABLE_AREA:
            continue
        epsilon = 0.1 * cv2.arcLength(contour, True)
        x, y, w, h = cv2.boundingRect(cv2.approxPolyDP(contour, epsilon, True))
        images.append(image[y:y+h, x:x+w])
    return images
if __name__ == "__main__":
    # Script entry point: every positional argument is an image to scan for tables.
    main(parser.parse_args().files)

@ -0,0 +1,100 @@
import argparse
import logging
import os
import re
import subprocess
import sys
from table_ocr.util import working_dir, make_tempdir
def get_logger():
    """Return this module's logger, configuring it on first use.

    The level name is read from the ``PY_LOG_LVL`` environment variable
    (case-insensitive, default ``"info"``). On the first call a
    ``StreamHandler`` using ``logging.BASIC_FORMAT`` is attached and both the
    handler and the logger are set to that level.

    ``logging.getLogger`` hands back the same object for the same name, so
    later calls return the already-configured logger untouched instead of
    stacking another handler on it (which would emit every record twice).

    Raises:
        ValueError: if ``PY_LOG_LVL`` is not a level name ``logging`` knows.
    """
    logger = logging.getLogger(__name__)
    if logger.handlers:
        # Already configured; adding another handler would duplicate output.
        return logger
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    # Validate the level on the handler before attaching it, so a bad
    # PY_LOG_LVL does not leave a half-configured logger behind.
    handler.setLevel(lvl)
    logger.setLevel(lvl)
    logger.addHandler(handler)
    return logger
# Module-level logger, configured by get_logger() when this module is loaded.
logger = get_logger()
# Command-line interface: accepts one or more positional "files" arguments,
# which main() treats as PDF paths.
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
    """Convert each PDF to page images, preprocess them, and report the results.

    All PDFs in ``files`` are converted first (``pdf_to_images``), then every
    produced image is passed to ``preprocess_img``. Finally each PDF path is
    printed followed by its image paths, one per line, and a blank line.
    """
    pdf_images = [(pdf, pdf_to_images(pdf)) for pdf in files]

    for _pdf, images in pdf_images:
        for image in images:
            preprocess_img(image)

    for pdf, images in pdf_images:
        print("{}\n{}\n".format(pdf, "\n".join(images)))
def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images, created in the same directory as the pdf.

    Args:
        pdf_filepath: path to the pdf; may be absolute or relative to the
            current working directory.

    Returns:
        The image paths, each formed by joining the directory part of
        ``pdf_filepath`` (as given) with an image file name.
    """
    directory, _filename = os.path.split(pdf_filepath)
    # Resolve the path before changing directory. A relative path would
    # otherwise be interpreted a second time from inside its own directory,
    # and a bare file name has an empty directory part that cannot be entered.
    absolute_pdf = os.path.abspath(pdf_filepath)
    # working_dir is expected to switch into the given directory for the
    # duration of the block and restore the previous one afterwards.
    with working_dir(os.path.dirname(absolute_pdf)):
        image_filenames = pdfimages(absolute_pdf)

    # Since pdfimages creates a number of files, each named for their page
    # number, and only gives us back bare file names, re-attach the directory.
    return [os.path.join(directory, f) for f in image_filenames]
def pdfimages(pdf_filepath):
    """
    Uses the `pdfimages` utility from Poppler
    (https://poppler.freedesktop.org/). Creates images out of each page. Images
    are prefixed by their name sans extension and suffixed by their page number.

    The images are written relative to the current working directory, so the
    caller is expected to have made the pdf's directory current.

    Returns:
        The file names (without directory) in the pdf's directory that start
        with the pdf's name sans extension and end in ``.png``.
    """
    directory, filename = os.path.split(pdf_filepath)
    # splitext strips only the final extension, whatever its case, whereas
    # splitting on ".pdf" truncated names that contain ".pdf" elsewhere.
    filename_sans_ext = os.path.splitext(filename)[0]

    completed = subprocess.run(["pdfimages", "-png", pdf_filepath, filename_sans_ext])
    if completed.returncode != 0:
        # Keep going (best effort), but do not hide the failure.
        logger.warning(
            "pdfimages exited with status {} for {}".format(completed.returncode, pdf_filepath)
        )

    # An empty directory part means "the current directory"; os.listdir("")
    # would raise instead.
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory or os.curdir)
    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
    return image_filenames
def find_matching_files_in_dir(file_prefix, directory):
    """Return the names of the ``.png`` files in ``directory`` starting with ``file_prefix``.

    ``file_prefix`` is matched literally (regex metacharacters are escaped)
    and the whole file name must match, so names that merely contain
    ``.png`` (for example ``page-000.png.bak``) are excluded.

    Returns:
        The matching file names (without directory), sorted so the order is
        deterministic rather than whatever ``os.listdir`` happens to yield.
    """
    pattern = re.compile(r"{}.*\.png".format(re.escape(file_prefix)))
    return sorted(
        filename
        for filename in os.listdir(directory)
        if pattern.fullmatch(filename)
    )
def preprocess_img(filepath):
    """
    Preprocessing steps that shell out to external executables.

    Currently: ask ``get_rotate`` for the image's rotation and apply it with
    ``mogrify``.
    """
    angle = get_rotate(filepath)
    logger.debug("Rotating {} by {}.".format(filepath, angle))
    mogrify(filepath, angle)
def get_rotate(image_filepath):
    """Return the rotation that tesseract reports for an image.

    Runs ``tesseract --psm 0 <image_filepath> -`` and returns the text that
    follows ``"Rotate: "`` on the first output line containing it.

    Returns:
        The rotation as a string, exactly as printed by tesseract.

    Raises:
        subprocess.CalledProcessError: if tesseract exits with a non-zero status.
        ValueError: if the output contains no ``"Rotate: "`` line.
    """
    osd_output = subprocess.check_output(
        ["tesseract", "--psm", "0", image_filepath, "-"]
    ).decode("utf-8")
    # splitlines() also drops a trailing "\r", which would otherwise end up
    # in the returned value on platforms that emit CRLF line endings.
    for line in osd_output.splitlines():
        if "Rotate: " in line:
            return line.split(": ")[1]
    # A bare next() here used to leak StopIteration, which is easy to
    # misread and is swallowed silently inside generators.
    raise ValueError(
        "tesseract reported no 'Rotate: ' line for {}".format(image_filepath)
    )
def mogrify(image_filepath, rotate):
    """Run ``mogrify -rotate <rotate> <image_filepath>``.

    The command's exit status is not checked.
    """
    command = ["mogrify", "-rotate", rotate, image_filepath]
    subprocess.run(command)
if __name__ == "__main__":
    # Script entry point: every positional argument is a pdf to convert.
    main(parser.parse_args().files)

@ -4,17 +4,20 @@ import logging
import os
import tempfile
from bs4 import BeautifulSoup as bs
import requests
def get_logger():
    """Return this module's logger, configuring it on first use.

    The level name is read from the ``PY_LOG_LVL`` environment variable
    (case-insensitive, default ``"info"``). On the first call a
    ``StreamHandler`` using ``logging.BASIC_FORMAT`` is attached and both the
    handler and the logger are set to that level.

    ``logging.getLogger`` hands back the same object for the same name, so
    later calls return the already-configured logger untouched instead of
    stacking another handler on it (which would emit every record twice).

    Raises:
        ValueError: if ``PY_LOG_LVL`` is not a level name ``logging`` knows.
    """
    logger = logging.getLogger(__name__)
    if logger.handlers:
        # Already configured; adding another handler would duplicate output.
        return logger
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    # Validate the level on the handler before attaching it, so a bad
    # PY_LOG_LVL does not leave a half-configured logger behind.
    handler.setLevel(lvl)
    logger.setLevel(lvl)
    logger.addHandler(handler)
    return logger
logger = get_logger()
@contextmanager
def working_dir(directory):
original_working_dir = os.getcwd()
@ -25,12 +28,5 @@ def working_dir(directory):
os.chdir(original_working_dir)
def download(url, filepath):
    """Fetch ``url`` with ``request_get`` and save the response body to ``filepath``.

    The body (``response.content``) is written in binary mode, replacing any
    existing file at ``filepath``.
    """
    payload = request_get(url).content
    with open(filepath, "wb") as out_file:
        out_file.write(payload)
def make_tempdir(identifier):
    """Create a new temporary directory whose name starts with ``<identifier>_``.

    Returns the path of the created directory as given by ``tempfile.mkdtemp``.
    """
    prefix = f"{identifier}_"
    return tempfile.mkdtemp(prefix=prefix)
Loading…
Cancel
Save