Add gitignore, rename modules, remove unused code

Branch: main
Author: Eric Ihli (5 years ago)
Parent: 8546902e64
Commit: 78e9cdb3f5

.gitignore

@@ -0,0 +1,10 @@
.DS_Store
.idea
*.log
tmp/
*.py[cod]
*.egg
build
htmlcov
dist

@@ -527,7 +527,7 @@ ocr_image(image, "--psm 7")
    :header-args: :mkdirp yes :noweb yes
    :END:
-#+BEGIN_SRC python :tangle pdf/__init__.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
 #+END_SRC
@@ -535,54 +535,54 @@ ocr_image(image, "--psm 7")
 #+BEGIN_SRC python :tangle setup.py :results none
 import setuptools
-with open("README.md", "r") as fh:
-    long_description = fh.read()
+long_description = """
+Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
+Requires binaries for tesseract and pdfimages (from Poppler).
+"""
 setuptools.setup(
-    name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
+    name="table_ocr",
     version="0.0.1",
-    author="Example Author",
-    author_email="author@example.com",
-    description="A small example package",
+    author="Eric Ihli",
+    author_email="eihli@owoga.com",
+    description="Turn images of tables into CSV data.",
     long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/pypa/sampleproject",
+    long_description_content_type="text/plain",
+    url="https://github.com/eihli/image-table-ocr",
     packages=setuptools.find_packages(),
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
+    install_requires=[
+        "pytesseract~=0.3",
+        "opencv-python~=4.2",
+    ],
     python_requires='>=3.6',
 )
 #+END_SRC
-** table_image_ocr
-*** table_image_ocr/__init__.py
-#+BEGIN_SRC python :tangle table_image_ocr/__init__.py :mkdirp yes :results none
+** table_ocr
+*** table_ocr/__init__.py
+#+BEGIN_SRC python :tangle table_ocr/__init__.py :mkdirp yes :results none
 #+END_SRC
-*** table_image_ocr/util.py
-#+BEGIN_SRC python :tangle table_image_ocr/util.py :mkdirp yes :results none
+*** table_ocr/util.py
+#+BEGIN_SRC python :tangle table_ocr/util.py :mkdirp yes :results none
 from contextlib import contextmanager
 import functools
 import logging
 import os
 import tempfile
-from bs4 import BeautifulSoup as bs
-import requests
-<<get_logger>>
+<<get-logger>>
 logger = get_logger()
-<<request_cacheing>>
 @contextmanager
 def working_dir(directory):
     original_working_dir = os.getcwd()
@@ -593,18 +593,11 @@ def working_dir(directory):
         os.chdir(original_working_dir)
-def download(url, filepath):
-    response = request_get(url)
-    data = response.content
-    with open(filepath, "wb") as f:
-        f.write(data)
 def make_tempdir(identifier):
     return tempfile.mkdtemp(prefix="{}_".format(identifier))
 #+END_SRC
-*** table_image_ocr/prepare_pdfs.py
+*** table_ocr/prepare_pdfs.py
 Takes a variable number of pdf files and creates images out of each page of the
 file using ~pdfimages~ from Poppler. Images are created in the same directory
@@ -614,11 +607,11 @@ Prints each pdf followed by the images extracted from that pdf followed by a
 blank line.
 #+BEGIN_SRC shell :eval no :exports code
-python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
+python -m table_ocr.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
 #+END_SRC
-#+BEGIN_SRC python :tangle pdf/prepare_pdfs.py :noweb yes
+#+BEGIN_SRC python :tangle table_ocr/prepare_pdfs.py :noweb yes
 import argparse
 import logging
 import os
@@ -626,7 +619,7 @@ import re
 import subprocess
 import sys
-from pdf.util import request_get, working_dir, download, make_tempdir
+from table_ocr.util import working_dir, make_tempdir
 <<get-logger>>
@@ -657,9 +650,7 @@ if __name__ == "__main__":
     main(args.files)
 #+END_SRC
-#+RESULTS:
-*** table_image_ocr/extract_tables.py
+*** table_ocr/extract_tables.py
 #+BEGIN_SRC shell
 . ~/.virtualenvs/lotto_odds/bin/activate
@@ -670,7 +661,7 @@ python -m pdf.extract_tables "resources/examples/example-page.png"
 | resources/examples/example-page.png           |
 | resources/examples/example-page-table-000.png |
-#+BEGIN_SRC python :noweb yes :tangle pdf/extract_tables.py :results none
+#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_tables.py :results none
 import argparse
 import os
@@ -684,6 +675,7 @@ def main(files):
     results = []
     for f in files:
         directory, filename = os.path.split(f)
         image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+        print("Reading {}".format(f))
         tables = find_tables(image)
         files = []
@@ -706,14 +698,14 @@ if __name__ == "__main__":
     main(files)
 #+END_SRC
-*** table_image_ocr/extract_cells_from_table.py
+*** table_ocr/extract_cells_from_table.py
 #+BEGIN_SRC shell :results none
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
 #+END_SRC
-#+BEGIN_SRC python :noweb yes :tangle pdf/extract_cells_from_table.py :results none
+#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_cells_from_table.py :results none
 import os
 import sys
@@ -784,3 +776,17 @@ with ~advice-add~.
 (advice-add 'org-babel-execute-src-block :before #'remove-attributes-from-src-block-result)
 #+END_SRC
+** Logging
+#+BEGIN_SRC python :eval query :noweb-ref get-logger
+def get_logger():
+    logger = logging.getLogger(__name__)
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
+#+END_SRC

@ -1,22 +1,28 @@
import setuptools import setuptools
with open("README.md", "r") as fh: long_description = """
long_description = fh.read() Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
Requires binaries for tesseract and pdfimages (from Poppler).
"""
setuptools.setup( setuptools.setup(
name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username name="table_ocr",
version="0.0.1", version="0.0.1",
author="Example Author", author="Eric Ihli",
author_email="author@example.com", author_email="eihli@owoga.com",
description="A small example package", description="Turn images of tables into CSV data.",
long_description=long_description, long_description=long_description,
long_description_content_type="text/markdown", long_description_content_type="text/plain",
url="https://github.com/pypa/sampleproject", url="https://github.com/eihli/image-table-ocr",
packages=setuptools.find_packages(), packages=setuptools.find_packages(),
classifiers=[ classifiers=[
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License", "License :: OSI Approved :: MIT License",
"Operating System :: OS Independent", "Operating System :: OS Independent",
], ],
install_requires=[
"pytesseract~=0.3",
"opencv-python~=4.2",
],
python_requires='>=3.6', python_requires='>=3.6',
) )
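
Since the distribution is now named ~table_ocr~ and declares its pytesseract/OpenCV dependencies, a quick smoke test after installing it (for example with a local ~pip install .~) might look like the sketch below. Nothing here is part of the commit; the import targets are the ones listed in SOURCES.txt.

#+BEGIN_SRC python :eval no
# Hedged smoke test of the renamed package after installation.
import table_ocr
from table_ocr.util import make_tempdir, working_dir
from table_ocr.extract_tables import find_tables

print(table_ocr.__name__)           # expected: "table_ocr"
print(make_tempdir("smoke_test"))   # a fresh temp directory prefixed "smoke_test_"
#+END_SRC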

table_ocr.egg-info/PKG-INFO
@@ -0,0 +1,19 @@
Metadata-Version: 2.1
Name: table-ocr
Version: 0.0.1
Summary: Turn images of tables into CSV data.
Home-page: https://github.com/eihli/image-table-ocr
Author: Eric Ihli
Author-email: eihli@owoga.com
License: UNKNOWN
Description:
Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
Requires binaries for tesseract and pdfimages (from Poppler).
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/plain

table_ocr.egg-info/SOURCES.txt
@@ -0,0 +1,11 @@
setup.py
table_ocr/__init__.py
table_ocr/extract_cells_from_table.py
table_ocr/extract_tables.py
table_ocr/prepare_pdfs.py
table_ocr/util.py
table_ocr.egg-info/PKG-INFO
table_ocr.egg-info/SOURCES.txt
table_ocr.egg-info/dependency_links.txt
table_ocr.egg-info/requires.txt
table_ocr.egg-info/top_level.txt

table_ocr.egg-info/requires.txt
@@ -0,0 +1,2 @@
pytesseract~=0.3
opencv-python~=4.2

table_ocr/extract_cells_from_table.py
@@ -0,0 +1,119 @@
import os
import sys

import cv2
import pytesseract


def main(f):
    results = []
    directory, filename = os.path.split(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            print(cell_filename)


def extract_cell_images_from_table(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    vertical = horizontal = img_bin.copy()
    SCALE = 5
    image_width, image_height = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )

    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.05 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]

    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    bounding_rects = [b for b in bounding_rects if b is not largest_rect]

    cells = [c for c in bounding_rects]

    def cell_in_same_row(c1, c2):
        c1_center = c1[1] + c1[3] - c1[3] / 2
        c2_bottom = c2[1] + c2[3]
        c2_top = c2[1]
        return c2_top < c1_center < c2_bottom

    orig_cells = [c for c in cells]
    rows = []
    while cells:
        first = cells[0]
        rest = cells[1:]
        cells_in_same_row = sorted(
            [
                c for c in rest
                if cell_in_same_row(c, first)
            ],
            key=lambda c: c[0]
        )
        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
        rows.append(row_cells)
        cells = [
            c for c in rest
            if not cell_in_same_row(c, first)
        ]

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        centers = [y + h - h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)

    rows.sort(key=avg_height_of_center)

    cell_images_rows = []
    for row in rows:
        cell_images_row = []
        for x, y, w, h in row:
            cell_images_row.append(image[y:y+h, x:x+w])
        cell_images_rows.append(cell_images_row)
    return cell_images_rows


if __name__ == "__main__":
    main(sys.argv[1])
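
When run as a script, the module writes each cell into a ~cells/~ subdirectory next to the input image and prints the cell filenames. A minimal sketch of using it as a library instead; the example image path is the one already used elsewhere in the document, and the output filenames are placeholders:

#+BEGIN_SRC python :eval no
import cv2
from table_ocr.extract_cells_from_table import extract_cell_images_from_table

# Read the example table image in grayscale and split it into per-cell crops.
table = cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE)
rows = extract_cell_images_from_table(table)
print("Found {} rows".format(len(rows)))
for i, row in enumerate(rows):
    for j, cell in enumerate(row):
        cv2.imwrite("cell-{:03d}-{:03d}.png".format(i, j), cell)
#+END_SRC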

table_ocr/extract_tables.py
@@ -0,0 +1,81 @@
import argparse
import os

import cv2

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    results = []
    for f in files:
        directory, filename = os.path.split(f)
        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        print("Reading {}".format(f))
        tables = find_tables(image)
        files = []
        for i, table in enumerate(tables):
            filename_sans_extension = os.path.splitext(filename)[0]
            table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i)
            table_filepath = os.path.join(directory, table_filename)
            files.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        results.append((f, files))
    for image_filename, table_filenames in results:
        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))


def find_tables(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    vertical = horizontal = img_bin.copy()
    SCALE = 5
    image_width, image_height = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside this bounding rectangle.
    # A table should have a lot of intersections. We might have a rectangular image
    # here though which would only have 4 intersections, 1 at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
    return images


if __name__ == "__main__":
    args = parser.parse_args()
    files = args.files
    main(files)
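
For reference, a sketch of invoking the renamed module on the example page image and inspecting its output. The module prints a "Reading ..." progress line per input, then each input filename followed by the table crops it wrote and a blank line; the image path is the example used earlier in the document.

#+BEGIN_SRC python :eval no
import subprocess

# Run the module as a script and capture what it prints.
completed = subprocess.run(
    ["python", "-m", "table_ocr.extract_tables", "resources/examples/example-page.png"],
    capture_output=True,
    text=True,
    check=True,
)
print(completed.stdout)
# Expected shape of the listing, per the example results table above:
#   resources/examples/example-page.png
#   resources/examples/example-page-table-000.png
#+END_SRC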

table_ocr/prepare_pdfs.py
@@ -0,0 +1,100 @@
import argparse
import logging
import os
import re
import subprocess
import sys

from table_ocr.util import working_dir, make_tempdir


def get_logger():
    logger = logging.getLogger(__name__)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    handler.setLevel(lvl)
    logger.setLevel(lvl)
    return logger

logger = get_logger()

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    pdf_images = []
    for f in files:
        pdf_images.append((f, pdf_to_images(f)))

    for pdf, images in pdf_images:
        for image in images:
            preprocess_img(image)

    for pdf, images in pdf_images:
        print("{}\n{}\n".format(pdf, "\n".join(images)))


def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images
    """
    directory, filename = os.path.split(pdf_filepath)
    with working_dir(directory):
        image_filenames = pdfimages(pdf_filepath)

    # pdfimages creates a number of files, each named for its page number, and
    # doesn't return the list that it created.
    return [os.path.join(directory, f) for f in image_filenames]


def pdfimages(pdf_filepath):
    """
    Uses the `pdfimages` utility from Poppler
    (https://poppler.freedesktop.org/). Creates images out of each page. Images
    are prefixed by their name sans extension and suffixed by their page number.
    """
    directory, filename = os.path.split(pdf_filepath)
    filename_sans_ext = filename.split(".pdf")[0]
    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
    return image_filenames


def find_matching_files_in_dir(file_prefix, directory):
    files = [
        filename
        for filename in os.listdir(directory)
        if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename)
    ]
    return files


def preprocess_img(filepath):
    """
    Processing that involves running shell executables,
    like mogrify to rotate.
    """
    rotate = get_rotate(filepath)
    logger.debug("Rotating {} by {}.".format(filepath, rotate))
    mogrify(filepath, rotate)


def get_rotate(image_filepath):
    output = (
        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
        .decode("utf-8")
        .split("\n")
    )
    output = next(l for l in output if "Rotate: " in l)
    output = output.split(": ")[1]
    return output


def mogrify(image_filepath, rotate):
    subprocess.run(["mogrify", "-rotate", rotate, image_filepath])


if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)
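
Taken together, the three renamed modules form a pipeline: pdf to page images, page images to table crops, table crops to cell images. A hedged sketch of driving them as library calls rather than via ~python -m~; the pdf path is the placeholder from the usage example, and the ~pdfimages~, ~tesseract~, and ~mogrify~ binaries are assumed to be on PATH:

#+BEGIN_SRC python :eval no
import cv2

from table_ocr.prepare_pdfs import pdf_to_images, preprocess_img
from table_ocr.extract_tables import find_tables
from table_ocr.extract_cells_from_table import extract_cell_images_from_table

# Convert the pdf to page images, deskew each page, then pull out tables and cells.
images = pdf_to_images("/tmp/file1/file1.pdf")
for image_path in images:
    preprocess_img(image_path)  # rotates the file in place via tesseract + mogrify
    page = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    for table in find_tables(page):
        rows = extract_cell_images_from_table(table)
        print("{}: table with {} rows".format(image_path, len(rows)))
#+END_SRC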

table_ocr/util.py
@@ -4,17 +4,20 @@ import logging
 import os
 import tempfile
-from bs4 import BeautifulSoup as bs
-import requests
+def get_logger():
+    logger = logging.getLogger(__name__)
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
 logger = get_logger()
 @contextmanager
 def working_dir(directory):
     original_working_dir = os.getcwd()
@@ -25,12 +28,5 @@ def working_dir(directory):
         os.chdir(original_working_dir)
-def download(url, filepath):
-    response = request_get(url)
-    data = response.content
-    with open(filepath, "wb") as f:
-        f.write(data)
 def make_tempdir(identifier):
     return tempfile.mkdtemp(prefix="{}_".format(identifier))
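
After this commit, ~table_ocr.util~ keeps only the logging helper and two small filesystem helpers. A minimal sketch of the two helpers in use; the identifier and filename below are hypothetical:

#+BEGIN_SRC python :eval no
import os
from table_ocr.util import make_tempdir, working_dir

tmp = make_tempdir("example")     # e.g. /tmp/example_ab12cd34
with working_dir(tmp):            # chdir into tmp, restore the old cwd on exit
    with open("notes.txt", "w") as f:
        f.write("written inside the temp dir")
print(os.listdir(tmp))            # ["notes.txt"]
#+END_SRC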