Add gitignore, rename modules, remove unused code
parent
8546902e64
commit
78e9cdb3f5
@ -0,0 +1,10 @@
|
|||||||
|
.DS_Store
|
||||||
|
.idea
|
||||||
|
*.log
|
||||||
|
tmp/
|
||||||
|
|
||||||
|
*.py[cod]
|
||||||
|
*.egg
|
||||||
|
build
|
||||||
|
htmlcov
|
||||||
|
dist
|
@ -1,22 +1,28 @@
|
|||||||
import setuptools
|
import setuptools
|
||||||
|
|
||||||
with open("README.md", "r") as fh:
|
long_description = """
|
||||||
long_description = fh.read()
|
Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
|
||||||
|
|
||||||
|
Requires binaries for tesseract and pdfimages (from Poppler).
|
||||||
|
"""
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
|
name="table_ocr",
|
||||||
version="0.0.1",
|
version="0.0.1",
|
||||||
author="Example Author",
|
author="Eric Ihli",
|
||||||
author_email="author@example.com",
|
author_email="eihli@owoga.com",
|
||||||
description="A small example package",
|
description="Turn images of tables into CSV data.",
|
||||||
long_description=long_description,
|
long_description=long_description,
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/plain",
|
||||||
url="https://github.com/pypa/sampleproject",
|
url="https://github.com/eihli/image-table-ocr",
|
||||||
packages=setuptools.find_packages(),
|
packages=setuptools.find_packages(),
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"License :: OSI Approved :: MIT License",
|
"License :: OSI Approved :: MIT License",
|
||||||
"Operating System :: OS Independent",
|
"Operating System :: OS Independent",
|
||||||
],
|
],
|
||||||
|
install_requires=[
|
||||||
|
"pytesseract~=0.3",
|
||||||
|
"opencv-python~=4.2",
|
||||||
|
],
|
||||||
python_requires='>=3.6',
|
python_requires='>=3.6',
|
||||||
)
|
)
|
||||||
|
@ -0,0 +1,19 @@
|
|||||||
|
Metadata-Version: 2.1
|
||||||
|
Name: table-ocr
|
||||||
|
Version: 0.0.1
|
||||||
|
Summary: Turn images of tables into CSV data.
|
||||||
|
Home-page: https://github.com/eihli/image-table-ocr
|
||||||
|
Author: Eric Ihli
|
||||||
|
Author-email: eihli@owoga.com
|
||||||
|
License: UNKNOWN
|
||||||
|
Description:
|
||||||
|
Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
|
||||||
|
|
||||||
|
Requires binaries for tesseract and pdfimages (from Poppler).
|
||||||
|
|
||||||
|
Platform: UNKNOWN
|
||||||
|
Classifier: Programming Language :: Python :: 3
|
||||||
|
Classifier: License :: OSI Approved :: MIT License
|
||||||
|
Classifier: Operating System :: OS Independent
|
||||||
|
Requires-Python: >=3.6
|
||||||
|
Description-Content-Type: text/plain
|
@ -0,0 +1,11 @@
|
|||||||
|
setup.py
|
||||||
|
table_ocr/__init__.py
|
||||||
|
table_ocr/extract_cells_from_table.py
|
||||||
|
table_ocr/extract_tables.py
|
||||||
|
table_ocr/prepare_pdfs.py
|
||||||
|
table_ocr/util.py
|
||||||
|
table_ocr.egg-info/PKG-INFO
|
||||||
|
table_ocr.egg-info/SOURCES.txt
|
||||||
|
table_ocr.egg-info/dependency_links.txt
|
||||||
|
table_ocr.egg-info/requires.txt
|
||||||
|
table_ocr.egg-info/top_level.txt
|
@ -0,0 +1,2 @@
|
|||||||
|
pytesseract~=0.3
|
||||||
|
opencv-python~=4.2
|
@ -0,0 +1 @@
|
|||||||
|
table_ocr
|
@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,119 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import pytesseract
|
||||||
|
|
||||||
|
def main(f):
    """
    Cut a table image into cell images and write them next to it.

    Reads the image at path ``f`` in grayscale, splits it with
    ``extract_cell_images_from_table`` and writes every cell as
    ``<row>-<column>.png`` (zero-padded to three digits) into a ``cells``
    directory created beside the input file. Each written file name is
    printed to stdout.

    Raises:
        ValueError: if OpenCV could not read ``f`` as an image.
    """
    directory = os.path.dirname(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    if table is None:
        # cv2.imread signals an unreadable file by returning None; fail here
        # with the path instead of deep inside the image pipeline.
        raise ValueError("Could not read image: {}".format(f))

    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            print(cell_filename)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_cell_images_from_table(image):
    """
    Split an image of a table into images of its cells, grouped by row.

    The image is blurred and adaptively thresholded, long horizontal and
    vertical runs are isolated with morphological opening, and the contours
    of the resulting line mask are reduced to bounding rectangles. The
    rectangles are then grouped into rows and ordered left-to-right within
    a row and top-to-bottom between rows.

    Args:
        image: a two-dimensional (single channel) image array, e.g. one read
            with ``cv2.IMREAD_GRAYSCALE``.

    Returns:
        A list of rows, each a list of slices of ``image``
        (``image[y:y+h, x:x+w]``). Empty if no cell-sized rectangle is found.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    # Threshold the bitwise-inverted image.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    SCALE = 5
    # NumPy reports shape as (rows, columns), i.e. (height, width).
    image_height, image_width = img_bin.shape
    # Structuring element sizes are (width, height). Keep each kernel at
    # least one pixel long so very small images do not yield a zero size.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, image_width // SCALE), 1)
    )
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, image_height // SCALE))
    )
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, _hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )

    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.05 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]

    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    # NOTE(review): this filter used to be computed but never applied (the
    # bounding boxes were taken from every polygon) — confirm on sample tables.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]

    # Nothing cell-sized was detected; max() below would raise on an empty list.
    if not bounding_rects:
        return []

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    cells = [b for b in bounding_rects if b is not largest_rect]

    def cell_in_same_row(c1, c2):
        # c1 belongs to c2's row when c1's vertical center lies strictly
        # between c2's top and bottom edges. Rectangles are (x, y, w, h).
        c1_center = c1[1] + c1[3] / 2
        c2_top = c2[1]
        c2_bottom = c2[1] + c2[3]
        return c2_top < c1_center < c2_bottom

    # Repeatedly take the first remaining cell, collect everything in its
    # row, and continue with whatever is left.
    rows = []
    while cells:
        first = cells[0]
        rest = cells[1:]
        same_row = [c for c in rest if cell_in_same_row(c, first)]
        rows.append(sorted([first] + same_row, key=lambda c: c[0]))
        cells = [c for c in rest if not cell_in_same_row(c, first)]

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        centers = [y + h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)

    rows.sort(key=avg_height_of_center)

    return [
        [image[y:y + h, x:x + w] for x, y, w, h in row]
        for row in rows
    ]
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the table image.
    table_image_path = sys.argv[1]
    main(table_image_path)
|
@ -0,0 +1,81 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
|
||||||
|
# Command-line interface: one or more positional paths, exposed as ``args.files``.
parser = argparse.ArgumentParser()
parser.add_argument(
    "files",
    nargs="+",
)
|
||||||
|
|
||||||
|
|
||||||
|
def main(files):
    """
    Extract every table found in each image and save it beside the image.

    For each path in ``files`` the image is read in grayscale, passed to
    ``find_tables`` and each returned table is written as
    ``<image name>-table-<NNN>.png`` in the image's own directory. After all
    images are processed, each input path is printed followed by the paths of
    the table images written for it.

    Args:
        files: iterable of image file paths.
    """
    results = []
    for f in files:
        directory, filename = os.path.split(f)
        filename_sans_extension = os.path.splitext(filename)[0]

        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        print("Reading {}".format(f))
        tables = find_tables(image)

        # Kept under its own name: rebinding ``files`` here would shadow the
        # parameter that the outer loop is iterating over.
        table_filepaths = []
        for i, table in enumerate(tables):
            table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i)
            table_filepath = os.path.join(directory, table_filename)
            table_filepaths.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        results.append((f, table_filepaths))

    for image_filename, table_filenames in results:
        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))
|
||||||
|
|
||||||
|
def find_tables(image):
    """
    Locate tables in an image and return a cropped image for each one.

    The image is blurred and adaptively thresholded, long horizontal and
    vertical runs are isolated with morphological opening, and the external
    contours of the resulting line mask that enclose more than
    ``MIN_TABLE_AREA`` pixels are reduced to bounding rectangles.

    Args:
        image: a two-dimensional (single channel) image array, e.g. one read
            with ``cv2.IMREAD_GRAYSCALE``.

    Returns:
        A list of slices of ``image`` (``image[y:y+h, x:x+w]``), one per
        detected table.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    # Threshold the bitwise-inverted image.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    SCALE = 5
    # NumPy reports shape as (rows, columns), i.e. (height, width).
    image_height, image_width = img_bin.shape
    # Structuring element sizes are (width, height). Keep each kernel at
    # least one pixel long so very small images do not yield a zero size.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(1, image_width // SCALE), 1)
    )
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(1, image_height // SCALE))
    )
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, _hierarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    # Anything enclosing less than this many pixels is not treated as a table.
    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside this bounding rectangle.
    # A table should have a lot of intersections. We might have a rectangular image
    # here though which would only have 4 intersections, 1 at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = [image[y:y + h, x:x + w] for x, y, w, h in bounding_rects]
    return images
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: hand the positional file arguments to main().
    main(parser.parse_args().files)
|
@ -0,0 +1,100 @@
|
|||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from table_ocr.util import working_dir, make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def get_logger():
    """
    Return this module's logger, writing to a stream handler.

    The level is taken from the ``PY_LOG_LVL`` environment variable
    (case-insensitive, default ``info``) and applied to the logger and to
    its handlers. The stream handler is attached only when the logger has no
    handler yet, so calling this function again does not duplicate output.

    Raises:
        ValueError: if ``PY_LOG_LVL`` is not a level name known to ``logging``.
    """
    logger = logging.getLogger(__name__)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        logger.addHandler(handler)
    for handler in logger.handlers:
        handler.setLevel(lvl)
    logger.setLevel(lvl)
    return logger
|
||||||
|
|
||||||
|
# Module-wide logger, configured once at import time by get_logger().
logger = get_logger()
|
||||||
|
|
||||||
|
# Command-line interface: one or more positional paths, exposed as ``args.files``.
parser = argparse.ArgumentParser()
parser.add_argument(
    "files",
    nargs="+",
)
|
||||||
|
|
||||||
|
def main(files):
    """
    Convert each PDF to images, preprocess the images, then report them.

    Runs in three passes: every path in ``files`` goes through
    ``pdf_to_images``; every resulting image goes through ``preprocess_img``;
    finally each PDF path is printed followed by its image paths.
    """
    converted = [(pdf_path, pdf_to_images(pdf_path)) for pdf_path in files]

    for _pdf_path, image_paths in converted:
        for image_path in image_paths:
            preprocess_img(image_path)

    for pdf_path, image_paths in converted:
        report = "{}\n{}\n".format(pdf_path, "\n".join(image_paths))
        print(report)
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images

    Runs ``pdfimages`` on the PDF from inside the PDF's own directory and
    returns the paths of the images it finds there.

    Args:
        pdf_filepath: path to the PDF; may be relative or a bare file name.

    Returns:
        A list of image paths, each the directory part of ``pdf_filepath``
        joined with an image file name.
    """
    directory = os.path.dirname(pdf_filepath)
    # Resolve the path before switching directories: a relative path would
    # no longer point at the PDF afterwards, and a bare file name has an
    # empty directory part. (working_dir is defined in table_ocr.util and
    # presumably changes the current working directory — verify there.)
    absolute_pdf_filepath = os.path.abspath(pdf_filepath)
    with working_dir(os.path.dirname(absolute_pdf_filepath)):
        image_filenames = pdfimages(absolute_pdf_filepath)

    # pdfimages() returns bare file names, so prefix each one with the
    # directory the caller gave us for the PDF.
    return [os.path.join(directory, f) for f in image_filenames]
|
||||||
|
|
||||||
|
|
||||||
|
def pdfimages(pdf_filepath):
    """
    Uses the `pdfimages` utility from Poppler
    (https://poppler.freedesktop.org/). Creates images out of each page. Images
    are prefixed by their name sans extension and suffixed by their page number.

    The images are written into the PDF's directory (the image root passed to
    ``pdfimages`` includes that directory), and that same directory is then
    scanned for the results.

    Args:
        pdf_filepath: path to the PDF file.

    Returns:
        The bare file names (no directory) of the ``.png`` files in the
        PDF's directory whose names start with the PDF's name sans extension.
    """
    directory, filename = os.path.split(pdf_filepath)
    # splitext drops only the final extension; splitting on ".pdf" would cut
    # the name at the first ".pdf" appearing anywhere inside it.
    filename_sans_ext = os.path.splitext(filename)[0]
    image_root = os.path.join(directory, filename_sans_ext)
    completed = subprocess.run(["pdfimages", "-png", pdf_filepath, image_root])
    if completed.returncode != 0:
        logger.warning(
            "pdfimages exited with status %s for %s", completed.returncode, pdf_filepath
        )
    # An empty directory part means the current directory; os.listdir("")
    # would fail.
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory or os.curdir)
    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
    return image_filenames
|
||||||
|
|
||||||
|
|
||||||
|
def find_matching_files_in_dir(file_prefix, directory):
    """
    List the ``.png`` files in ``directory`` whose names start with ``file_prefix``.

    Args:
        file_prefix: literal text the file name must begin with (regex
            metacharacters in it are escaped).
        directory: directory to scan; only its direct entries are considered.

    Returns:
        The matching file names (without the directory), sorted so the
        result does not depend on the order ``os.listdir`` happens to use.
    """
    # fullmatch anchors the pattern at both ends, so names that merely contain
    # ".png" somewhere (e.g. "page-000.png.bak") are rejected.
    pattern = re.compile(r"{}.*\.png".format(re.escape(file_prefix)))
    return sorted(
        filename
        for filename in os.listdir(directory)
        if pattern.fullmatch(filename)
    )
|
||||||
|
def preprocess_img(filepath):
    """
    Apply the preprocessing steps that rely on external executables.

    Asks ``get_rotate`` for the rotation of the image at ``filepath``, logs
    it at debug level and passes it on to ``mogrify``.
    """
    rotation = get_rotate(filepath)
    logger.debug("Rotating {} by {}.".format(filepath, rotation))
    mogrify(filepath, rotation)
|
||||||
|
|
||||||
|
|
||||||
|
def get_rotate(image_filepath):
    """
    Ask tesseract how the image should be rotated.

    Runs ``tesseract --psm 0 <image_filepath> -`` and returns the value of
    the first output line containing ``Rotate: ``.

    Args:
        image_filepath: path of the image to inspect.

    Returns:
        The text after ``": "`` on the ``Rotate`` line, stripped of
        surrounding whitespace, as a string.

    Raises:
        subprocess.CalledProcessError: if tesseract exits with a non-zero status.
        ValueError: if the output contains no ``Rotate: `` line.
    """
    osd_output = subprocess.check_output(
        ["tesseract", "--psm", "0", image_filepath, "-"]
    ).decode("utf-8")
    # splitlines() also consumes "\r\n", so no stray carriage return ends up
    # in the value later handed to mogrify.
    for line in osd_output.splitlines():
        if "Rotate: " in line:
            return line.split(": ")[1].strip()
    # An exhausted bare next() would leak StopIteration to the caller; report
    # the actual problem instead.
    raise ValueError(
        "tesseract reported no rotation for {}".format(image_filepath)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def mogrify(image_filepath, rotate):
    """
    Run the ``mogrify`` executable with ``-rotate <rotate>`` on ``image_filepath``.

    The exit status of the command is not inspected.
    """
    command = ["mogrify", "-rotate", rotate, image_filepath]
    subprocess.run(command)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: hand the positional file arguments to main().
    cli_args = parser.parse_args()
    main(cli_args.files)
|
Loading…
Reference in New Issue