Remove unused files, finish refactor of structure

main
Eric Ihli 5 years ago
parent b911f87126
commit 4eca593944

@ -837,6 +837,18 @@ import cv2
**** table_ocr/extract_cells/__main__.py
Takes as a command line argument a path to an image of a table.
Detects cells in the table and extracts each cell to an image file in a new
~/cells/~ subdirectory of the directory that contains the given image.
Each cell filename is suffixed with ~<row>-<column>~ so that the filenames can
be sorted lexicographically and will align with reading the cells from
left-to-right, top-to-bottom.
Prints to stdout the lexicographically sorted list of filenames of the extracted
cells.
#+BEGIN_SRC python :tangle table_ocr/extract_cells/__main__.py :results none
import os
import sys
@ -866,22 +878,24 @@ if __name__ == "__main__":
main(sys.argv[1])
#+END_SRC
*** table_ocr/ocr_image.py
*** table_ocr/ocr_image/
**** table_ocr/ocr_image/__init__.py
#+BEGIN_SRC python :tangle table_ocr/ocr_image/__init__.py
import math
import cv2
<<crop-to-text>>
<<ocr-image>>
#+END_SRC
**** table_ocr/ocr_image/__main__.py
This does a little bit of cleanup before sending it through tesseract.
Creates images and text files that can be used for training tesseract. See
https://github.com/tesseract-ocr/tesstrain.
#+BEGIN_SRC shell :results output
. ~/.virtualenvs/lotto_odds/bin/activate
python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
#+END_SRC
#+RESULTS:
: PRIZE
#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
#+BEGIN_SRC python :tangle table_ocr/ocr_image/__main__.py :mkdirp yes :results none
import argparse
import math
import os
@ -921,21 +935,15 @@ if __name__ == "__main__":
args, tess_args = parser.parse_known_args()
main(args.image, tess_args)
#+END_SRC
*** table_ocr/ocr_to_csv.py
#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
import argparse
*** table_ocr/ocr_to_csv/
**** table_ocr/ocr_to_csv/__init__.py
#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv/__init__.py
import csv
import io
import os
import sys
import tempfile
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
def text_files_to_csv(files):
"""Files must be sorted lexicographically
Filenames must be <row>-<colum>.txt.
000-000.txt
@ -956,7 +964,23 @@ def main(files):
csv_file = io.StringIO()
writer = csv.writer(csv_file)
writer.writerows(rows)
print(csv_file.getvalue())
return csv_file.getvalue()
#+END_SRC
**** table_ocr/ocr_to_csv/__main__.py
#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv/__main__.py
import argparse
import os
from table_ocr.ocr_to_csv import text_files_to_csv
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
print(text_files_to_csv(files))
if __name__ == "__main__":
args = parser.parse_args()

@ -1,120 +0,0 @@
import os
import sys
import cv2
from table_ocr.extract_cells import extract_cells_from_table
def main(f):
    """Extract every cell of the table image at path ``f`` into its own PNG.

    The image is read as grayscale, split into cells with
    ``extract_cell_images_from_table`` and each cell is written to a
    ``cells`` subdirectory next to the input image.  Files are named
    ``<row>-<column>.png`` with zero-padded three-digit indices, and the
    path of every written file is printed to stdout.

    Raises:
        ValueError: if the image at ``f`` cannot be read.
    """
    directory, _ = os.path.split(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    if table is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; fail here with a clear message rather than deep inside cv2.
        raise ValueError("Could not read image: {}".format(f))
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            print(path)
def extract_cell_images_from_table(image):
    """Split an image of a table into one cropped image per cell.

    Args:
        image: single-channel (2-D) image array of a table.

    Returns:
        A list of rows ordered top-to-bottom; each row is a list of cell
        images (slices of ``image``) ordered left-to-right.  Returns an empty
        list when no cell-sized rectangle is detected.
    """
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(
        image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION
    )

    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    # Threshold the inverted image so that dark ink becomes the foreground.
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    SCALE = 5
    # ndarray.shape is (rows, columns), i.e. (height, width).
    image_height, image_width = img_bin.shape
    # getStructuringElement takes its size as (width, height): a wide, flat
    # kernel keeps only long horizontal runs, a tall, thin one only vertical.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (int(image_width / SCALE), 1)
    )
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, int(image_height / SCALE))
    )
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    # Stretch the detected lines so that they join up at the cell corners.
    horizontally_dilated = cv2.dilate(
        horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    )
    vertically_dilated = cv2.dilate(
        vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))
    )
    mask = horizontally_dilated + vertically_dilated

    contours, _hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.05 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]

    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]
    if not bounding_rects:
        # Nothing cell-sized was found; max() below would raise on an empty list.
        return []

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    cells = [b for b in bounding_rects if b is not largest_rect]

    def cell_in_same_row(c1, c2):
        # c1 shares a row with c2 when c1's vertical center lies strictly
        # between c2's top and bottom edges.
        c1_center = c1[1] + c1[3] / 2
        c2_top = c2[1]
        c2_bottom = c2[1] + c2[3]
        return c2_top < c1_center < c2_bottom

    rows = []
    while cells:
        first, rest = cells[0], cells[1:]
        same_row, other_rows = [], []
        for c in rest:
            (same_row if cell_in_same_row(c, first) else other_rows).append(c)
        # Order the cells of a row left-to-right by their x coordinate.
        rows.append(sorted([first] + same_row, key=lambda c: c[0]))
        cells = other_rows

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        centers = [y + h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)

    rows.sort(key=avg_height_of_center)

    return [
        [image[y:y + h, x:x + w] for x, y, w, h in row]
        for row in rows
    ]
if __name__ == "__main__":
    # The first command-line argument is the path to the table image.
    table_image_path = sys.argv[1]
    main(table_image_path)

@ -1,39 +0,0 @@
import argparse
import os
import cv2
from table_ocr.extract_tables import find_tables
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
    """Find the tables in each image and write every table to its own PNG.

    For an image ``<dir>/<name>.<ext>`` the tables are written to
    ``<dir>/<name>/table-NNN.png``.  Once all images are processed, the
    paths of the written table images are printed to stdout, one per line,
    grouped by source image.

    Args:
        files: paths of the images to search for tables.
    """
    results = []
    for f in files:
        directory, filename = os.path.split(f)
        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        tables = find_tables(image)
        if not tables:
            continue
        filename_sans_extension = os.path.splitext(filename)[0]
        table_dir = os.path.join(directory, filename_sans_extension)
        os.makedirs(table_dir, exist_ok=True)
        # Use a distinct name here: rebinding ``files`` would shadow the
        # parameter that is being iterated over.
        table_filepaths = []
        for i, table in enumerate(tables):
            table_filepath = os.path.join(table_dir, "table-{:03d}.png".format(i))
            table_filepaths.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        results.append((f, table_filepaths))

    for _image_filename, table_filenames in results:
        print("\n".join(table_filenames))
if __name__ == "__main__":
    # Every positional command-line argument is an image to search for tables.
    main(parser.parse_args().files)

@ -0,0 +1,53 @@
import math
import cv2
def crop_to_text(image):
    """Crop a cell image down to its text and add a white border around it.

    Long horizontal and vertical lines are removed from a thresholded copy
    of the image, small specks are opened away, and the bounding box around
    the remaining character-sized blobs is cropped out of ``image``.

    Args:
        image: single-channel (2-D) image array.

    Returns:
        The cropped image with a 5-pixel border of value 255 on every side.
        If no character-sized blob is found, the whole image is bordered
        and returned instead.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    # Threshold the inverted image so that dark ink becomes the foreground.
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    both = horizontal_lines + vertical_lines
    cleaned = img_bin - both

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

    contours, _hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]

    # Extra rows kept below the text box; presumably so that a comma hanging
    # below the baseline is not cut off.
    NUM_PX_COMMA = 6
    MIN_CHAR_AREA = 5 * 9
    # Apply the area filter BEFORE deciding whether there is anything to crop
    # to. Otherwise, when every blob is too small, the min/max accumulators
    # never leave their initial values and the slice below fails.
    char_sized_rects = [
        (x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA
    ]
    if char_sized_rects:
        minx = min(x for x, y, w, h in char_sized_rects)
        miny = min(y for x, y, w, h in char_sized_rects)
        maxx = max(x + w for x, y, w, h in char_sized_rects)
        maxy = max(y + h for x, y, w, h in char_sized_rects)
        cropped = image[miny:min(img_h, maxy + NUM_PX_COMMA), minx:min(img_w, maxx)]
    else:
        # If we morphed out all of the text, fallback to using the unmorphed image.
        cropped = image
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
def ocr_image(image, config):
    """Run tesseract on ``image`` and return the recognized text.

    Args:
        image: the image to recognize; passed unchanged to pytesseract.
        config: tesseract configuration string; passed unchanged as
            pytesseract's ``config`` argument.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # This module only imports math and cv2 at the top, so bring pytesseract
    # into scope here; without it the call below raises NameError.
    import pytesseract

    return pytesseract.image_to_string(
        image,
        config=config
    )

@ -1,14 +1,9 @@
import argparse
import csv
import io
import os
import sys
import tempfile
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
def text_files_to_csv(files):
"""Files must be sorted lexicographically
Filenames must be <row>-<colum>.txt.
000-000.txt
@ -29,10 +24,4 @@ def main(files):
csv_file = io.StringIO()
writer = csv.writer(csv_file)
writer.writerows(rows)
print(csv_file.getvalue())
if __name__ == "__main__":
args = parser.parse_args()
files = args.files
files.sort()
main(files)
return csv_file.getvalue()

@ -0,0 +1,18 @@
import argparse
import os
from table_ocr.ocr_to_csv import text_files_to_csv
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
    """Print to stdout the CSV text built from the given cell text files."""
    csv_text = text_files_to_csv(files)
    print(csv_text)
if __name__ == "__main__":
    # The files are handed to main() in lexicographic order.
    cli_args = parser.parse_args()
    main(sorted(cli_args.files))

@ -1,105 +0,0 @@
import argparse
import logging
import os
import re
import subprocess
import sys
from table_ocr.util import working_dir, make_tempdir
def get_logger(name=__name__):
    """Return a logger that writes to a stream handler at the configured level.

    The level is read from the ``PY_LOG_LVL`` environment variable
    (case-insensitive, default ``info``).

    Args:
        name: name of the logger to fetch; defaults to this module's name so
            that the function can be called without arguments.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        ValueError: if ``PY_LOG_LVL`` is not a level name known to ``logging``.
    """
    logger = logging.getLogger(name)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    # logging.getLogger returns the same object for the same name, so only
    # attach a handler the first time; otherwise every call adds another
    # handler and each record is emitted once per handler.
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        handler.setLevel(lvl)
        logger.addHandler(handler)
    logger.setLevel(lvl)
    return logger
# Module-level logger. get_logger needs the logger name; calling it with no
# argument raises TypeError as soon as the module is imported.
logger = get_logger(__name__)
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")
def main(files):
    """Convert each PDF to page images, rotate the images, and report them.

    All PDFs are converted first, then every produced image is preprocessed,
    and finally each PDF path is printed followed by its image paths (one
    per line) and a blank line.
    """
    pdf_images = [(pdf_path, pdf_to_images(pdf_path)) for pdf_path in files]

    for _pdf_path, page_images in pdf_images:
        for page_image in page_images:
            preprocess_img(page_image)

    for pdf_path, page_images in pdf_images:
        listing = "\n".join(page_images)
        print("{}\n{}\n".format(pdf_path, listing))
def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images.

    Runs ``pdfimages`` on the pdf from inside the pdf's own directory and
    returns the paths of the created images, sorted lexicographically. Each
    path is the directory part of ``pdf_filepath`` joined with the image
    filename, so a relative input yields relative results.
    """
    directory, _ = os.path.split(pdf_filepath)
    # Resolve the path BEFORE changing directory: a relative path would no
    # longer point at the pdf once we are inside its directory, and a bare
    # filename has an empty directory part that cannot be changed into.
    abs_pdf_filepath = os.path.abspath(pdf_filepath)
    # NOTE(review): working_dir comes from table_ocr.util; presumably it is a
    # context manager that temporarily changes the current directory.
    with working_dir(os.path.dirname(abs_pdf_filepath)):
        image_filenames = pdfimages(abs_pdf_filepath)

    # pdfimages names each file after its page number and does not return the
    # list of files it created, so the names come from a directory listing.
    # Sort them because a directory listing has no guaranteed order.
    return sorted(os.path.join(directory, f) for f in image_filenames)
def pdfimages(pdf_filepath):
    """
    Uses the `pdfimages` utility from Poppler
    (https://poppler.freedesktop.org/). Creates images out of each page. Images
    are prefixed by their name sans extension and suffixed by their page number.

    The images are written next to the pdf. Returns the filenames (without
    directory) of the created images, as found by listing that directory.

    This should work up to pdfs with 999 pages since find matching files in dir
    uses 3 digits in its regex.
    """
    directory, filename = os.path.split(pdf_filepath)
    # Strip only a trailing ".pdf"; splitting on ".pdf" would truncate a name
    # such as "a.pdf.v2.pdf" at the first occurrence.
    if filename.lower().endswith(".pdf"):
        filename_sans_ext = filename[: -len(".pdf")]
    else:
        filename_sans_ext = filename
    # Give pdfimages an image root inside the pdf's directory so the output
    # lands where we look for it below, whatever the current directory is.
    image_root = os.path.join(directory, filename_sans_ext)
    subprocess.run(["pdfimages", "-png", pdf_filepath, image_root])
    # An empty directory part means "current directory"; os.listdir("") fails.
    image_filenames = find_matching_files_in_dir(
        filename_sans_ext, directory or os.curdir
    )
    logger.debug(
        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
    )
    return image_filenames
def find_matching_files_in_dir(file_prefix, directory):
    """Return the names in ``directory`` that look like page images.

    A name matches when it starts with ``file_prefix`` (taken literally),
    followed by ``-``, three digits, any characters, and ``.png``. Names are
    returned in the order ``os.listdir`` yields them.
    """
    page_image_pattern = re.compile(
        r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix))
    )
    matching = []
    for entry in os.listdir(directory):
        if page_image_pattern.match(entry) is not None:
            matching.append(entry)
    return matching
def preprocess_img(filepath):
    """
    Rotate the image at ``filepath`` by the amount that ``get_rotate``
    reports for it.

    This is the processing step that runs shell executables (tesseract to
    get the rotation, mogrify to apply it).
    """
    rotation = get_rotate(filepath)
    message = "Rotating {} by {}.".format(filepath, rotation)
    logger.debug(message)
    mogrify(filepath, rotation)
def get_rotate(image_filepath):
    """Return the rotation tesseract reports for the image, as a string.

    Runs ``tesseract --psm 0 <image> -`` and returns the text after
    ``": "`` on the first output line that contains ``"Rotate: "``.

    Raises:
        subprocess.CalledProcessError: if tesseract exits with a non-zero
            status.
        ValueError: if the output contains no ``Rotate:`` line.
    """
    output = subprocess.check_output(
        ["tesseract", "--psm", "0", image_filepath, "-"]
    ).decode("utf-8")
    # splitlines() also copes with "\r\n" endings, so the returned value
    # never carries a stray carriage return into the mogrify call.
    for line in output.splitlines():
        if "Rotate: " in line:
            return line.split(": ")[1]
    # Fail with a descriptive error instead of a bare StopIteration.
    raise ValueError(
        "tesseract reported no rotation for {}".format(image_filepath)
    )
def mogrify(image_filepath, rotate):
    """Run ``mogrify -rotate <rotate> <image_filepath>`` as a subprocess."""
    command = ["mogrify", "-rotate", rotate, image_filepath]
    subprocess.run(command)
if __name__ == "__main__":
    # Every positional command-line argument is a pdf to convert.
    main(parser.parse_args().files)
Loading…
Cancel
Save