Remove unused files, finish refactor of structure
parent b911f87126
commit 4eca593944
@@ -1,120 +0,0 @@
import os
import sys

import cv2

from table_ocr.extract_cells import extract_cells_from_table


def main(f):
    directory, filename = os.path.split(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            print(path)


def extract_cell_images_from_table(image):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    vertical = horizontal = img_bin.copy()
    SCALE = 5
    # numpy shape is (rows, cols), i.e. (height, width).
    image_height, image_width = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    mask = horizontally_dilated + vertically_dilated
    contours, hierarchy = cv2.findContours(
        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
    )

    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.05 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]

    # Filter out contours that aren't rectangular. Those that aren't rectangular
    # are probably noise.
    approx_rects = [p for p in approx_polys if len(p) == 4]
    bounding_rects = [cv2.boundingRect(a) for a in approx_rects]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 40
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    bounding_rects = [b for b in bounding_rects if b is not largest_rect]

    cells = [c for c in bounding_rects]

    def cell_in_same_row(c1, c2):
        # True if c1's vertical center falls within c2's vertical extent.
        c1_center = c1[1] + c1[3] - c1[3] / 2
        c2_bottom = c2[1] + c2[3]
        c2_top = c2[1]
        return c2_top < c1_center < c2_bottom

    orig_cells = [c for c in cells]
    rows = []
    while cells:
        first = cells[0]
        rest = cells[1:]
        cells_in_same_row = sorted(
            [
                c for c in rest
                if cell_in_same_row(c, first)
            ],
            key=lambda c: c[0]
        )

        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
        rows.append(row_cells)
        cells = [
            c for c in rest
            if not cell_in_same_row(c, first)
        ]

    # Sort rows by the average height of their centers.
    def avg_height_of_center(row):
        centers = [y + h - h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)

    rows.sort(key=avg_height_of_center)
    cell_images_rows = []
    for row in rows:
        cell_images_row = []
        for x, y, w, h in row:
            cell_images_row.append(image[y:y+h, x:x+w])
        cell_images_rows.append(cell_images_row)
    return cell_images_rows


if __name__ == "__main__":
    main(sys.argv[1])
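For orientation, a minimal usage sketch of the cell-extraction logic above. It assumes the function ends up importable from table_ocr.extract_cells after this refactor (the import at the top of the removed script hints at that module); the image path is purely illustrative.

import cv2
# Assumed post-refactor location of the function defined above.
from table_ocr.extract_cells import extract_cell_images_from_table

table = cv2.imread("invoice/table-000.png", cv2.IMREAD_GRAYSCALE)  # illustrative path
rows = extract_cell_images_from_table(table)
print("rows:", len(rows), "cells:", sum(len(r) for r in rows))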
@@ -1,39 +0,0 @@
import argparse
import os

import cv2

from table_ocr.extract_tables import find_tables

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    results = []
    for f in files:
        directory, filename = os.path.split(f)
        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        tables = find_tables(image)
        # Named distinctly so it doesn't shadow the `files` argument.
        table_files = []
        filename_sans_extension = os.path.splitext(filename)[0]
        if tables:
            os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
        for i, table in enumerate(tables):
            table_filename = "table-{:03d}.png".format(i)
            table_filepath = os.path.join(
                directory, filename_sans_extension, table_filename
            )
            table_files.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        if tables:
            results.append((f, table_files))

    for image_filename, table_filenames in results:
        print("\n".join(table_filenames))


if __name__ == "__main__":
    args = parser.parse_args()
    files = args.files
    main(files)
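A short sketch of driving this entry point programmatically rather than through argparse; the page-image path is illustrative.

# Hypothetical call: extract tables from one page image; prints a path per table found.
main(["invoice/page-001.png"])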
@@ -0,0 +1,53 @@
import math

import cv2
import pytesseract


def crop_to_text(image):
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    both = horizontal_lines + vertical_lines
    cleaned = img_bin - both

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)

    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    NUM_PX_COMMA = 6
    MIN_CHAR_AREA = 5 * 9
    if bounding_rects:
        # Union of all bounding boxes big enough to plausibly be characters.
        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
        for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
            minx = min(minx, x)
            miny = min(miny, y)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + h)
        x, y, w, h = minx, miny, maxx - minx, maxy - miny
        # Pad a few pixels below so trailing commas and descenders aren't cropped.
        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
    else:
        # If we morphed out all of the text, fall back to using the unmorphed image.
        cropped = image
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered


def ocr_image(image, config):
    return pytesseract.image_to_string(
        image,
        config=config,
    )
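A minimal sketch of how crop_to_text and ocr_image compose on a single cell image; the file path and Tesseract config string are illustrative, not part of this commit.

import cv2

cell = cv2.imread("cells/000-000.png", cv2.IMREAD_GRAYSCALE)  # illustrative path
cropped = crop_to_text(cell)
# "--psm 7" tells Tesseract to treat the image as a single line of text.
print(ocr_image(cropped, "--psm 7"))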
@@ -0,0 +1,18 @@
import argparse
import os

from table_ocr.ocr_to_csv import text_files_to_csv

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    print(text_files_to_csv(files))


if __name__ == "__main__":
    args = parser.parse_args()
    files = args.files
    files.sort()
    main(files)
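Assuming text_files_to_csv expects the per-cell OCR text files in sorted order (the __main__ block above sorts them before dispatch), a rough sketch of calling the entry point directly; the paths are illustrative.

# Hypothetical call: files are sorted so cells come out in row/column order.
ocr_text_files = ["cells/000-000.txt", "cells/000-001.txt", "cells/001-000.txt"]
ocr_text_files.sort()
main(ocr_text_files)  # prints the CSV built by text_files_to_csv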
@@ -1,105 +0,0 @@
import argparse
import logging
import os
import re
import subprocess
import sys

from table_ocr.util import working_dir, make_tempdir


def get_logger(name):
    logger = logging.getLogger(name)
    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    handler.setLevel(lvl)
    logger.setLevel(lvl)
    return logger

# get_logger requires a name; use this module's name.
logger = get_logger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+")


def main(files):
    pdf_images = []
    for f in files:
        pdf_images.append((f, pdf_to_images(f)))

    for pdf, images in pdf_images:
        for image in images:
            preprocess_img(image)

    for pdf, images in pdf_images:
        print("{}\n{}\n".format(pdf, "\n".join(images)))


def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images.
    """
    directory, filename = os.path.split(pdf_filepath)
    with working_dir(directory):
        image_filenames = pdfimages(pdf_filepath)

    # pdfimages creates one file per page, named for its page number, and
    # doesn't return the list of files it created, so we find them afterwards.
    return [os.path.join(directory, f) for f in image_filenames]


def pdfimages(pdf_filepath):
    """
    Uses the `pdfimages` utility from Poppler
    (https://poppler.freedesktop.org/). Creates images out of each page. Images
    are prefixed by their name sans extension and suffixed by their page number.

    This should work for PDFs with up to 999 pages, since the regex used to
    find the matching files in the directory expects 3 digits.
    """
    directory, filename = os.path.split(pdf_filepath)
    filename_sans_ext = filename.split(".pdf")[0]
    subprocess.run(["pdfimages", "-png", pdf_filepath, filename_sans_ext])
    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
    logger.debug(
        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
    )
    return image_filenames


def find_matching_files_in_dir(file_prefix, directory):
    files = [
        filename
        for filename in os.listdir(directory)
        if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename)
    ]
    return files


def preprocess_img(filepath):
    """
    Processing that involves running shell executables,
    like mogrify to rotate.
    """
    rotate = get_rotate(filepath)
    logger.debug("Rotating {} by {}.".format(filepath, rotate))
    mogrify(filepath, rotate)


def get_rotate(image_filepath):
    output = (
        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
        .decode("utf-8")
        .split("\n")
    )
    output = next(l for l in output if "Rotate: " in l)
    output = output.split(": ")[1]
    return output


def mogrify(image_filepath, rotate):
    subprocess.run(["mogrify", "-rotate", rotate, image_filepath])


if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)
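A minimal sketch of the PDF flow defined above, assuming Poppler's pdfimages, Tesseract, and ImageMagick's mogrify are available on PATH; the PDF path is illustrative.

# Hypothetical call: convert one PDF into per-page PNGs, then rotate each in place.
images = pdf_to_images("data/invoice.pdf")  # illustrative path
for image in images:
    preprocess_img(image)
print("\n".join(images))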