Remove unused files, finish refactor of structure

5 years ago · 4eca593944
parent b911f87126
commit 4eca593944
8 changed files with 118 additions and 298 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@ -837,6 +837,18 @@ import cv2

 **** table_ocr/extract_cells/__main__.py

+Takes as a command line argument a path to an image of a table.
+
+Detects cells in the table and extracts each cell to an image file in a new
+~/cells/~ subdirectory in the same directory as the given image.
+
+Each cell image is named ~<row>-<column>.png~, with both indices zero-padded
+(e.g. ~000-000.png~), so that sorting the filenames lexicographically matches
+reading the cells left-to-right, top-to-bottom.
+
+Prints to stdout the lexicographically sorted list of filenames of the extracted
+cells.
+
 #+BEGIN_SRC python :tangle table_ocr/extract_cells/__main__.py :results none
 import os
 import sys
@ -866,22 +878,24 @@ if __name__ == "__main__":
    main(sys.argv[1])
 #+END_SRC

-*** table_ocr/ocr_image.py
+*** table_ocr/ocr_image/
+**** table_ocr/ocr_image/__init__.py
+#+BEGIN_SRC python :tangle table_ocr/ocr_image/__init__.py
+import math
+
+import cv2
+
+<<crop-to-text>>
+<<ocr-image>>
+#+END_SRC
+**** table_ocr/ocr_image/__main__.py

 This does a little bit of cleanup before sending it through tesseract.

 Creates images and text files that can be used for training tesseract. See
 https://github.com/tesseract-ocr/tesstrain.

-#+BEGIN_SRC shell :results output
-. ~/.virtualenvs/lotto_odds/bin/activate
-python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
-#+END_SRC
-
-#+RESULTS:
-: PRIZE
-
-#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/ocr_image/__main__.py :mkdirp yes :results none
 import argparse
 import math
 import os
@ -921,21 +935,15 @@ if __name__ == "__main__":
    args, tess_args = parser.parse_known_args()
    main(args.image, tess_args)
 #+END_SRC
-
-*** table_ocr/ocr_to_csv.py
-
-#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
-import argparse
+*** table_ocr/ocr_to_csv/
+**** table_ocr/ocr_to_csv/__init__.py
+#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv/__init__.py
 import csv
 import io
 import os
-import sys
-import tempfile

-parser = argparse.ArgumentParser()
-parser.add_argument("files", nargs="+")

-def main(files):
+def text_files_to_csv(files):
    """Files must be sorted lexicographically
    Filenames must be <row>-<colum>.txt.
    000-000.txt
@ -956,7 +964,23 @@ def main(files):
    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
-    print(csv_file.getvalue())
+    return csv_file.getvalue()
+#+END_SRC
+**** table_ocr/ocr_to_csv/__main__.py
+
+#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv/__main__.py
+import argparse
+import os
+
+from table_ocr.ocr_to_csv import text_files_to_csv
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+
+def main(files):
+    print(text_files_to_csv(files))
+

 if __name__ == "__main__":
    args = parser.parse_args()
--- a/table_ocr/extract_cells_from_table.py
+++ b/table_ocr/extract_cells_from_table.py
@ -1,120 +0,0 @@
-import os
-import sys
-
-import cv2
-
-from table_ocr.extract_cells import extract_cells_from_table
-
-def main(f):
-    results = []
-    directory, filename = os.path.split(f)
-    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    rows = extract_cell_images_from_table(table)
-    cell_img_dir = os.path.join(directory, "cells")
-    os.makedirs(cell_img_dir, exist_ok=True)
-    for i, row in enumerate(rows):
-        for j, cell in enumerate(row):
-            cell_filename = "{:03d}-{:03d}.png".format(i, j)
-            path = os.path.join(cell_img_dir, cell_filename)
-            cv2.imwrite(path, cell)
-            print(path)
-
-
-def extract_cell_images_from_table(image):
-    BLUR_KERNEL_SIZE = (17, 17)
-    STD_DEV_X_DIRECTION = 0
-    STD_DEV_Y_DIRECTION = 0
-    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
-    MAX_COLOR_VAL = 255
-    BLOCK_SIZE = 15
-    SUBTRACT_FROM_MEAN = -2
-    
-    img_bin = cv2.adaptiveThreshold(
-        ~blurred,
-        MAX_COLOR_VAL,
-        cv2.ADAPTIVE_THRESH_MEAN_C,
-        cv2.THRESH_BINARY,
-        BLOCK_SIZE,
-        SUBTRACT_FROM_MEAN,
-    )
-    vertical = horizontal = img_bin.copy()
-    SCALE = 5
-    image_width, image_height = horizontal.shape
-    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
-    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
-    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
-    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
-    
-    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
-    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
-    
-    mask = horizontally_dilated + vertically_dilated
-    contours, heirarchy = cv2.findContours(
-        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
-    )
-    
-    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
-    epsilons = [0.05 * p for p in perimeter_lengths]
-    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
-    
-    # Filter out contours that aren't rectangular. Those that aren't rectangular
-    # are probably noise.
-    approx_rects = [p for p in approx_polys if len(p) == 4]
-    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
-    
-    # Filter out rectangles that are too narrow or too short.
-    MIN_RECT_WIDTH = 40
-    MIN_RECT_HEIGHT = 10
-    bounding_rects = [
-        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
-    ]
-    
-    # The largest bounding rectangle is assumed to be the entire table.
-    # Remove it from the list. We don't want to accidentally try to OCR
-    # the entire table.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    bounding_rects = [b for b in bounding_rects if b is not largest_rect]
-    
-    cells = [c for c in bounding_rects]
-    def cell_in_same_row(c1, c2):
-        c1_center = c1[1] + c1[3] - c1[3] / 2
-        c2_bottom = c2[1] + c2[3]
-        c2_top = c2[1]
-        return c2_top < c1_center < c2_bottom
-    
-    orig_cells = [c for c in cells]
-    rows = []
-    while cells:
-        first = cells[0]
-        rest = cells[1:]
-        cells_in_same_row = sorted(
-            [
-                c for c in rest
-                if cell_in_same_row(c, first)
-            ],
-            key=lambda c: c[0]
-        )
-    
-        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
-        rows.append(row_cells)
-        cells = [
-            c for c in rest
-            if not cell_in_same_row(c, first)
-        ]
-    
-    # Sort rows by average height of their center.
-    def avg_height_of_center(row):
-        centers = [y + h - h / 2 for x, y, w, h in row]
-        return sum(centers) / len(centers)
-    
-    rows.sort(key=avg_height_of_center)
-    cell_images_rows = []
-    for row in rows:
-        cell_images_row = []
-        for x, y, w, h in row:
-            cell_images_row.append(image[y:y+h, x:x+w])
-        cell_images_rows.append(cell_images_row)
-    return cell_images_rows
-
-if __name__ == "__main__":
-    main(sys.argv[1])
--- a/table_ocr/extract_tables.py
+++ b/table_ocr/extract_tables.py
@ -1,39 +0,0 @@
-import argparse
-import os
-
-import cv2
-
-from table_ocr.extract_tables import find_tables
-
-parser = argparse.ArgumentParser()
-parser.add_argument("files", nargs="+")
-
-
-def main(files):
-    results = []
-    for f in files:
-        directory, filename = os.path.split(f)
-        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-        tables = find_tables(image)
-        files = []
-        filename_sans_extension = os.path.splitext(filename)[0]
-        if tables:
-            os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
-        for i, table in enumerate(tables):
-            table_filename = "table-{:03d}.png".format(i)
-            table_filepath = os.path.join(
-                directory, filename_sans_extension, table_filename
-            )
-            files.append(table_filepath)
-            cv2.imwrite(table_filepath, table)
-        if tables:
-            results.append((f, files))
-
-    for image_filename, table_filenames in results:
-        print("\n".join(table_filenames))
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    files = args.files
-    main(files)
--- a/table_ocr/ocr_image/init.py
+++ b/table_ocr/ocr_image/init.py
@ -0,0 +1,53 @@
+import math
+
+import cv2
+
+def crop_to_text(image):
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    img_h, img_w = image.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
+    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    both = horizontal_lines + vertical_lines
+    cleaned = img_bin - both
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
+    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
+
+    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    bounding_rects = [cv2.boundingRect(c) for c in contours]
+    NUM_PX_COMMA = 6
+    MIN_CHAR_AREA = 5 * 9
+    if bounding_rects:
+        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
+        for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
+            minx = min(minx, x)
+            miny = min(miny, y)
+            maxx = max(maxx, x + w)
+            maxy = max(maxy, y + h)
+        x, y, w, h = minx, miny, maxx - minx, maxy - miny
+        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
+    else:
+        # If we morphed out all of the text, fallback to using the unmorphed image.
+        cropped = image
+    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
+    return bordered
+def ocr_image(image, config):
+    return pytesseract.image_to_string(
+        image,
+        config=config
+    )
--- a/table_ocr/ocr_image/main.py
+++ b/table_ocr/ocr_image/main.py
--- a/table_ocr/ocr_to_csv/init.py
+++ b/table_ocr/ocr_to_csv/init.py
@ -1,14 +1,9 @@
-import argparse
 import csv
 import io
 import os
-import sys
-import tempfile

-parser = argparse.ArgumentParser()
-parser.add_argument("files", nargs="+")

-def main(files):
+def text_files_to_csv(files):
    """Files must be sorted lexicographically
    Filenames must be <row>-<colum>.txt.
    000-000.txt
@ -29,10 +24,4 @@ def main(files):
    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
-    print(csv_file.getvalue())
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    files = args.files
-    files.sort()
-    main(files)
+    return csv_file.getvalue()
--- a/table_ocr/ocr_to_csv/main.py
+++ b/table_ocr/ocr_to_csv/main.py
@ -0,0 +1,18 @@
+import argparse
+import os
+
+from table_ocr.ocr_to_csv import text_files_to_csv
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+
+def main(files):
+    print(text_files_to_csv(files))
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    files = args.files
+    files.sort()
+    main(files)
--- a/table_ocr/prepare_pdfs.py
+++ b/table_ocr/prepare_pdfs.py
@ -1,105 +0,0 @@
-import argparse
-import logging
-import os
-import re
-import subprocess
-import sys
-
-from table_ocr.util import working_dir, make_tempdir
-
-
-def get_logger(name):
-    logger = logging.getLogger(name)
-    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter(logging.BASIC_FORMAT)
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    handler.setLevel(lvl)
-    logger.setLevel(lvl)
-    return logger
-
-logger = get_logger()
-
-parser = argparse.ArgumentParser()
-parser.add_argument("files", nargs="+")
-
-def main(files):
-    pdf_images = []
-    for f in files:
-        pdf_images.append((f, pdf_to_images(f)))
-
-    for pdf, images in pdf_images:
-        for image in images:
-            preprocess_img(image)
-
-    for pdf, images in pdf_images:
-        print("{}\n{}\n".format(pdf, "\n".join(images)))
-
-
-def pdf_to_images(pdf_filepath):
-    """
-    Turn a pdf into images
-    """
-    directory, filename = os.path.split(pdf_filepath)
-    with working_dir(directory):
-        image_filenames = pdfimages(pdf_filepath)
-
-    # Since pdfimages creates a number of files named each for there page number
-    # and doesn't return us the list that it created
-    return [os.path.join(directory, f) for f in image_filenames]
-
-
-def pdfimages(pdf_filepath):
-    """
-    Uses the `pdfimages` utility from Poppler
-    (https://poppler.freedesktop.org/). Creates images out of each page. Images
-    are prefixed by their name sans extension and suffixed by their page number.
-
-    This should work up to pdfs with 999 pages since find matching files in dir
-    uses 3 digits in its regex.
-    """
-    directory, filename = os.path.split(pdf_filepath)
-    filename_sans_ext = filename.split(".pdf")[0]
-    subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
-    image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
-    logger.debug(
-        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
-    )
-    return image_filenames
-
-
-def find_matching_files_in_dir(file_prefix, directory):
-    files = [
-        filename
-        for filename in os.listdir(directory)
-        if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename)
-    ]
-    return files
-def preprocess_img(filepath):
-    """
-    Processing that involves running shell executables,
-    like mogrify to rotate.
-    """
-    rotate = get_rotate(filepath)
-    logger.debug("Rotating {} by {}.".format(filepath, rotate))
-    mogrify(filepath, rotate)
-
-
-def get_rotate(image_filepath):
-    output = (
-        subprocess.check_output(["tesseract", "--psm", "0", image_filepath, "-"])
-        .decode("utf-8")
-        .split("\n")
-    )
-    output = next(l for l in output if "Rotate: " in l)
-    output = output.split(": ")[1]
-    return output
-
-
-def mogrify(image_filepath, rotate):
-    subprocess.run(["mogrify", "-rotate", rotate, image_filepath])
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    main(args.files)