Add module for outputting csv from parsed table

Make cell extraction a little more accurate.
6 years ago · e49fffa5a7
parent de398f73c2
commit e49fffa5a7
4 changed files with 169 additions and 36 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@ -482,19 +482,48 @@ much fewer than the width of the text. If that's the case, then we can remove
 that noise with a simple open morph.
 Once the stray border pixels have been removed, we can expand our border using
-~openMakeBorder~.
+~copyMakeBorder~.
 #+BEGIN_SRC python :eval no :noweb-ref crop-to-text
 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
+    MAX_COLOR_VAL = 255
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
    # Dilate so each digit is connected, so we can get a bounding rectangle
    # around all of the digits as one contour. This will make the bounding
    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
    # out at the end so that we don't pick up stray border pixels.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
    dilated = cv2.dilate(opened, kernel)
    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    if bounding_rects:
        # The largest contour is certainly the text that we're looking for.
        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
        x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+        # Commas sometimes go a little below the bounding box and we don't want
        # to lose them or turn them into periods.
        img_h, img_w = image.shape
        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
    else:
        cropped = image
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
 #+END_SRC
@ -513,20 +542,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
 #+ATTR_HTML: :width 200px :height 100%
 [[file:resources/examples/example-table-cell-1-1-cropped.png]]
 #+HEADER: :post html-image-size(text=*this*, width="200px")
 #+BEGIN_SRC python :noweb no-export :results raw :exports both
 import cv2
 <<crop-to-text>>
 image = cv2.imread("/tmp/example-1/cells/001-002.png", cv2.IMREAD_GRAYSCALE)
 image = crop_to_text(image)
 cv2.imwrite("/tmp/example-1/cells/001-002-cropped.png", image)
 "/tmp/example-1/cells/001-002-cropped.png"
 #+END_SRC
 #+RESULTS:
 #+ATTR_HTML: :width 200px :height 100%
 [[file:/tmp/example-1/cells/001-002-cropped.png]]
 ** OCR each cell
@ -543,9 +558,8 @@ period into a comma, then you might need to do some custom Tesseract training.
 #+BEGIN_SRC python :noweb-ref ocr-image :eval no
 def ocr_image(image, config):
    cropped = crop_to_text(image)
    return pytesseract.image_to_string(
-        ~cropped,
+        image,
        config=config
    )
 #+END_SRC
@ -556,6 +570,7 @@ import cv2
 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
 <<crop-to-text>>
 <<ocr-image>>
 image = crop_to_text(image)
 ocr_image(image, "--psm 7")
 #+END_SRC
@ -777,6 +792,9 @@ if __name__ == "__main__":
 This does a little bit of cleanup before sending it through tesseract.
 Creates images and text files that can be used for training tesseract. See
 https://github.com/tesseract-ocr/tesstrain.
 #+BEGIN_SRC shell :results output
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
@ -785,7 +803,8 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
 #+RESULTS:
 : PRIZE
-#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
 import os
 import sys
 import cv2
@ -795,13 +814,59 @@ import pytesseract
 <<ocr-image>>
 def main(f):
    directory, filename = os.path.split(f)
    filename_sans_ext, ext = os.path.splitext(filename)
    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)
    txt = ocr_image(cropped, "--psm 7")
    with open(out_txtpath, "w") as txt_file:
        txt_file.write(txt)
 if __name__ == "__main__":
    main(sys.argv[1])
 #+END_SRC
 *** table_ocr/ocr_to_csv.py
 #+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
 import argparse
 import csv
 import io
 import os
 import sys
 import tempfile
 # Command-line interface: takes one or more file paths as positional
 # arguments (nargs="+" requires at least one).
 parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")
 def main(files):
    """Print the contents of the given text files as CSV on stdout.

    Each path in ``files`` must have a basename of the form
    ``<row>-<column>.<ext>`` with integer row and column indices, e.g.
    ``000-001.txt``. The full content of each file becomes one cell.

    Cells are placed according to the indices parsed from the filename, so
    the result does not depend on the order in which the paths are passed.
    Paths that are already sorted by row and column produce the same output
    as a plain in-order append would. Row indices that never occur are
    skipped rather than emitted as empty rows.

    Raises:
        ValueError: if a basename does not parse as ``<row>-<column>``.
        OSError: if a file cannot be opened or read.
    """
    cells_by_row = {}
    for path in files:
        filename = os.path.basename(path)
        with open(path) as text_file:
            txt = text_file.read()
        row, column = map(int, filename.split(".")[0].split("-"))
        cells_by_row.setdefault(row, []).append((column, txt))
    # sorted() is stable, so cells sharing a column keep their argument order.
    rows = [
        [txt for _, txt in sorted(cells_by_row[row], key=lambda cell: cell[0])]
        for row in sorted(cells_by_row)
    ]
    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    print(csv_file.getvalue())
 # Parse the command line only when run as a script, then pass the collected
 # file paths to main().
 if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)
 #+END_SRC
 * Utils
 The following code lets us specify a size for images when they are exported to
--- a/resources/examples/example-table-cell-1-1-cropped.png
+++ b/resources/examples/example-table-cell-1-1-cropped.png
--- a/table_ocr/ocr_image.py
+++ b/table_ocr/ocr_image.py
@ -1,30 +1,69 @@
 import os
 import sys
 import cv2
 import pytesseract
 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
+    MAX_COLOR_VAL = 255
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
    # Dilate so each digit is connected, so we can get a bounding rectangle
    # around all of the digits as one contour. This will make the bounding
    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
    # out at the end so that we don't pick up stray border pixels.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
    dilated = cv2.dilate(opened, kernel)
    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    if bounding_rects:
        # The largest contour is certainly the text that we're looking for.
        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
        x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+        # Commas sometimes go a little below the bounding box and we don't want
        # to lose them or turn them into periods.
        img_h, img_w = image.shape
        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
    else:
        cropped = image
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
 def ocr_image(image, config):
    cropped = crop_to_text(image)
    return pytesseract.image_to_string(
-        ~cropped,
+        image,
        config=config
    )
 def main(f):
    directory, filename = os.path.split(f)
    filename_sans_ext, ext = os.path.splitext(filename)
    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
    ocr_data_dir = os.path.join(directory, "ocr_data")
    os.makedirs(ocr_data_dir, exist_ok=True)
    out_imagepath = os.path.join(ocr_data_dir, filename)
    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
    cv2.imwrite(out_imagepath, cropped)
    txt = ocr_image(cropped, "--psm 7")
    with open(out_txtpath, "w") as txt_file:
        txt_file.write(txt)
 if __name__ == "__main__":
    main(sys.argv[1])
--- a/table_ocr/ocr_to_csv.py
+++ b/table_ocr/ocr_to_csv.py
@ -0,0 +1,29 @@
 import argparse
 import csv
 import io
 import os
 import sys
 import tempfile
 # Command-line interface: takes one or more file paths as positional
 # arguments (nargs="+" requires at least one).
 parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")
 def main(files):
    """Print the contents of the given text files as CSV on stdout.

    Each path in ``files`` must have a basename of the form
    ``<row>-<column>.<ext>`` with integer row and column indices, e.g.
    ``000-001.txt``. The full content of each file becomes one cell.

    Cells are placed according to the indices parsed from the filename, so
    the result does not depend on the order in which the paths are passed.
    Paths that are already sorted by row and column produce the same output
    as a plain in-order append would. Row indices that never occur are
    skipped rather than emitted as empty rows.

    Raises:
        ValueError: if a basename does not parse as ``<row>-<column>``.
        OSError: if a file cannot be opened or read.
    """
    cells_by_row = {}
    for path in files:
        filename = os.path.basename(path)
        with open(path) as text_file:
            txt = text_file.read()
        row, column = map(int, filename.split(".")[0].split("-"))
        cells_by_row.setdefault(row, []).append((column, txt))
    # sorted() is stable, so cells sharing a column keep their argument order.
    rows = [
        [txt for _, txt in sorted(cells_by_row[row], key=lambda cell: cell[0])]
        for row in sorted(cells_by_row)
    ]
    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    print(csv_file.getvalue())
 # Parse the command line only when run as a script, then pass the collected
 # file paths to main().
 if __name__ == "__main__":
    args = parser.parse_args()
    main(args.files)