Add module for outputting csv from parsed table

Make cell extraction a little more accurate.
6 years ago · e49fffa5a7
parent de398f73c2
commit e49fffa5a7
4 changed files with 169 additions and 36 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@ -482,19 +482,48 @@ much fewer than the width of the text. If that's the case, then we can remove
 that noise with a simple open morph.

 Once the stray border pixels have been removed, we can expand our border using
-~openMakeBorder~.
+~copyMakeBorder~.

 #+BEGIN_SRC python :eval no :noweb-ref crop-to-text
 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+
+    # Dilate so each digit is connected, so we can get a bounding rectangle
+    # around all of the digits as one contour. This will make the bounding
+    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
+    # out at the end so that we don't pick up stray border pixels.
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
+    dilated = cv2.dilate(opened, kernel)
+
+    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

-    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+
+    if bounding_rects:
+        # The largest contour is certainly the text that we're looking for.
+        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+        x, y, w, h = largest_rect
+        # Commas sometimes go a little below the bounding box and we don't want
+        # to lose them or turn them into periods.
+        img_h, img_w = image.shape
+        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+    else:
+        cropped = image
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
 #+END_SRC
@ -513,20 +542,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
 #+ATTR_HTML: :width 200px :height 100%
 [[file:resources/examples/example-table-cell-1-1-cropped.png]]

-#+HEADER: :post html-image-size(text=*this*, width="200px")
-#+BEGIN_SRC python :noweb no-export :results raw :exports both
-import cv2
-<<crop-to-text>>
-image = cv2.imread("/tmp/example-1/cells/001-002.png", cv2.IMREAD_GRAYSCALE)
-image = crop_to_text(image)
-cv2.imwrite("/tmp/example-1/cells/001-002-cropped.png", image)
-"/tmp/example-1/cells/001-002-cropped.png"
-#+END_SRC
-
-#+RESULTS:
-#+ATTR_HTML: :width 200px :height 100%
-[[file:/tmp/example-1/cells/001-002-cropped.png]]
-

 ** OCR each cell

@ -543,9 +558,8 @@ period into a comma, then you might need to do some custom Tesseract training.

 #+BEGIN_SRC python :noweb-ref ocr-image :eval no
 def ocr_image(image, config):
-    cropped = crop_to_text(image)
    return pytesseract.image_to_string(
-        ~cropped,
+        image,
        config=config
    )
 #+END_SRC
@ -556,6 +570,7 @@ import cv2
 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
 <<crop-to-text>>
 <<ocr-image>>
+image = crop_to_text(image)
 ocr_image(image, "--psm 7")
 #+END_SRC

@ -777,6 +792,9 @@ if __name__ == "__main__":

 This does a little bit of cleanup before sending it through tesseract.

+Creates images and text files that can be used for training tesseract. See
+https://github.com/tesseract-ocr/tesstrain.
+
 #+BEGIN_SRC shell :results output
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
@ -785,7 +803,8 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
 #+RESULTS:
 : PRIZE

-#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
+import os
 import sys

 import cv2
@ -795,13 +814,59 @@ import pytesseract
 <<ocr-image>>

 def main(f):
+    directory, filename = os.path.split(f)
+    filename_sans_ext, ext = os.path.splitext(filename)
    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
+    ocr_data_dir = os.path.join(directory, "ocr_data")
+    os.makedirs(ocr_data_dir, exist_ok=True)
+    out_imagepath = os.path.join(ocr_data_dir, filename)
+    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
+    cv2.imwrite(out_imagepath, cropped)
+    txt = ocr_image(cropped, "--psm 7")
+    with open(out_txtpath, "w") as txt_file:
+        txt_file.write(txt)

 if __name__ == "__main__":
    main(sys.argv[1])
 #+END_SRC

+*** table_ocr/ocr_to_csv.py
+
+#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
+import argparse
+import csv
+import io
+import os
+import sys
+import tempfile
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    rows = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        with open(f) as of:
+            txt = of.read()
+        row, column = map(int, filename.split(".")[0].split("-"))
+        if row == len(rows):
+            rows.append([])
+        rows[row].append(txt)
+
+    csv_file = io.StringIO()
+    writer = csv.writer(csv_file)
+    writer.writerows(rows)
+    print(csv_file.getvalue())
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
+
+
+#+END_SRC
+
 * Utils

 The following code lets us specify a size for images when they are exported to
--- a/resources/examples/example-table-cell-1-1-cropped.png
+++ b/resources/examples/example-table-cell-1-1-cropped.png
--- a/table_ocr/ocr_image.py
+++ b/table_ocr/ocr_image.py
@ -1,30 +1,69 @@
+import os
 import sys

 import cv2
 import pytesseract

 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+
+    # Dilate so each digit is connected, so we can get a bounding rectangle
+    # around all of the digits as one contour. This will make the bounding
+    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
+    # out at the end so that we don't pick up stray border pixels.
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
+    dilated = cv2.dilate(opened, kernel)
+
+    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

-    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+
+    if bounding_rects:
+        # The largest contour is certainly the text that we're looking for.
+        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+        x, y, w, h = largest_rect
+        # Commas sometimes go a little below the bounding box and we don't want
+        # to lose them or turn them into periods.
+        img_h, img_w = image.shape
+        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+    else:
+        cropped = image
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
 def ocr_image(image, config):
-    cropped = crop_to_text(image)
    return pytesseract.image_to_string(
-        ~cropped,
+        image,
        config=config
    )

 def main(f):
+    directory, filename = os.path.split(f)
+    filename_sans_ext, ext = os.path.splitext(filename)
    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
+    ocr_data_dir = os.path.join(directory, "ocr_data")
+    os.makedirs(ocr_data_dir, exist_ok=True)
+    out_imagepath = os.path.join(ocr_data_dir, filename)
+    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
+    cv2.imwrite(out_imagepath, cropped)
+    txt = ocr_image(cropped, "--psm 7")
+    with open(out_txtpath, "w") as txt_file:
+        txt_file.write(txt)

 if __name__ == "__main__":
    main(sys.argv[1])
--- a/table_ocr/ocr_to_csv.py
+++ b/table_ocr/ocr_to_csv.py
@ -0,0 +1,29 @@
+import argparse
+import csv
+import io
+import os
+import sys
+import tempfile
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    rows = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        with open(f) as of:
+            txt = of.read()
+        row, column = map(int, filename.split(".")[0].split("-"))
+        if row == len(rows):
+            rows.append([])
+        rows[row].append(txt)
+
+    csv_file = io.StringIO()
+    writer = csv.writer(csv_file)
+    writer.writerows(rows)
+    print(csv_file.getvalue())
+   
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)