diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org
index b77687b..1ab9ed5 100644
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@@ -482,19 +482,48 @@ much fewer than the width of the text. If that's the case, then we can remove
 that noise with a simple open morph.
 
 Once the stray border pixels have been removed, we can expand our border using
-~openMakeBorder~.
+~copyMakeBorder~.
 
 #+BEGIN_SRC python :eval no :noweb-ref crop-to-text
 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+
+    # Dilate so each digit is connected, so we can get a bounding rectangle
+    # around all of the digits as one contour. This will make the bounding
+    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
+    # out at the end so that we don't pick up stray border pixels.
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
+    dilated = cv2.dilate(opened, kernel)
+
+    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
-    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+
+    if bounding_rects:
+        # The largest contour is certainly the text that we're looking for.
+        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+        x, y, w, h = largest_rect
+        # Commas sometimes go a little below the bounding box and we don't want
+        # to lose them or turn them into periods.
+        img_h, img_w = image.shape
+        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+    else:
+        cropped = image
     bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
     return bordered
 #+END_SRC
@@ -513,20 +542,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
 
 #+ATTR_HTML: :width 200px :height 100%
 [[file:resources/examples/example-table-cell-1-1-cropped.png]]
 
-#+HEADER: :post html-image-size(text=*this*, width="200px")
-#+BEGIN_SRC python :noweb no-export :results raw :exports both
-import cv2
-<>
-image = cv2.imread("/tmp/example-1/cells/001-002.png", cv2.IMREAD_GRAYSCALE)
-image = crop_to_text(image)
-cv2.imwrite("/tmp/example-1/cells/001-002-cropped.png", image)
-"/tmp/example-1/cells/001-002-cropped.png"
-#+END_SRC
-
-#+RESULTS:
-#+ATTR_HTML: :width 200px :height 100%
-[[file:/tmp/example-1/cells/001-002-cropped.png]]
-
 ** OCR each cell
 
@@ -543,9 +558,8 @@ period into a comma, then you might need to do some custom Tesseract training.
 
 #+BEGIN_SRC python :noweb-ref ocr-image :eval no
 def ocr_image(image, config):
-    cropped = crop_to_text(image)
     return pytesseract.image_to_string(
-        ~cropped,
+        image,
         config=config
     )
 #+END_SRC
@@ -556,6 +570,7 @@ import cv2
 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
 <>
 <>
+image = crop_to_text(image)
 ocr_image(image, "--psm 7")
 #+END_SRC
 
@@ -777,6 +792,9 @@ if __name__ == "__main__":
 
 This does a little bit of cleanup before sending it through tesseract.
 
+This also creates images and text files that can be used for training Tesseract. See
+https://github.com/tesseract-ocr/tesstrain.
+
 #+BEGIN_SRC shell :results output
 . ~/.virtualenvs/lotto_odds/bin/activate
 python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
@@ -785,7 +803,8 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
 #+RESULTS:
 : PRIZE
 
-#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none
+#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
+import os
 import sys
 
 import cv2
@@ -795,13 +814,59 @@ import pytesseract
 <>
 
 def main(f):
+    directory, filename = os.path.split(f)
+    filename_sans_ext, ext = os.path.splitext(filename)
     image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
+    ocr_data_dir = os.path.join(directory, "ocr_data")
+    os.makedirs(ocr_data_dir, exist_ok=True)
+    out_imagepath = os.path.join(ocr_data_dir, filename)
+    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
+    cv2.imwrite(out_imagepath, cropped)
+    txt = ocr_image(cropped, "--psm 7")
+    with open(out_txtpath, "w") as txt_file:
+        txt_file.write(txt)
 
 if __name__ == "__main__":
     main(sys.argv[1])
 #+END_SRC
 
+*** table_ocr/ocr_to_csv.py
+
+#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py
+import argparse
+import csv
+import io
+import os
+import sys
+import tempfile
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    rows = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        with open(f) as of:
+            txt = of.read()
+        row, column = map(int, filename.split(".")[0].split("-"))
+        if row == len(rows):
+            rows.append([])
+        rows[row].append(txt)
+
+    csv_file = io.StringIO()
+    writer = csv.writer(csv_file)
+    writer.writerows(rows)
+    print(csv_file.getvalue())
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
+
+
+#+END_SRC
+
 * Utils
 
 The following code lets us specify a size for images when they are exported to
diff --git a/resources/examples/example-table-cell-1-1-cropped.png b/resources/examples/example-table-cell-1-1-cropped.png
index 2ba2327..4aba6ec 100644
Binary files a/resources/examples/example-table-cell-1-1-cropped.png and b/resources/examples/example-table-cell-1-1-cropped.png differ
diff --git a/table_ocr/ocr_image.py b/table_ocr/ocr_image.py
index b15a28c..e2c886b 100644
--- a/table_ocr/ocr_image.py
+++ b/table_ocr/ocr_image.py
@@ -1,30 +1,69 @@
+import os
 import sys
 
 import cv2
 import pytesseract
 
 def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+
+    # Dilate so each digit is connected, so we can get a bounding rectangle
+    # around all of the digits as one contour. This will make the bounding
+    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
+    # out at the end so that we don't pick up stray border pixels.
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
+    dilated = cv2.dilate(opened, kernel)
+
+    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
-    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+
+    if bounding_rects:
+        # The largest contour is certainly the text that we're looking for.
+        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+        x, y, w, h = largest_rect
+        # Commas sometimes go a little below the bounding box and we don't want
+        # to lose them or turn them into periods.
+        img_h, img_w = image.shape
+        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+    else:
+        cropped = image
     bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
     return bordered
 def ocr_image(image, config):
-    cropped = crop_to_text(image)
     return pytesseract.image_to_string(
-        ~cropped,
+        image,
         config=config
     )
 
 def main(f):
+    directory, filename = os.path.split(f)
+    filename_sans_ext, ext = os.path.splitext(filename)
     image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-    print(ocr_image(image, "--psm 7"))
+    cropped = crop_to_text(image)
+    ocr_data_dir = os.path.join(directory, "ocr_data")
+    os.makedirs(ocr_data_dir, exist_ok=True)
+    out_imagepath = os.path.join(ocr_data_dir, filename)
+    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
+    cv2.imwrite(out_imagepath, cropped)
+    txt = ocr_image(cropped, "--psm 7")
+    with open(out_txtpath, "w") as txt_file:
+        txt_file.write(txt)
 
 if __name__ == "__main__":
     main(sys.argv[1])
diff --git a/table_ocr/ocr_to_csv.py b/table_ocr/ocr_to_csv.py
new file mode 100644
index 0000000..98eff38
--- /dev/null
+++ b/table_ocr/ocr_to_csv.py
@@ -0,0 +1,29 @@
+import argparse
+import csv
+import io
+import os
+import sys
+import tempfile
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+def main(files):
+    rows = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        with open(f) as of:
+            txt = of.read()
+        row, column = map(int, filename.split(".")[0].split("-"))
+        if row == len(rows):
+            rows.append([])
+        rows[row].append(txt)
+
+    csv_file = io.StringIO()
+    writer = csv.writer(csv_file)
+    writer.writerows(rows)
+    print(csv_file.getvalue())
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
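
For readers wiring the pieces from this patch together, the sketch below is one possible way to run the two new modules end to end in a single process, skipping the intermediate ~.gt.txt~ files. It is not part of the patch: it assumes the code above has been tangled into an importable ~table_ocr~ package, that cell images follow the ~000-000.png~ row-column naming used earlier in this document, and the helper name ~cells_to_csv~ is invented here purely for illustration.

#+BEGIN_SRC python :eval no
# Illustrative glue only; not part of the patch. Assumes table_ocr/ocr_image.py
# has been tangled as shown above and is importable as table_ocr.ocr_image.
import csv
import glob
import os
import sys

import cv2

from table_ocr.ocr_image import crop_to_text, ocr_image

def cells_to_csv(cells_dir, out_path):
    # Cell images are assumed to be named <row>-<column>.png, e.g. 000-000.png.
    rows = {}
    for path in sorted(glob.glob(os.path.join(cells_dir, "*.png"))):
        name = os.path.splitext(os.path.basename(path))[0]
        row, col = map(int, name.split("-"))
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        cropped = crop_to_text(image)
        # --psm 7 tells Tesseract to treat the cropped cell as a single line of text.
        rows.setdefault(row, {})[col] = ocr_image(cropped, "--psm 7").strip()
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        for row in sorted(rows):
            writer.writerow([rows[row][col] for col in sorted(rows[row])])

if __name__ == "__main__":
    cells_to_csv(sys.argv[1], sys.argv[2])
#+END_SRC

Run, for example, as ~python glue.py resources/examples/cells out.csv~: it OCRs every cell image in the directory and writes one CSV row per table row, the same row/column convention that ~table_ocr.ocr_to_csv~ relies on when it parses the ~.gt.txt~ filenames.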