diff --git a/ocr_tables b/ocr_tables
new file mode 100755
index 0000000..d8c3217
--- /dev/null
+++ b/ocr_tables
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+PDF=$1
+
+python -m table_ocr.prepare_pdfs $PDF | grep .png > /tmp/pdf-images.txt
+cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt
+cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} | grep cells > /tmp/extracted-cells.txt
+cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l data-table
+
+for image in $(cat /tmp/extracted-tables.txt); do
+    dir=$(dirname $image)
+    python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt")
+done
diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org
index 6004b31..04007c0 100644
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@@ -26,16 +26,33 @@
 output~ to a code block will minimize that noise.
 
 #+BEGIN_SRC shell :results none :session *Shell*
 TABLES=("/tmp/example-1/example-1.pdf" "/tmp/example-2/example-2.pdf")
-python -m table_ocr.prepare_pdfs $TABLES | grep .png > /tmp/pdf_images.txt
-# All pngs that don't have "table" in their name. Assume "table" has already been found for files with table in name.
-cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} # | grep tables > /tmp/extracted-tables.txt
-cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} # | grep cells > /tmp/extracted-cells.txt
+python -m table_ocr.prepare_pdfs $TABLES | grep .png > /tmp/pdf-images.txt
+cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt
+cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} | grep cells > /tmp/extracted-cells.txt
 cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {}
 # This next one needs to be run on each subdirectory one at a time.
 python -m table_ocr.ocr_to_csv $(find . -iregex ".*cells.*ocr_data.*\.txt" 2>/dev/null)
 #+END_SRC
 
+Or, as a shell script.
+
+#+BEGIN_SRC shell :results none :tangle ocr_tables :tangle-mode (identity #o755)
+#!/bin/sh
+
+PDF=$1
+
+python -m table_ocr.prepare_pdfs $PDF | grep .png > /tmp/pdf-images.txt
+cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt
+cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} | grep cells > /tmp/extracted-cells.txt
+cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l data-table
+
+for image in $(cat /tmp/extracted-tables.txt); do
+    dir=$(dirname $image)
+    python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt")
+done
+#+END_SRC
+
 * Preparing data
 
 ** Converting PDFs to images
@@ -506,30 +523,32 @@ def crop_to_text(image):
         SUBTRACT_FROM_MEAN,
     )
 
-    # Get rid of littl noise.
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
-    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+    img_h, img_w = image.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
+    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    both = horizontal_lines + vertical_lines
+    cleaned = img_bin - both
 
-    # Dilate so each digit is connected, so we can get a bounding rectangle
-    # around all of the digits as one contour. This will make the bounding
-    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
-    # out at the end so that we don't pick up stray border pixels.
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
-    dilated = cv2.dilate(opened, kernel)
-
-    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
+    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
+    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     bounding_rects = [cv2.boundingRect(c) for c in contours]
-
+    NUM_PX_COMMA = 6
     if bounding_rects:
-        # The largest contour is certainly the text that we're looking for.
-        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-        x, y, w, h = largest_rect
-        # Commas sometimes go a little below the bounding box and we don't want
-        # to lost them or turn them into periods.
-        img_h, img_w = image.shape
-        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
+        for x, y, w, h in bounding_rects:
+            minx = min(minx, x)
+            miny = min(miny, y)
+            maxx = max(maxx, x + w)
+            maxy = max(maxy, y + h)
+        x, y, w, h = minx, miny, maxx - minx, maxy - miny
+        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
     else:
+        # If we morphed out all of the text, fall back to using the unmorphed image.
        cropped = image
     bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
     return bordered
@@ -549,7 +568,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
 #+ATTR_HTML: :width 200px :height 100%
 [[file:resources/examples/example-table-cell-1-1-cropped.png]]
 
-
 ** OCR each cell
 
 If we cleaned up the images well enough, we might get some accurate OCR!
@@ -813,31 +831,44 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
 : PRIZE
 
 #+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none
+import argparse
+import math
 import os
 import sys
 
 import cv2
 import pytesseract
 
+description="""Takes a single argument that is the image to OCR.
+Remaining arguments are passed directly to Tesseract.
+
+Attempts to make OCR more accurate by performing some modifications on the image.
+Saves the modified image and the OCR text in an `ocr_data` directory.
+Filenames are in the format expected for training with tesstrain."""
+parser = argparse.ArgumentParser(description=description)
+parser.add_argument("image", help="filepath of image to perform OCR")
+
 <>
 <>
 
-def main(f):
-    directory, filename = os.path.split(f)
+def main(image_file, tess_args):
+    directory, filename = os.path.split(image_file)
     filename_sans_ext, ext = os.path.splitext(filename)
-    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
     cropped = crop_to_text(image)
     ocr_data_dir = os.path.join(directory, "ocr_data")
     os.makedirs(ocr_data_dir, exist_ok=True)
     out_imagepath = os.path.join(ocr_data_dir, filename)
     out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
     cv2.imwrite(out_imagepath, cropped)
-    txt = ocr_image(cropped, "--psm 7")
+    txt = ocr_image(cropped, " ".join(tess_args))
+    print(txt)
     with open(out_txtpath, "w") as txt_file:
         txt_file.write(txt)
 
 if __name__ == "__main__":
-    main(sys.argv[1])
+    args, tess_args = parser.parse_known_args()
+    main(args.image, tess_args)
 #+END_SRC
 
 *** table_ocr/ocr_to_csv.py
@@ -854,6 +885,13 @@ parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")
 
 def main(files):
+    """Files must be sorted lexicographically.
+    Filenames must be <row>-<column>.txt.
+    000-000.txt
+    000-001.txt
+    001-000.txt
+    etc...
+    """
     rows = []
     for f in files:
         directory, filename = os.path.split(f)
@@ -871,9 +909,9 @@ def main(files):
 
 if __name__ == "__main__":
     args = parser.parse_args()
-    main(args.files)
-
-
+    files = args.files
+    files.sort()
+    main(files)
 #+END_SRC
 
 * Utils
@@ -899,10 +937,6 @@ with ~advice-add~.
   (concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
 #+END_SRC
 
-#+RESULTS: html-image-size
-#+ATTR_HTML: :width 100% :height 100%
-[[file:]]
-
 #+BEGIN_SRC emacs-lisp :results none
 (defun remove-attributes-from-src-block-result (&rest args)
   (let ((location (org-babel-where-is-src-block-result))
diff --git a/resources/examples/example-table-cell-1-1-cropped.png b/resources/examples/example-table-cell-1-1-cropped.png
index 4aba6ec..5bbcbe9 100644
Binary files a/resources/examples/example-table-cell-1-1-cropped.png and b/resources/examples/example-table-cell-1-1-cropped.png differ
diff --git a/table_ocr/ocr_image.py b/table_ocr/ocr_image.py
index e2c886b..f92e786 100644
--- a/table_ocr/ocr_image.py
+++ b/table_ocr/ocr_image.py
@@ -1,9 +1,20 @@
+import argparse
+import math
 import os
 import sys
 
 import cv2
 import pytesseract
 
+description="""Takes a single argument that is the image to OCR.
+Remaining arguments are passed directly to Tesseract.
+
+Attempts to make OCR more accurate by performing some modifications on the image.
+Saves the modified image and the OCR text in an `ocr_data` directory.
+Filenames are in the format expected for training with tesstrain."""
+parser = argparse.ArgumentParser(description=description)
+parser.add_argument("image", help="filepath of image to perform OCR")
+
 def crop_to_text(image):
     MAX_COLOR_VAL = 255
     BLOCK_SIZE = 15
@@ -18,30 +29,32 @@ def crop_to_text(image):
         SUBTRACT_FROM_MEAN,
     )
 
-    # Get rid of littl noise.
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
-    opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
+    img_h, img_w = image.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
+    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    both = horizontal_lines + vertical_lines
+    cleaned = img_bin - both
 
-    # Dilate so each digit is connected, so we can get a bounding rectangle
-    # around all of the digits as one contour. This will make the bounding
-    # rectangle 8 pixels wider on the left and right, so we'll need to crop that
-    # out at the end so that we don't pick up stray border pixels.
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
-    dilated = cv2.dilate(opened, kernel)
-
-    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
+    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
+    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     bounding_rects = [cv2.boundingRect(c) for c in contours]
-
+    NUM_PX_COMMA = 6
    if bounding_rects:
-        # The largest contour is certainly the text that we're looking for.
-        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-        x, y, w, h = largest_rect
-        # Commas sometimes go a little below the bounding box and we don't want
-        # to lost them or turn them into periods.
-        img_h, img_w = image.shape
-        cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
+        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
+        for x, y, w, h in bounding_rects:
+            minx = min(minx, x)
+            miny = min(miny, y)
+            maxx = max(maxx, x + w)
+            maxy = max(maxy, y + h)
+        x, y, w, h = minx, miny, maxx - minx, maxy - miny
+        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
     else:
+        # If we morphed out all of the text, fall back to using the unmorphed image.
         cropped = image
     bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
     return bordered
@@ -51,19 +64,21 @@ def ocr_image(image, config):
         config=config
     )
 
-def main(f):
-    directory, filename = os.path.split(f)
+def main(image_file, tess_args):
+    directory, filename = os.path.split(image_file)
     filename_sans_ext, ext = os.path.splitext(filename)
-    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
     cropped = crop_to_text(image)
     ocr_data_dir = os.path.join(directory, "ocr_data")
     os.makedirs(ocr_data_dir, exist_ok=True)
     out_imagepath = os.path.join(ocr_data_dir, filename)
     out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
     cv2.imwrite(out_imagepath, cropped)
-    txt = ocr_image(cropped, "--psm 7")
+    txt = ocr_image(cropped, " ".join(tess_args))
+    print(txt)
     with open(out_txtpath, "w") as txt_file:
         txt_file.write(txt)
 
 if __name__ == "__main__":
-    main(sys.argv[1])
+    args, tess_args = parser.parse_known_args()
+    main(args.image, tess_args)
diff --git a/table_ocr/ocr_to_csv.py b/table_ocr/ocr_to_csv.py
index d390bd6..2560233 100644
--- a/table_ocr/ocr_to_csv.py
+++ b/table_ocr/ocr_to_csv.py
@@ -9,6 +9,13 @@ parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")
 
 def main(files):
+    """Files must be sorted lexicographically.
+    Filenames must be <row>-<column>.txt.
+    000-000.txt
+    000-001.txt
+    001-000.txt
+    etc...
+    """
     rows = []
     for f in files:
         directory, filename = os.path.split(f)
@@ -26,4 +33,6 @@ def main(files):
 
 if __name__ == "__main__":
     args = parser.parse_args()
-    main(args.files)
+    files = args.files
+    files.sort()
+    main(files)
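
A note on the ~files.sort()~ call added above: ~ocr_to_csv~ assembles the CSV one row at a time from files named by row and column (the ~000-000.txt~, ~000-001.txt~ pattern in the docstring), so a plain lexicographic sort hands it the cells grouped by row and ordered by column within each row. The sketch below only illustrates that grouping; it is not part of the diff, and the filenames in it are hypothetical.

#+BEGIN_SRC python
import os

# Hypothetical cell-text filenames following the <row>-<column>.txt naming
# scheme described in the ocr_to_csv docstring.
files = ["001-000.txt", "000-001.txt", "000-000.txt"]

# Lexicographic order groups cells by row and orders columns within each row,
# which is the precondition the docstring states.
files.sort()

rows = []
for f in files:
    row, _col = os.path.splitext(os.path.basename(f))[0].split("-")
    if not rows or rows[-1][0] != row:
        rows.append((row, []))
    rows[-1][1].append(f)

print(rows)  # [('000', ['000-000.txt', '000-001.txt']), ('001', ['001-000.txt'])]
#+END_SRC

Relatedly, the reworked ~ocr_image~ entry point uses ~parse_known_args~ so that any arguments it does not recognize are forwarded to Tesseract, which is what lets the ~ocr_tables~ script pass ~--psm 7 -l data-table~ straight through.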