From e49fffa5a72fe6801b88cfaaaca11e70d297c5f0 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Tue, 14 Apr 2020 10:42:58 -0700 Subject: [PATCH] Add module for outputting csv from parsed table Make cell extraction a little more accurate. --- pdf_table_extraction_and_ocr.org | 117 ++++++++++++++---- .../example-table-cell-1-1-cropped.png | Bin 1058 -> 1026 bytes table_ocr/ocr_image.py | 59 +++++++-- table_ocr/ocr_to_csv.py | 29 +++++ 4 files changed, 169 insertions(+), 36 deletions(-) create mode 100644 table_ocr/ocr_to_csv.py diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index b77687b..1ab9ed5 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -482,19 +482,48 @@ much fewer than the width of the text. If that's the case, then we can remove that noise with a simple open morph. Once the stray border pixels have been removed, we can expand our border using -~openMakeBorder~. +~copyMakeBorder~. #+BEGIN_SRC python :eval no :noweb-ref crop-to-text def crop_to_text(image): - kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4)) - opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel) + MAX_COLOR_VAL = 255 + BLOCK_SIZE = 15 + SUBTRACT_FROM_MEAN = -2 + + img_bin = cv2.adaptiveThreshold( + ~image, + MAX_COLOR_VAL, + cv2.ADAPTIVE_THRESH_MEAN_C, + cv2.THRESH_BINARY, + BLOCK_SIZE, + SUBTRACT_FROM_MEAN, + ) + + # Get rid of littl noise. + kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) + opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel) + + # Dilate so each digit is connected, so we can get a bounding rectangle + # around all of the digits as one contour. This will make the bounding + # rectangle 8 pixels wider on the left and right, so we'll need to crop that + # out at the end so that we don't pick up stray border pixels. + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1)) + dilated = cv2.dilate(opened, kernel) + + contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) - contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) bounding_rects = [cv2.boundingRect(c) for c in contours] - # The largest contour is certainly the text that we're looking for. - largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) - x, y, w, h = largest_rect - cropped = image[y:y+h, x:x+w] + + if bounding_rects: + # The largest contour is certainly the text that we're looking for. + largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) + x, y, w, h = largest_rect + # Commas sometimes go a little below the bounding box and we don't want + # to lost them or turn them into periods. + img_h, img_w = image.shape + cropped = image[y:min(img_h, y+h+6), x+8:x+w-8] + else: + cropped = image bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) return bordered #+END_SRC @@ -513,20 +542,6 @@ cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image) #+ATTR_HTML: :width 200px :height 100% [[file:resources/examples/example-table-cell-1-1-cropped.png]] -#+HEADER: :post html-image-size(text=*this*, width="200px") -#+BEGIN_SRC python :noweb no-export :results raw :exports both -import cv2 -<> -image = cv2.imread("/tmp/example-1/cells/001-002.png", cv2.IMREAD_GRAYSCALE) -image = crop_to_text(image) -cv2.imwrite("/tmp/example-1/cells/001-002-cropped.png", image) -"/tmp/example-1/cells/001-002-cropped.png" -#+END_SRC - -#+RESULTS: -#+ATTR_HTML: :width 200px :height 100% -[[file:/tmp/example-1/cells/001-002-cropped.png]] - ** OCR each cell @@ -543,9 +558,8 @@ period into a comma, then you might need to do some custom Tesseract training. #+BEGIN_SRC python :noweb-ref ocr-image :eval no def ocr_image(image, config): - cropped = crop_to_text(image) return pytesseract.image_to_string( - ~cropped, + image, config=config ) #+END_SRC @@ -556,6 +570,7 @@ import cv2 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) <> <> +image = crop_to_text(image) ocr_image(image, "--psm 7") #+END_SRC @@ -777,6 +792,9 @@ if __name__ == "__main__": This does a little bit of cleanup before sending it through tesseract. +Creates images and text files that can be used for training tesseract. See +https://github.com/tesseract-ocr/tesstrain. + #+BEGIN_SRC shell :results output . ~/.virtualenvs/lotto_odds/bin/activate python -m table_ocr.ocr_cell resources/examples/cells/000-000.png @@ -785,7 +803,8 @@ python -m table_ocr.ocr_cell resources/examples/cells/000-000.png #+RESULTS: : PRIZE -#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none +#+BEGIN_SRC python :tangle table_ocr/ocr_image.py :mkdirp yes :results none +import os import sys import cv2 @@ -795,13 +814,59 @@ import pytesseract <> def main(f): + directory, filename = os.path.split(f) + filename_sans_ext, ext = os.path.splitext(filename) image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) - print(ocr_image(image, "--psm 7")) + cropped = crop_to_text(image) + ocr_data_dir = os.path.join(directory, "ocr_data") + os.makedirs(ocr_data_dir, exist_ok=True) + out_imagepath = os.path.join(ocr_data_dir, filename) + out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) + cv2.imwrite(out_imagepath, cropped) + txt = ocr_image(cropped, "--psm 7") + with open(out_txtpath, "w") as txt_file: + txt_file.write(txt) if __name__ == "__main__": main(sys.argv[1]) #+END_SRC +*** table_ocr/ocr_to_csv.py + +#+BEGIN_SRC python :tangle table_ocr/ocr_to_csv.py +import argparse +import csv +import io +import os +import sys +import tempfile + +parser = argparse.ArgumentParser() +parser.add_argument("files", nargs="+") + +def main(files): + rows = [] + for f in files: + directory, filename = os.path.split(f) + with open(f) as of: + txt = of.read() + row, column = map(int, filename.split(".")[0].split("-")) + if row == len(rows): + rows.append([]) + rows[row].append(txt) + + csv_file = io.StringIO() + writer = csv.writer(csv_file) + writer.writerows(rows) + print(csv_file.getvalue()) + +if __name__ == "__main__": + args = parser.parse_args() + main(args.files) + + +#+END_SRC + * Utils The following code lets us specify a size for images when they are exported to diff --git a/resources/examples/example-table-cell-1-1-cropped.png b/resources/examples/example-table-cell-1-1-cropped.png index 2ba2327016cfcc28b0fc151be47674463af6f091..4aba6ecd1931960d623cd5c138fb9ca37260c5e9 100644 GIT binary patch literal 1026 zcmV+d1pWJoP) z{YzbS90&0C=X}q@J@=~j-tq~oJt(bgrRi+CwlY#i$krxow9yZvC^1-rAY_daB&x|4 ztF=~A6l_FVAl0yiIT#3Tq_LKXZf@l)ozKpPd+xdCo^w8Z@8}=6uNN`WMT~S2BVEKu z7ctUBjFfA?)-Sp?ZSXW-fvWnAx28w$fTRU9K8kj zP?obSees>s`4eH!`@P5xh^97A5l-|*04cJ(8}$v3lCRin2zqMz!x9f+1fIC zaN_0GdoWc$vZpk$&{le^91=04=#uX%bAQ$bw84DU6S3e-*;M$4C&ZVH+Pkmgp#{FJ z0?j86d<2OYQpS#USiyn4(-l2Bd?l$(tPeE50bG98sXuGw`dhj+i%%VHXoExyDRbu; z6>&43`}FrrA{N3>-FR1>N2^t{65He@cJkVj`$8VPxBye!(+N zRrl?a;Ymoukh15STduF|tIf%}YC`gcEAwtBjvlX?SleeDIFmDjS#@{6Z$l!6)ZR4x zScdLh2;~p4AXfa*%E%ZoVRvtI(44QCe0XKW(I*T4fJ6*w=}@Q0Rd-$-im{^u@J7qE zJHCjE7k(-nbWnZEiWF=a>~$ST#7McU7bhZi$E%xq43`ywi3*cl=T7+dYKwVx#nelKeNDGHh|AW8-;9#E^{pinhq9@AqB&B$9C(rv#pJqdabj%L^I09{h>tY!xQGwR~BCC zi>*#Y&b*oZ-d;$=knF9a$&f?qdb8YrHby6p?pwROuZKc`1@Gq?xKunA%9xAJ#C-!2 zF;Y&qugm;9p4SfW$0u@*Z!(or!zC+A)1wXh9X)w)pjH_l__R_rAQ40ImZQEhldT|O zB#cGfP?y5+#A0;S6hK_)HbE7xio1v*EoWJ(Mz8}&YH(FZNe%Es+yqlBl+?iF=vELC zG5(j-0841hs?DY!3$gH>01~<{kQBZWHi*4T7fbGEtXx!v7!?>+aN^XZ&Rf5Cmdh>;CqWP=#lAVxNb zkqu(FVj)|OPBElLredp(Na~KII|8v#T>6*>3N3}n2w*TT9)gD$E?eIIb={biq}jIW z2v`b$k!!gNhTA&oR5h(ma9g1NUSy+!{}Cf&MQ5K?G!eZII1m%LhBo%zv7+9vZT~SV z>61H-+d`VryFR2o1$@P8$WzqD!j5M|aj9hpE9u4A-7dS2ZX| z_!9*${lFySLA0;vIlscD$%R9e=Z7l#Uj5dQsQ*%2nQJL<5yN$G@0vzfj<);rs!L1& z?Di!yZ#hIm&y6f?A{oi=kSYDpzRA*GBJdEy_07BC6BT~6zNkHYB^3}%=v*?bRWfx~ zSk>oHp!n}uinaGhc_A1=G2s_?EbwZy{S=`H%EhWtSc zDQjDuCC$hnhU?MA9RttiX!hk`Q9Daw;dE6+x`_#EuSVPaBQsm=Fl5Bi)ezR_| zkm#-Cep@DeuYX_IC2%e8NsIXAa_#p}j)038E=RWQ0{0#{FIvU3^)R<=a` zwpay>0QMZ1{ij3*F^@N@hz3j3`|C>iSNx8dm-UUmA zhZwGgEsxJSFXWDcQ_q!KW*`>gff7^wZC2{i>c?JavTIKj-3i%p&Ck_8IS_CW!}VZJ zIN7+|c7Yq7 zuI5?~=0*GVZ^<_t#%6ONAYFkHmQICf@6=$}O48Gt`KoPTJ( zUmWcyjg1d<)owJk^wt~8M9+;csuVq~!b1#?JrQ-pV4grqPw8Wtu8fCa%dzOBUjhlK zB?*eO6eOhz4>3FwxrU+=l0KwWSPEQe6|iN(52ko9t%AwajKH|&hl?25AVxNbkqu&G cgBaQP4R07*qoM6N<$f;IpH(f|Me diff --git a/table_ocr/ocr_image.py b/table_ocr/ocr_image.py index b15a28c..e2c886b 100644 --- a/table_ocr/ocr_image.py +++ b/table_ocr/ocr_image.py @@ -1,30 +1,69 @@ +import os import sys import cv2 import pytesseract def crop_to_text(image): - kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4)) - opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel) + MAX_COLOR_VAL = 255 + BLOCK_SIZE = 15 + SUBTRACT_FROM_MEAN = -2 + + img_bin = cv2.adaptiveThreshold( + ~image, + MAX_COLOR_VAL, + cv2.ADAPTIVE_THRESH_MEAN_C, + cv2.THRESH_BINARY, + BLOCK_SIZE, + SUBTRACT_FROM_MEAN, + ) + + # Get rid of littl noise. + kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) + opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel) + + # Dilate so each digit is connected, so we can get a bounding rectangle + # around all of the digits as one contour. This will make the bounding + # rectangle 8 pixels wider on the left and right, so we'll need to crop that + # out at the end so that we don't pick up stray border pixels. + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1)) + dilated = cv2.dilate(opened, kernel) + + contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) - contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) bounding_rects = [cv2.boundingRect(c) for c in contours] - # The largest contour is certainly the text that we're looking for. - largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) - x, y, w, h = largest_rect - cropped = image[y:y+h, x:x+w] + + if bounding_rects: + # The largest contour is certainly the text that we're looking for. + largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3]) + x, y, w, h = largest_rect + # Commas sometimes go a little below the bounding box and we don't want + # to lost them or turn them into periods. + img_h, img_w = image.shape + cropped = image[y:min(img_h, y+h+6), x+8:x+w-8] + else: + cropped = image bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255) return bordered def ocr_image(image, config): - cropped = crop_to_text(image) return pytesseract.image_to_string( - ~cropped, + image, config=config ) def main(f): + directory, filename = os.path.split(f) + filename_sans_ext, ext = os.path.splitext(filename) image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) - print(ocr_image(image, "--psm 7")) + cropped = crop_to_text(image) + ocr_data_dir = os.path.join(directory, "ocr_data") + os.makedirs(ocr_data_dir, exist_ok=True) + out_imagepath = os.path.join(ocr_data_dir, filename) + out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) + cv2.imwrite(out_imagepath, cropped) + txt = ocr_image(cropped, "--psm 7") + with open(out_txtpath, "w") as txt_file: + txt_file.write(txt) if __name__ == "__main__": main(sys.argv[1]) diff --git a/table_ocr/ocr_to_csv.py b/table_ocr/ocr_to_csv.py new file mode 100644 index 0000000..98eff38 --- /dev/null +++ b/table_ocr/ocr_to_csv.py @@ -0,0 +1,29 @@ +import argparse +import csv +import io +import os +import sys +import tempfile + +parser = argparse.ArgumentParser() +parser.add_argument("files", nargs="+") + +def main(files): + rows = [] + for f in files: + directory, filename = os.path.split(f) + with open(f) as of: + txt = of.read() + row, column = map(int, filename.split(".")[0].split("-")) + if row == len(rows): + rows.append([]) + rows[row].append(txt) + + csv_file = io.StringIO() + writer = csv.writer(csv_file) + writer.writerows(rows) + print(csv_file.getvalue()) + +if __name__ == "__main__": + args = parser.parse_args() + main(args.files)