diff --git a/.gitignore b/.gitignore index 0cbdb7d..6dbad59 100644 --- a/.gitignore +++ b/.gitignore @@ -7,5 +7,4 @@ tmp/ *.egg build htmlcov -dist *.egg-info diff --git a/dist/table_ocr-0.0.1-py3-none-any.whl b/dist/table_ocr-0.0.1-py3-none-any.whl new file mode 100644 index 0000000..267920e Binary files /dev/null and b/dist/table_ocr-0.0.1-py3-none-any.whl differ diff --git a/dist/table_ocr-0.1.0-py3-none-any.whl b/dist/table_ocr-0.1.0-py3-none-any.whl new file mode 100644 index 0000000..d605ff7 Binary files /dev/null and b/dist/table_ocr-0.1.0-py3-none-any.whl differ diff --git a/dist/table_ocr-0.1.1-py3-none-any.whl b/dist/table_ocr-0.1.1-py3-none-any.whl new file mode 100644 index 0000000..fc3eefc Binary files /dev/null and b/dist/table_ocr-0.1.1-py3-none-any.whl differ diff --git a/dist/table_ocr-0.2.0-py3-none-any.whl b/dist/table_ocr-0.2.0-py3-none-any.whl new file mode 100644 index 0000000..7e24082 Binary files /dev/null and b/dist/table_ocr-0.2.0-py3-none-any.whl differ diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index 40fe5e4..3d1587b 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -317,6 +317,8 @@ We'll start with an image shown at the end of the previous section. ** Training Tesseract +Tesseract is used for recognizing characters. It is not involved in extracting the tables from an image or in extracting cells from the table. + It's a very good idea to train tesseract. Accuracy will improve tremendously. Clone the tesstrain repo at [[https://github.com/tesseract-ocr/tesstrain]]. @@ -370,7 +372,7 @@ the caption from within ~feh~. ~feh~ expects the captions to be named ~.txt~, so use a little shell-fu to do a quick rename. -#+BEGIN_SRC shell +#+BEGIN_SRC shell :eval no for f in *.txt; do f1=$(cut -d"." -f1 <(echo $f)); mv $f ${f1}.png.txt; done #+END_SRC @@ -383,7 +385,7 @@ next/previons images. Press ~q~ to quit. 
When finished, rename the files back to the filename structure that Tesseract looks for in training. -#+BEGIN_SRC shell +#+BEGIN_SRC shell :eval no for f in *.txt; do f1=$(cut -d"." -f1 <(echo $f)); mv $f ${f1}.gt.txt; done #+END_SRC @@ -746,10 +748,21 @@ def ocr_image(image, config): ) #+END_SRC +The second argument passed to ~ocr_image~ is a string of the command line arguments passed directly to ~tesseract~. You can view the available options at [[https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#options]] + +If no options are passed to ~tesseract~, the language defaults to English. This means ~tesseract~ needs to be able to find a file named ~eng.traineddata~ on whatever path it searches for languages. + +This Python package comes with ~eng.traineddata~ and ~table-ocr.traineddata~. ~table-ocr.traineddata~ is a personal model that I've found to be more accurate for my use case. You should train your own to maximize accuracy. + +When you ~pip install~ this package, the traineddata files are copied to a ~tessdata~ folder in the same directory in which ~pip~ installs the package. + +When no arguments are given, the ~ocr_image~ package in this repo defaults to pointing the ~--tessdata-dir~ option at the ~tessdata~ directory in the package install location and setting the ~-l~ option to the ~table-ocr~ language. + #+BEGIN_SRC python :noweb no-export :exports both import pytesseract import cv2 import numpy as np +import math image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE) <> <> @@ -776,28 +789,28 @@ import setuptools long_description = """ Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV. -Requires binaries for tesseract and pdfimages (from Poppler). +Requires binaries for tesseract, ImageMagick, and pdfimages (from Poppler). 
""" setuptools.setup( name="table_ocr", - version="0.0.1", + version="0.2.0", author="Eric Ihli", author_email="eihli@owoga.com", - description="Turn images of tables into CSV data.", + description="Extract text from tables in images.", long_description=long_description, long_description_content_type="text/plain", url="https://github.com/eihli/image-table-ocr", packages=setuptools.find_packages(), + package_data={ + "table_ocr": ["tessdata/table-ocr.traineddata", "tessdata/eng.traineddata"] + }, classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - install_requires=[ - "pytesseract~=0.3", - "opencv-python~=4.2", - ], - python_requires='>=3.6', + install_requires=["pytesseract~=0.3", "opencv-python~=4.2",], + python_requires=">=3.6", ) #+END_SRC @@ -1019,6 +1032,7 @@ print("\n".join(paths)) #+BEGIN_SRC python :tangle table_ocr/ocr_image/__init__.py import math import os +import sys import cv2 import numpy as np @@ -1042,6 +1056,10 @@ def main(image_file, tess_args): out_imagepath = os.path.join(ocr_data_dir, filename) out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) cv2.imwrite(out_imagepath, cropped) + if not tess_args: + d = os.path.dirname(sys.modules["table_ocr"].__file__) + tessdata_dir = os.path.join(d, "tessdata") + tess_args = ["--psm", "7", "-l", "table-ocr", "--tessdata-dir", tessdata_dir] txt = ocr_image(cropped, " ".join(tess_args)) with open(out_txtpath, "w") as txt_file: txt_file.write(txt) diff --git a/setup.py b/setup.py index 59181d1..961cd26 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ Requires binaries for tesseract, ImageMagick, and pdfimages (from Poppler). 
""" setuptools.setup( name="table_ocr", - version="0.1", + version="0.2.0", author="Eric Ihli", author_email="eihli@owoga.com", description="Extract text from tables in images.", @@ -15,15 +15,14 @@ setuptools.setup( long_description_content_type="text/plain", url="https://github.com/eihli/image-table-ocr", packages=setuptools.find_packages(), + package_data={ + "table_ocr": ["tessdata/table-ocr.traineddata", "tessdata/eng.traineddata"] + }, classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - install_requires=[ - "pytesseract~=0.3", - "opencv-python~=4.2", - "numpy~=1.18.1", - ], - python_requires='>=3.6', + install_requires=["pytesseract~=0.3", "opencv-python~=4.2",], + python_requires=">=3.6", ) diff --git a/table_ocr/ocr_image/__init__.py b/table_ocr/ocr_image/__init__.py index e952522..c185f60 100644 --- a/table_ocr/ocr_image/__init__.py +++ b/table_ocr/ocr_image/__init__.py @@ -1,5 +1,6 @@ import math import os +import sys import cv2 import numpy as np @@ -23,6 +24,10 @@ def main(image_file, tess_args): out_imagepath = os.path.join(ocr_data_dir, filename) out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext)) cv2.imwrite(out_imagepath, cropped) + if not tess_args: + d = os.path.dirname(sys.modules["table_ocr"].__file__) + tessdata_dir = os.path.join(d, "tessdata") + tess_args = ["--psm", "7", "-l", "table-ocr", "--tessdata-dir", tessdata_dir] txt = ocr_image(cropped, " ".join(tess_args)) with open(out_txtpath, "w") as txt_file: txt_file.write(txt) diff --git a/table_ocr/tessdata/eng.traineddata b/table_ocr/tessdata/eng.traineddata new file mode 100644 index 0000000..f4744c2 Binary files /dev/null and b/table_ocr/tessdata/eng.traineddata differ diff --git a/table_ocr/tessdata/table-ocr.traineddata b/table_ocr/tessdata/table-ocr.traineddata new file mode 100644 index 0000000..69e98b3 Binary files /dev/null and b/table_ocr/tessdata/table-ocr.traineddata 
differ