diff --git a/README.org b/README.org
index 0ebdcb6..65484b1 100644
--- a/README.org
+++ b/README.org
@@ -49,7 +49,7 @@ PDF=$1
 python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt
 cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {}  | grep table > /tmp/extracted-tables.txt
 cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt
-cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l table-ocr
+cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {}
 
 for image in $(cat /tmp/extracted-tables.txt); do
     dir=$(dirname $image)
@@ -57,6 +57,7 @@ for image in $(cat /tmp/extracted-tables.txt); do
 done
 #+END_SRC
 
+
 The package was written in a [[https://en.wikipedia.org/wiki/Literate_programming][literate programming]] style. The source code at
 [[https://eihli.github.io/image-table-ocr/pdf_table_extraction_and_ocr.html]] is
 meant to act as the documentation and reference material.
diff --git a/ocr_tables b/ocr_tables
index 25e936a..5f2e413 100755
--- a/ocr_tables
+++ b/ocr_tables
@@ -5,8 +5,7 @@ PDF=$1
 python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt
 cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {}  | grep table > /tmp/extracted-tables.txt
 cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt
-cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l table-ocr
-
+cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {}
 for image in $(cat /tmp/extracted-tables.txt); do
     dir=$(dirname $image)
     python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt")
diff --git a/pdf_table_extraction_and_ocr.html b/pdf_table_extraction_and_ocr.html
index be38955..5126f4d 100644
--- a/pdf_table_extraction_and_ocr.html
+++ b/pdf_table_extraction_and_ocr.html
@@ -3,7 +3,7 @@
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
 <head>
-<!-- 2020-04-25 Sat 12:20 -->
+<!-- 2020-10-14 Wed 21:28 -->
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
 <meta name="viewport" content="width=device-width, initial-scale=1" />
 <title>PDF Parsing</title>
@@ -225,94 +225,98 @@
 <h2>Table of Contents</h2>
 <div id="text-table-of-contents">
 <ul>
-<li><a href="#org1579202">1. Overview</a>
+<li><a href="#org3fab902">1. Overview</a>
 <ul>
-<li><a href="#org39ea9a7">1.1. Requirements</a>
+<li><a href="#orgaf477b8">1.1. Requirements</a>
 <ul>
-<li><a href="#orgd07d656">1.1.1. Python packages</a></li>
-<li><a href="#org6d44c18">1.1.2. External</a></li>
+<li><a href="#org43dd3dc">1.1.1. Python packages</a></li>
+<li><a href="#org8927075">1.1.2. External</a></li>
 </ul>
 </li>
-<li><a href="#orgdd8ad82">1.2. Contributing</a></li>
-<li><a href="#org3ae1ded">1.3. Example usage</a></li>
-<li><a href="#org43c2c09">1.4. Possible improvements</a></li>
+<li><a href="#org14c36da">1.2. Contributing</a></li>
+<li><a href="#org8aef2ca">1.3. Example usage</a></li>
+<li><a href="#org7e5cd11">1.4. Possible improvements</a></li>
 </ul>
 </li>
-<li><a href="#org6412872">2. Preparing data</a>
+<li><a href="#org51af43b">2. Preparing data</a>
 <ul>
-<li><a href="#orgdc51282">2.1. Converting PDFs to images</a></li>
-<li><a href="#org938b07f">2.2. Detecting image orientation and applying rotation.</a></li>
+<li><a href="#orga4dde96">2.1. Converting PDFs to images</a></li>
+<li><a href="#org6c75ffa">2.2. Detecting image orientation and applying rotation.</a></li>
 </ul>
 </li>
-<li><a href="#orga68497c">3. Detecting tables</a>
+<li><a href="#orgc195fec">3. Detecting tables</a>
 <ul>
-<li><a href="#org019e27e">3.1. Improving accuracy</a></li>
+<li><a href="#org9dd75a6">3.1. Improving accuracy</a></li>
 </ul>
 </li>
-<li><a href="#orga12c6cf">4. OCR tables</a>
+<li><a href="#org904debc">4. OCR tables</a>
 <ul>
-<li><a href="#orgc895682">4.1. Training Tesseract</a></li>
-<li><a href="#org1ce3ded">4.2. Blur</a></li>
-<li><a href="#org5ebc5e9">4.3. Threshold</a></li>
-<li><a href="#orga8b32e0">4.4. Finding the vertical and horizontal lines of the table</a></li>
-<li><a href="#org1a881f5">4.5. Finding the contours</a></li>
-<li><a href="#org094eedd">4.6. Sorting the bounding rectangles</a></li>
-<li><a href="#org87d2780">4.7. Cropping each cell to the text</a></li>
-<li><a href="#org8de0931">4.8. OCR each cell</a></li>
+<li><a href="#orgb03a965">4.1. Training Tesseract</a>
+<ul>
+<li><a href="#org5adeb27">4.1.1. Training tips</a></li>
+</ul>
+</li>
+<li><a href="#org152ead5">4.2. Blur</a></li>
+<li><a href="#org858fb89">4.3. Threshold</a></li>
+<li><a href="#orgcf17042">4.4. Finding the vertical and horizontal lines of the table</a></li>
+<li><a href="#org94f71b3">4.5. Finding the contours</a></li>
+<li><a href="#orgc64b6ef">4.6. Sorting the bounding rectangles</a></li>
+<li><a href="#orgd4dc4cc">4.7. Cropping each cell to the text</a></li>
+<li><a href="#org22a3e7b">4.8. OCR each cell</a></li>
 </ul>
 </li>
-<li><a href="#orgf6e8639">5. Files</a>
+<li><a href="#org9d0b21d">5. Files</a>
 <ul>
-<li><a href="#orgba6f775">5.1. setup.py</a></li>
-<li><a href="#org6a407ab">5.2. table_ocr</a>
+<li><a href="#orgd57e56a">5.1. setup.py</a></li>
+<li><a href="#org4b36161">5.2. table_ocr</a>
 <ul>
-<li><a href="#org465ea6b">5.2.1. table_ocr/__init__.py</a></li>
-<li><a href="#org7a49723">5.2.2. table_ocr/util.py</a></li>
-<li><a href="#orgd6e9341">5.2.3. table_ocr/pdf_to_images/</a>
+<li><a href="#orgbdd2fc0">5.2.1. table_ocr/__init__.py</a></li>
+<li><a href="#org09e5a07">5.2.2. table_ocr/util.py</a></li>
+<li><a href="#org5a371bd">5.2.3. table_ocr/pdf_to_images/</a>
 <ul>
-<li><a href="#org8a31fd7">5.2.3.1. table_ocr/pdf_to_images/__init__.py</a></li>
-<li><a href="#org1e4f0c6">5.2.3.2. table_ocr/pdf_to_images/__main__.py</a></li>
+<li><a href="#orgd777fae">5.2.3.1. table_ocr/pdf_to_images/__init__.py</a></li>
+<li><a href="#org0064754">5.2.3.2. table_ocr/pdf_to_images/__main__.py</a></li>
 </ul>
 </li>
-<li><a href="#orgb1142c2">5.2.4. table_ocr/extract_tables/</a>
+<li><a href="#org03e58e9">5.2.4. table_ocr/extract_tables/</a>
 <ul>
-<li><a href="#org4d226b5">5.2.4.1. table_ocr/extract_tables/__init__.py</a></li>
-<li><a href="#org28b104f">5.2.4.2. table_ocr/extract_tables/__main__.py</a></li>
+<li><a href="#orgfedc867">5.2.4.1. table_ocr/extract_tables/__init__.py</a></li>
+<li><a href="#org82b2c3a">5.2.4.2. table_ocr/extract_tables/__main__.py</a></li>
 </ul>
 </li>
-<li><a href="#org0d21747">5.2.5. table_ocr/extract_cells/</a>
+<li><a href="#org7ec79e9">5.2.5. table_ocr/extract_cells/</a>
 <ul>
-<li><a href="#org8556f38">5.2.5.1. table_ocr/extract_cells/__init__.py</a></li>
-<li><a href="#orgb988224">5.2.5.2. table_ocr/extract_cells/__main__.py</a></li>
+<li><a href="#org6d6ddc7">5.2.5.1. table_ocr/extract_cells/__init__.py</a></li>
+<li><a href="#orgd698866">5.2.5.2. table_ocr/extract_cells/__main__.py</a></li>
 </ul>
 </li>
-<li><a href="#org04c3633">5.2.6. table_ocr/ocr_image/</a>
+<li><a href="#org5ff2e40">5.2.6. table_ocr/ocr_image/</a>
 <ul>
-<li><a href="#orgd01e3e7">5.2.6.1. table_ocr/ocr_image/__init__.py</a></li>
-<li><a href="#org2a58364">5.2.6.2. table_ocr/ocr_image/__main__.py</a></li>
+<li><a href="#org1bc0eb3">5.2.6.1. table_ocr/ocr_image/__init__.py</a></li>
+<li><a href="#org11f1d0c">5.2.6.2. table_ocr/ocr_image/__main__.py</a></li>
 </ul>
 </li>
-<li><a href="#org2c74337">5.2.7. table_ocr/ocr_to_csv/</a>
+<li><a href="#org7612c04">5.2.7. table_ocr/ocr_to_csv/</a>
 <ul>
-<li><a href="#orgf6b3289">5.2.7.1. table_ocr/ocr_to_csv/__init__.py</a></li>
-<li><a href="#org4befe04">5.2.7.2. table_ocr/ocr_to_csv/__main__.py</a></li>
+<li><a href="#orgb76e923">5.2.7.1. table_ocr/ocr_to_csv/__init__.py</a></li>
+<li><a href="#orgb9ce258">5.2.7.2. table_ocr/ocr_to_csv/__main__.py</a></li>
 </ul>
 </li>
 </ul>
 </li>
 </ul>
 </li>
-<li><a href="#org487dcc8">6. Utils</a>
+<li><a href="#org446b9ad">6. Utils</a>
 <ul>
-<li><a href="#org457567f">6.1. Logging</a></li>
+<li><a href="#orgac512bd">6.1. Logging</a></li>
 </ul>
 </li>
 </ul>
 </div>
 </div>
 
-<div id="outline-container-org1579202" class="outline-2">
-<h2 id="org1579202"><span class="section-number-2">1</span> Overview</h2>
+<div id="outline-container-org3fab902" class="outline-2">
+<h2 id="org3fab902"><span class="section-number-2">1</span> Overview</h2>
 <div class="outline-text-2" id="text-1">
 <p>
 This Python package provides utilities for extracting tabular data from PDF
@@ -359,17 +363,17 @@ The package is split into modules with narrow focuses.
 <li><code>pdf_to_images</code> uses Poppler and ImageMagick to extract images from a PDF.</li>
 <li><code>extract_tables</code> finds and extracts table-looking things from an image.</li>
 <li><code>extract_cells</code> extracts and orders cells from a table.</li>
-<li><code>ocr_image</code> uses Tesseract to turn a OCR the text from an image of a cell.</li>
+<li><code>ocr_image</code> uses Tesseract to OCR the text from an image of a cell.</li>
 <li><code>ocr_to_csv</code> converts into a CSV the directory structure that <code>ocr_image</code> outputs.</li>
 </ul>
 </div>
 
-<div id="outline-container-org39ea9a7" class="outline-3">
-<h3 id="org39ea9a7"><span class="section-number-3">1.1</span> Requirements</h3>
+<div id="outline-container-orgaf477b8" class="outline-3">
+<h3 id="orgaf477b8"><span class="section-number-3">1.1</span> Requirements</h3>
 <div class="outline-text-3" id="text-1-1">
 </div>
-<div id="outline-container-orgd07d656" class="outline-4">
-<h4 id="orgd07d656"><span class="section-number-4">1.1.1</span> Python packages</h4>
+<div id="outline-container-org43dd3dc" class="outline-4">
+<h4 id="org43dd3dc"><span class="section-number-4">1.1.1</span> Python packages</h4>
 <div class="outline-text-4" id="text-1-1-1">
 <ul class="org-ul">
 <li>numpy</li>
@@ -379,8 +383,8 @@ The package is split into modules with narrow focuses.
 </div>
 </div>
 
-<div id="outline-container-org6d44c18" class="outline-4">
-<h4 id="org6d44c18"><span class="section-number-4">1.1.2</span> External</h4>
+<div id="outline-container-org8927075" class="outline-4">
+<h4 id="org8927075"><span class="section-number-4">1.1.2</span> External</h4>
 <div class="outline-text-4" id="text-1-1-2">
 <ul class="org-ul">
 <li><code>pdfimages</code> from Poppler</li>
@@ -391,8 +395,8 @@ The package is split into modules with narrow focuses.
 </div>
 </div>
 
-<div id="outline-container-orgdd8ad82" class="outline-3">
-<h3 id="orgdd8ad82"><span class="section-number-3">1.2</span> Contributing</h3>
+<div id="outline-container-org14c36da" class="outline-3">
+<h3 id="org14c36da"><span class="section-number-3">1.2</span> Contributing</h3>
 <div class="outline-text-3" id="text-1-2">
 <p>
 This package was created in a <a href="https://en.wikipedia.org/wiki/Literate_programming">literate programming</a> style with the help of <a href="https://orgmode.org/worg/org-contrib/babel/intro.html">Babel</a>.
@@ -405,8 +409,8 @@ barrier for contributors who aren&rsquo;t already familiar with Emacs and Babel.
 </div>
 </div>
 
-<div id="outline-container-org3ae1ded" class="outline-3">
-<h3 id="org3ae1ded"><span class="section-number-3">1.3</span> Example usage</h3>
+<div id="outline-container-org8aef2ca" class="outline-3">
+<h3 id="org8aef2ca"><span class="section-number-3">1.3</span> Example usage</h3>
 <div class="outline-text-3" id="text-1-3">
 <p>
 Here is an example of a shell script that uses each module to turn a pdf with a
@@ -424,26 +428,29 @@ you need into your own python projects and use them as needed.
 </p>
 
 <div class="org-src-container">
-<pre class="src src-shell" id="orgd0301ec"><span style="color: #5B6268;">#</span><span style="color: #5B6268;">!/bin/</span><span style="color: #51afef;">sh</span>
+<pre class="src src-shell" id="org5f5c842"><span style="color: #5B6268;">#</span><span style="color: #5B6268;">!/bin/</span><span style="color: #51afef;">sh</span>
 
 <span style="color: #dcaeea;">PDF</span>=$<span style="color: #da8548; font-weight: bold;">1</span>
 
 python -m table_ocr.pdf_to_images $<span style="color: #dcaeea;">PDF</span> | <span style="color: #ECBE7B;">grep</span> .png &gt; /tmp/pdf-images.txt
 <span style="color: #ECBE7B;">cat</span> /tmp/pdf-images.txt | xargs -I<span style="color: #51afef;">{}</span> python -m table_ocr.extract_tables <span style="color: #51afef;">{}</span>  | <span style="color: #ECBE7B;">grep</span> table &gt; /tmp/extracted-tables.txt
 <span style="color: #ECBE7B;">cat</span> /tmp/extracted-tables.txt | xargs -I<span style="color: #51afef;">{}</span> python -m table_ocr.extract_cells <span style="color: #51afef;">{}</span> | <span style="color: #ECBE7B;">grep</span> cells &gt; /tmp/extracted-cells.txt
-<span style="color: #ECBE7B;">cat</span> /tmp/extracted-cells.txt | xargs -I<span style="color: #51afef;">{}</span> python -m table_ocr.ocr_image <span style="color: #51afef;">{}</span> --psm <span style="color: #da8548; font-weight: bold;">7</span> -l table-ocr
-
+<span style="color: #ECBE7B;">cat</span> /tmp/extracted-cells.txt | xargs -I<span style="color: #51afef;">{}</span> python -m table_ocr.ocr_image <span style="color: #51afef;">{}</span>
 <span style="color: #51afef;">for</span> image<span style="color: #51afef;"> in</span> $<span style="color: #51afef;">(</span><span style="color: #ECBE7B;">cat</span> /tmp/extracted-tables.txt<span style="color: #51afef;">)</span>; <span style="color: #51afef;">do</span>
     <span style="color: #dcaeea;">dir</span>=$<span style="color: #51afef;">(</span>dirname $<span style="color: #dcaeea;">image</span><span style="color: #51afef;">)</span>
     python -m table_ocr.ocr_to_csv $<span style="color: #51afef;">(</span><span style="color: #ECBE7B;">find</span> $<span style="color: #dcaeea;">dir</span>/cells -name <span style="color: #98be65;">"*.txt"</span><span style="color: #51afef;">)</span>
 <span style="color: #51afef;">done</span>
 </pre>
 </div>
+
+<p>
+Any extra args you pass after the image path to <code>python -m table_ocr.ocr_image</code> will be passed directly to tesseract as options. If you don&rsquo;t pass anything, reasonable English defaults are used.
+</p>
 </div>
 </div>
 
-<div id="outline-container-org43c2c09" class="outline-3">
-<h3 id="org43c2c09"><span class="section-number-3">1.4</span> Possible improvements</h3>
+<div id="outline-container-org7e5cd11" class="outline-3">
+<h3 id="org7e5cd11"><span class="section-number-3">1.4</span> Possible improvements</h3>
 <div class="outline-text-3" id="text-1-4">
 <p>
 Detect text with the stroke-width-transform alogoritm. <a href="https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html">https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html</a>
@@ -452,8 +459,8 @@ Detect text with the stroke-width-transform alogoritm. <a href="https://zablo.ne
 </div>
 </div>
 
-<div id="outline-container-org6412872" class="outline-2">
-<h2 id="org6412872"><span class="section-number-2">2</span> Preparing data</h2>
+<div id="outline-container-org51af43b" class="outline-2">
+<h2 id="org51af43b"><span class="section-number-2">2</span> Preparing data</h2>
 <div class="outline-text-2" id="text-2">
 <p>
 Not all pdfs need to be sent through OCR to extract the text content. If you can
@@ -462,27 +469,27 @@ probably aren&rsquo;t necessary.
 </p>
 </div>
 
-<div id="outline-container-orgdc51282" class="outline-3">
-<h3 id="orgdc51282"><span class="section-number-3">2.1</span> Converting PDFs to images</h3>
+<div id="outline-container-orga4dde96" class="outline-3">
+<h3 id="orga4dde96"><span class="section-number-3">2.1</span> Converting PDFs to images</h3>
 <div class="outline-text-3" id="text-2-1">
 <p>
 This code calls out to <a href="https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html">pdfimages</a> from <a href="https://poppler.freedesktop.org/">Poppler</a>.
 </p>
 
 <div class="org-src-container">
-<pre class="src src-python" id="org4cf64a2"><span style="color: #5B6268;"># </span><span style="color: #5B6268;">Wrapper around the Poppler command line utility "pdfimages" and helpers for</span>
+<pre class="src src-python" id="orgdb8901b"><span style="color: #5B6268;"># </span><span style="color: #5B6268;">Wrapper around the Poppler command line utility "pdfimages" and helpers for</span>
 <span style="color: #5B6268;"># </span><span style="color: #5B6268;">finding the output files of that command.</span>
 <span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdf_to_images</span>(pdf_filepath):
 <span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Turn a pdf into images</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Returns the filenames of the created images sorted lexicographically.</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">with</span> working_dir(directory):
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image_filenames</span> = pdfimages(pdf_filepath)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image_filenames</span> = pdfimages(pdf_filepath)
 
 <span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Since pdfimages creates a number of files named each for there page number</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">and doesn't return us the list that it created</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> [os.path.join(directory, f) <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> image_filenames]
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> <span style="color: #c678dd;">sorted</span>([os.path.join(directory, f) <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> image_filenames])
 
 
 <span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdfimages</span>(pdf_filepath):
@@ -495,8 +502,14 @@ This code calls out to <a href="https://manpages.debian.org/testing/poppler-util
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   uses 3 digits in its regex.</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> os.path.isabs(directory):
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span> = os.path.abspath(directory)
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">filename_sans_ext</span> = filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]
-<span style="background-color: #282c34;"> </span>   subprocess.run([<span style="color: #98be65;">"pdfimages"</span>, <span style="color: #98be65;">"-png"</span>, pdf_filepath, filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]])
+
+<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">pdfimages outputs results to the current working directory</span>
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">with</span> working_dir(directory):
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   subprocess.run([<span style="color: #98be65;">"pdfimages"</span>, <span style="color: #98be65;">"-png"</span>, filename, filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]])
+
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image_filenames</span> = find_matching_files_in_dir(filename_sans_ext, directory)
 <span style="background-color: #282c34;"> </span>   logger.debug(
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"Converted {} into files:\n{}"</span>.<span style="color: #c678dd;">format</span>(pdf_filepath, <span style="color: #98be65;">"\n"</span>.join(image_filenames))
@@ -516,8 +529,8 @@ This code calls out to <a href="https://manpages.debian.org/testing/poppler-util
 </div>
 </div>
 
-<div id="outline-container-org938b07f" class="outline-3">
-<h3 id="org938b07f"><span class="section-number-3">2.2</span> Detecting image orientation and applying rotation.</h3>
+<div id="outline-container-org6c75ffa" class="outline-3">
+<h3 id="org6c75ffa"><span class="section-number-3">2.2</span> Detecting image orientation and applying rotation.</h3>
 <div class="outline-text-3" id="text-2-2">
 <p>
 Tesseract can detect orientation and we can then use <a href="https://www.imagemagick.org/script/mogrify.php">ImageMagick&rsquo;s mogrify</a> to
@@ -546,19 +559,29 @@ to correct the rotation. This makes OCR more straightforward.
 </p>
 
 <div class="org-src-container">
-<pre class="src src-python" id="org4c77451"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">preprocess_img</span>(filepath):
-<span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""</span>
-<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Processing that involves running shell executables,</span>
+<pre class="src src-python" id="org44f8315"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">preprocess_img</span>(filepath, tess_params=<span style="color: #a9a1e1;">None</span>):
+<span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""Processing that involves running shell executables,</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   like mogrify to rotate.</span>
+
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Uses tesseract to detect rotation.</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">  </span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Orientation and script detection is only available for legacy tesseract</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   (--oem 0). Some versions of tesseract will segfault if you let it run OSD</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   with the default oem (3).</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rotate</span> = get_rotate(filepath)
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> tess_params <span style="color: #51afef;">is</span> <span style="color: #a9a1e1;">None</span>:
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tess_params</span> = [<span style="color: #98be65;">"--psm"</span>, <span style="color: #98be65;">"0"</span>, <span style="color: #98be65;">"--oem"</span>, <span style="color: #98be65;">"0"</span>]
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rotate</span> = get_rotate(filepath, tess_params)
 <span style="background-color: #282c34;"> </span>   logger.debug(<span style="color: #98be65;">"Rotating {} by {}."</span>.<span style="color: #c678dd;">format</span>(filepath, rotate))
 <span style="background-color: #282c34;"> </span>   mogrify(filepath, rotate)
 
 
-<span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_rotate</span>(image_filepath):
+<span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_rotate</span>(image_filepath, tess_params):
+<span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tess_command</span> = [<span style="color: #98be65;">"tesseract"</span>] + tess_params + [image_filepath, <span style="color: #98be65;">"-"</span>]
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">output</span> = (
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   subprocess.check_output([<span style="color: #98be65;">"tesseract"</span>, <span style="color: #98be65;">"--psm"</span>, <span style="color: #98be65;">"0"</span>, image_filepath, <span style="color: #98be65;">"-"</span>])
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   subprocess.check_output(tess_command)
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   .decode(<span style="color: #98be65;">"utf-8"</span>)
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   .split(<span style="color: #98be65;">"\n"</span>)
 <span style="background-color: #282c34;"> </span>   )
@@ -575,8 +598,8 @@ to correct the rotation. This makes OCR more straightforward.
 </div>
 </div>
 
-<div id="outline-container-orga68497c" class="outline-2">
-<h2 id="orga68497c"><span class="section-number-2">3</span> Detecting tables</h2>
+<div id="outline-container-orgc195fec" class="outline-2">
+<h2 id="orgc195fec"><span class="section-number-2">3</span> Detecting tables</h2>
 <div class="outline-text-2" id="text-3">
 <p>
 This answer from opencv.org was heavily referenced while writing the code around
@@ -597,7 +620,7 @@ that makes things like shape detection more accurate.
 </p>
 
 <div class="org-src-container">
-<pre class="src src-python" id="org55a4fc5"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">find_tables</span>(image):
+<pre class="src src-python" id="orgd821c1d"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">find_tables</span>(image):
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">STD_DEV_X_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">STD_DEV_Y_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
@@ -680,8 +703,8 @@ cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table.png"
 </div>
 </div>
 
-<div id="outline-container-org019e27e" class="outline-3">
-<h3 id="org019e27e"><span class="section-number-3">3.1</span> Improving accuracy</h3>
+<div id="outline-container-org9dd75a6" class="outline-3">
+<h3 id="org9dd75a6"><span class="section-number-3">3.1</span> Improving accuracy</h3>
 <div class="outline-text-3" id="text-3-1">
 <p>
 It&rsquo;s likely that some images will contain tables that aren&rsquo;t accurately
@@ -702,8 +725,8 @@ the x, y, width and height are anywhere in that range.
 </div>
 </div>
 
-<div id="outline-container-orga12c6cf" class="outline-2">
-<h2 id="orga12c6cf"><span class="section-number-2">4</span> OCR tables</h2>
+<div id="outline-container-org904debc" class="outline-2">
+<h2 id="org904debc"><span class="section-number-2">4</span> OCR tables</h2>
 <div class="outline-text-2" id="text-4">
 <p>
 Tesseract does not perform well when run on images of tables. It performs best
@@ -720,9 +743,13 @@ We&rsquo;ll start with an image shown at the end of the previous section.
 </p>
 </div>
 
-<div id="outline-container-orgc895682" class="outline-3">
-<h3 id="orgc895682"><span class="section-number-3">4.1</span> Training Tesseract</h3>
+<div id="outline-container-orgb03a965" class="outline-3">
+<h3 id="orgb03a965"><span class="section-number-3">4.1</span> Training Tesseract</h3>
 <div class="outline-text-3" id="text-4-1">
+<p>
+Tesseract is used for recognizing characters. It is not involved in extracting the tables from an image or in extracting cells from the table.
+</p>
+
 <p>
 It&rsquo;s a very good idea to train tesseract. Accuracy will improve tremendously.
 </p>
@@ -732,7 +759,7 @@ Clone the tesstrain repo at <a href="https://github.com/tesseract-ocr/tesstrain"
 </p>
 
 <p>
-Run the <a href="#orgd0301ec"><code>ocr_tables</code></a> script on a few pdfs to generate some training data. That
+Run the <a href="#org5f5c842"><code>ocr_tables</code></a> script on a few pdfs to generate some training data. That
 script outputs pairs of <code>.png</code> and <code>.gt.txt</code> files that can be used by
 tesstrain.
 </p>
@@ -772,10 +799,70 @@ Once the training is complete, there will be a new file
 Tesseract searches for models. On my machine, it was <code>/usr/local/share/tessdata/</code>.
 </p>
 </div>
+
+<div id="outline-container-org5adeb27" class="outline-4">
+<h4 id="org5adeb27"><span class="section-number-4">4.1.1</span> Training tips</h4>
+<div class="outline-text-4" id="text-4-1-1">
+<p>
+Here is a tip for quickly creating training data.
+</p>
+
+<p>
+The output of the <code>ocr_cells</code> script will be a directory named <code>ocr_data</code> that
+will have two files for each cell. One file is the image of the cell and the
+other file is the OCR text.
+</p>
+
+<p>
+You&rsquo;ll want to compare each image to its OCR text to check for accuracy. If
+the text doesn&rsquo;t match, you&rsquo;ll want to update the text and add the image to the
+training data.
+</p>
+
+<p>
+The fastest way to do this is with <code>feh</code>.
+</p>
+
+<p>
+<code>feh</code> lets you view an image and a caption at the same time and lets you edit
+the caption from within <code>feh</code>.
+</p>
+
+<p>
+<code>feh</code> expects the captions to be named <code>&lt;image-name&gt;.txt</code>, so use a little
+shell-fu to do a quick rename.
+</p>
+
+<div class="org-src-container">
+<pre class="src src-shell"><span style="color: #51afef;">for</span> f<span style="color: #51afef;"> in</span> *.txt; <span style="color: #51afef;">do</span> <span style="color: #dcaeea;">f1</span>=$<span style="color: #51afef;">(</span>cut -d<span style="color: #98be65;">"."</span> -f1 &lt;<span style="color: #c678dd;">(</span><span style="color: #ECBE7B;">echo</span> $<span style="color: #dcaeea;">f</span><span style="color: #c678dd;">)</span><span style="color: #51afef;">)</span>; <span style="color: #ECBE7B;">mv</span> $<span style="color: #dcaeea;">f</span> $<span style="color: #51afef;">{</span><span style="color: #dcaeea;">f1</span><span style="color: #51afef;">}</span>.png.txt; <span style="color: #51afef;">done</span>
+</pre>
 </div>
 
-<div id="outline-container-org1ce3ded" class="outline-3">
-<h3 id="org1ce3ded"><span class="section-number-3">4.2</span> Blur</h3>
+<p>
+Then run <code>feh -K .</code> to specify the current directory as the caption directory.
+This will open a window with the first image in the directory and its caption.
+</p>
+
+<p>
+Press <code>c</code> to edit the caption (if needed) and <code>n</code>/<code>p</code> to move to the
+next/previous images. Press <code>q</code> to quit.
+</p>
+
+<p>
+When finished, rename the files back to the filename structure that Tesseract
+looks for in training.
+</p>
+
+<div class="org-src-container">
+<pre class="src src-shell"><span style="color: #51afef;">for</span> f<span style="color: #51afef;"> in</span> *.txt; <span style="color: #51afef;">do</span> <span style="color: #dcaeea;">f1</span>=$<span style="color: #51afef;">(</span>cut -d<span style="color: #98be65;">"."</span> -f1 &lt;<span style="color: #c678dd;">(</span><span style="color: #ECBE7B;">echo</span> $<span style="color: #dcaeea;">f</span><span style="color: #c678dd;">)</span><span style="color: #51afef;">)</span>; <span style="color: #ECBE7B;">mv</span> $<span style="color: #dcaeea;">f</span> $<span style="color: #51afef;">{</span><span style="color: #dcaeea;">f1</span><span style="color: #51afef;">}</span>.gt.txt; <span style="color: #51afef;">done</span>
+</pre>
+</div>
+</div>
+</div>
+</div>
+
+<div id="outline-container-org152ead5" class="outline-3">
+<h3 id="org152ead5"><span class="section-number-3">4.2</span> Blur</h3>
 <div class="outline-text-3" id="text-4-2">
 <p>
 Blurring helps to make noise less noisy so that the overall structure of an
@@ -815,8 +902,8 @@ cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-blur
 </div>
 </div>
 
-<div id="outline-container-org5ebc5e9" class="outline-3">
-<h3 id="org5ebc5e9"><span class="section-number-3">4.3</span> Threshold</h3>
+<div id="outline-container-org858fb89" class="outline-3">
+<h3 id="org858fb89"><span class="section-number-3">4.3</span> Threshold</h3>
 <div class="outline-text-3" id="text-4-3">
 <p>
 We&rsquo;ve got a bunch of pixels that are gray. Thresholding will turn them all
@@ -854,8 +941,8 @@ cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-thre
 </div>
 </div>
 
-<div id="outline-container-orga8b32e0" class="outline-3">
-<h3 id="orga8b32e0"><span class="section-number-3">4.4</span> Finding the vertical and horizontal lines of the table</h3>
+<div id="outline-container-orgcf17042" class="outline-3">
+<h3 id="orgcf17042"><span class="section-number-3">4.4</span> Finding the vertical and horizontal lines of the table</h3>
 <div class="outline-text-3" id="text-4-4">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #dcaeea;">vertical</span> = <span style="color: #dcaeea;">horizontal</span> = img_bin.copy()
@@ -894,8 +981,8 @@ cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-line
 </div>
 </div>
 
-<div id="outline-container-org1a881f5" class="outline-3">
-<h3 id="org1a881f5"><span class="section-number-3">4.5</span> Finding the contours</h3>
+<div id="outline-container-org94f71b3" class="outline-3">
+<h3 id="org94f71b3"><span class="section-number-3">4.5</span> Finding the contours</h3>
 <div class="outline-text-3" id="text-4-5">
 <p>
 Blurring and thresholding allow us to find the lines. Opening the lines allows
@@ -944,7 +1031,7 @@ above/below certain sizes.
 </p>
 
 <div class="org-src-container">
-<pre class="src src-python" id="org61f4909"><span style="color: #dcaeea;">contours</span>, <span style="color: #dcaeea;">heirarchy</span> = cv2.findContours(
+<pre class="src src-python" id="orgf486a5a"><span style="color: #dcaeea;">contours</span>, <span style="color: #dcaeea;">heirarchy</span> = cv2.findContours(
 <span style="background-color: #282c34;"> </span>   mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
 )
 
@@ -976,8 +1063,8 @@ above/below certain sizes.
 </div>
 </div>
 
-<div id="outline-container-org094eedd" class="outline-3">
-<h3 id="org094eedd"><span class="section-number-3">4.6</span> Sorting the bounding rectangles</h3>
+<div id="outline-container-orgc64b6ef" class="outline-3">
+<h3 id="orgc64b6ef"><span class="section-number-3">4.6</span> Sorting the bounding rectangles</h3>
 <div class="outline-text-3" id="text-4-6">
 <p>
 We want to process these from left-to-right, top-to-bottom.
@@ -996,7 +1083,7 @@ value of their center. We&rsquo;ll remove those rectangles from the list and rep
 </p>
 
 <div class="org-src-container">
-<pre class="src src-python" id="org233115e"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">cell_in_same_row</span>(c1, c2):
+<pre class="src src-python" id="org30980d9"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">cell_in_same_row</span>(c1, c2):
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">c1_center</span> = c1[<span style="color: #da8548; font-weight: bold;">1</span>] + c1[<span style="color: #da8548; font-weight: bold;">3</span>] - c1[<span style="color: #da8548; font-weight: bold;">3</span>] / <span style="color: #da8548; font-weight: bold;">2</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">c2_bottom</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>] + c2[<span style="color: #da8548; font-weight: bold;">3</span>]
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">c2_top</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>]
@@ -1070,7 +1157,7 @@ cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-cell
 </div>
 
 <div class="org-src-container">
-<pre class="src src-python" id="orga57424d"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">extract_cell_images_from_table</span>(image):
+<pre class="src src-python" id="org74e59e6"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">extract_cell_images_from_table</span>(image):
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">STD_DEV_X_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">STD_DEV_Y_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
@@ -1184,8 +1271,8 @@ cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-cell
 </div>
 </div>
 
-<div id="outline-container-org87d2780" class="outline-3">
-<h3 id="org87d2780"><span class="section-number-3">4.7</span> Cropping each cell to the text</h3>
+<div id="outline-container-orgd4dc4cc" class="outline-3">
+<h3 id="orgd4dc4cc"><span class="section-number-3">4.7</span> Cropping each cell to the text</h3>
 <div class="outline-text-3" id="text-4-7">
 <p>
 OCR with Tesseract works best when there is about 10 pixels of white border
@@ -1272,8 +1359,8 @@ cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-cell
 </div>
 </div>
 
-<div id="outline-container-org8de0931" class="outline-3">
-<h3 id="org8de0931"><span class="section-number-3">4.8</span> OCR each cell</h3>
+<div id="outline-container-org22a3e7b" class="outline-3">
+<h3 id="org22a3e7b"><span class="section-number-3">4.8</span> OCR each cell</h3>
 <div class="outline-text-3" id="text-4-8">
 <p>
 If we cleaned up the images well enough, we might get some accurate OCR!
@@ -1303,10 +1390,31 @@ period into a comma, then you might need to do some custom Tesseract training.
 </pre>
 </div>
 
+<p>
+The second argument passed to <code>ocr_image</code> is a string of the command line arguments passed directly to <code>tesseract</code>. You can view the available options at <a href="https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#options">https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#options</a>
+</p>
+
+<p>
+If no options are passed to <code>tesseract</code>, then the language defaults to English. This means <code>tesseract</code> needs to be able to find a file named <code>eng.traineddata</code> on whatever path it searches for languages.
+</p>
+
+<p>
+This Python package comes with <code>eng.traineddata</code> and <code>table-ocr.traineddata</code>. <code>table-ocr.traineddata</code> is a personal model that I&rsquo;ve found to be more accurate for my use case. You should train your own to maximize accuracy.
+</p>
+
+<p>
+When you <code>pip install</code> this package, the traineddata gets copied to a <code>tessdata</code> folder in the same directory in which <code>pip</code> installs the package.
+</p>
+
+<p>
+The <code>ocr_image</code> package in this repo defaults to pointing the <code>--tessdata-dir</code> option at the package&rsquo;s <code>tessdata</code> directory in the package install location and setting the <code>-l</code> option to the <code>table-ocr</code> language.
+</p>
+
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">import</span> pytesseract
 <span style="color: #51afef;">import</span> cv2
 <span style="color: #51afef;">import</span> numpy <span style="color: #51afef;">as</span> np
+<span style="color: #51afef;">import</span> math
 <span style="color: #dcaeea;">image</span> = cv2.imread(<span style="color: #98be65;">"resources/examples/example-table-cell-1-1.png"</span>, cv2.IMREAD_GRAYSCALE)
 &lt;&lt;crop-to-text&gt;&gt;
 &lt;&lt;ocr-image&gt;&gt;
@@ -1322,8 +1430,8 @@ ocr_image(image, <span style="color: #98be65;">"--psm 7"</span>)
 </div>
 </div>
 
-<div id="outline-container-orgf6e8639" class="outline-2">
-<h2 id="orgf6e8639"><span class="section-number-2">5</span> Files</h2>
+<div id="outline-container-org9d0b21d" class="outline-2">
+<h2 id="org9d0b21d"><span class="section-number-2">5</span> Files</h2>
 <div class="outline-text-2" id="text-5">
 <div class="org-src-container">
 <pre class="src src-python">
@@ -1331,8 +1439,8 @@ ocr_image(image, <span style="color: #98be65;">"--psm 7"</span>)
 </div>
 </div>
 
-<div id="outline-container-orgba6f775" class="outline-3">
-<h3 id="orgba6f775"><span class="section-number-3">5.1</span> setup.py</h3>
+<div id="outline-container-orgd57e56a" class="outline-3">
+<h3 id="orgd57e56a"><span class="section-number-3">5.1</span> setup.py</h3>
 <div class="outline-text-3" id="text-5-1">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">import</span> setuptools
@@ -1340,43 +1448,43 @@ ocr_image(image, <span style="color: #98be65;">"--psm 7"</span>)
 <span style="color: #dcaeea;">long_description</span> = <span style="color: #98be65;">"""</span>
 <span style="color: #98be65;">Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.</span>
 
-<span style="color: #98be65;">Requires binaries for tesseract and pdfimages (from Poppler).</span>
+<span style="color: #98be65;">Requires binaries for tesseract, ImageMagick, and pdfimages (from Poppler).</span>
 <span style="color: #98be65;">"""</span>
 setuptools.setup(
 <span style="background-color: #282c34;"> </span>   name=<span style="color: #98be65;">"table_ocr"</span>,
-<span style="background-color: #282c34;"> </span>   version=<span style="color: #98be65;">"0.0.1"</span>,
+<span style="background-color: #282c34;"> </span>   version=<span style="color: #98be65;">"0.2.0"</span>,
 <span style="background-color: #282c34;"> </span>   author=<span style="color: #98be65;">"Eric Ihli"</span>,
 <span style="background-color: #282c34;"> </span>   author_email=<span style="color: #98be65;">"eihli@owoga.com"</span>,
-<span style="background-color: #282c34;"> </span>   description=<span style="color: #98be65;">"Turn images of tables into CSV data."</span>,
+<span style="background-color: #282c34;"> </span>   description=<span style="color: #98be65;">"Extract text from tables in images."</span>,
 <span style="background-color: #282c34;"> </span>   long_description=long_description,
 <span style="background-color: #282c34;"> </span>   long_description_content_type=<span style="color: #98be65;">"text/plain"</span>,
 <span style="background-color: #282c34;"> </span>   url=<span style="color: #98be65;">"https://github.com/eihli/image-table-ocr"</span>,
 <span style="background-color: #282c34;"> </span>   packages=setuptools.find_packages(),
+<span style="background-color: #282c34;"> </span>   package_data={
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"table_ocr"</span>: [<span style="color: #98be65;">"tessdata/table-ocr.traineddata"</span>, <span style="color: #98be65;">"tessdata/eng.traineddata"</span>]
+<span style="background-color: #282c34;"> </span>   },
 <span style="background-color: #282c34;"> </span>   classifiers=[
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"Programming Language :: Python :: 3"</span>,
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"License :: OSI Approved :: MIT License"</span>,
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"Operating System :: OS Independent"</span>,
 <span style="background-color: #282c34;"> </span>   ],
-<span style="background-color: #282c34;"> </span>   install_requires=[
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"pytesseract~=0.3"</span>,
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"opencv-python~=4.2"</span>,
-<span style="background-color: #282c34;"> </span>   ],
-<span style="background-color: #282c34;"> </span>   python_requires=<span style="color: #98be65;">'&gt;=3.6'</span>,
+<span style="background-color: #282c34;"> </span>   install_requires=[<span style="color: #98be65;">"pytesseract~=0.3"</span>, <span style="color: #98be65;">"opencv-python~=4.2"</span>,],
+<span style="background-color: #282c34;"> </span>   python_requires=<span style="color: #98be65;">"&gt;=3.6"</span>,
 )
 </pre>
 </div>
 </div>
 </div>
 
-<div id="outline-container-org6a407ab" class="outline-3">
-<h3 id="org6a407ab"><span class="section-number-3">5.2</span> table_ocr</h3>
+<div id="outline-container-org4b36161" class="outline-3">
+<h3 id="org4b36161"><span class="section-number-3">5.2</span> table_ocr</h3>
 <div class="outline-text-3" id="text-5-2">
 </div>
-<div id="outline-container-org465ea6b" class="outline-4">
-<h4 id="org465ea6b"><span class="section-number-4">5.2.1</span> table_ocr/__init__.py</h4>
+<div id="outline-container-orgbdd2fc0" class="outline-4">
+<h4 id="orgbdd2fc0"><span class="section-number-4">5.2.1</span> table_ocr/__init__.py</h4>
 </div>
-<div id="outline-container-org7a49723" class="outline-4">
-<h4 id="org7a49723"><span class="section-number-4">5.2.2</span> table_ocr/util.py</h4>
+<div id="outline-container-org09e5a07" class="outline-4">
+<h4 id="org09e5a07"><span class="section-number-4">5.2.2</span> table_ocr/util.py</h4>
 <div class="outline-text-4" id="text-5-2-2">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">from</span> contextlib <span style="color: #51afef;">import</span> contextmanager
@@ -1413,15 +1521,15 @@ setuptools.setup(
 </div>
 </div>
 
-<div id="outline-container-orgd6e9341" class="outline-4">
-<h4 id="orgd6e9341"><span class="section-number-4">5.2.3</span> table_ocr/pdf_to_images/</h4>
+<div id="outline-container-org5a371bd" class="outline-4">
+<h4 id="org5a371bd"><span class="section-number-4">5.2.3</span> table_ocr/pdf_to_images/</h4>
 <div class="outline-text-4" id="text-5-2-3">
 </div>
-<div id="outline-container-org8a31fd7" class="outline-5">
-<h5 id="org8a31fd7"><span class="section-number-5">5.2.3.1</span> table_ocr/pdf_to_images/__init__.py</h5>
+<div id="outline-container-orgd777fae" class="outline-5">
+<h5 id="orgd777fae"><span class="section-number-5">5.2.3.1</span> table_ocr/pdf_to_images/__init__.py</h5>
 <div class="outline-text-5" id="text-5-2-3-1">
 <div class="org-src-container">
-<pre class="src src-python" id="org1a36eec"><span style="color: #51afef;">import</span> os
+<pre class="src src-python" id="orgdf64015"><span style="color: #51afef;">import</span> os
 <span style="color: #51afef;">import</span> re
 <span style="color: #51afef;">import</span> subprocess
 
@@ -1434,14 +1542,14 @@ setuptools.setup(
 <span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdf_to_images</span>(pdf_filepath):
 <span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Turn a pdf into images</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Returns the filenames of the created images sorted lexicographically.</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">with</span> working_dir(directory):
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image_filenames</span> = pdfimages(pdf_filepath)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image_filenames</span> = pdfimages(pdf_filepath)
 
 <span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Since pdfimages creates a number of files named each for there page number</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">and doesn't return us the list that it created</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> [os.path.join(directory, f) <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> image_filenames]
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> <span style="color: #c678dd;">sorted</span>([os.path.join(directory, f) <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> image_filenames])
 
 
 <span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdfimages</span>(pdf_filepath):
@@ -1454,8 +1562,14 @@ setuptools.setup(
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   uses 3 digits in its regex.</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> os.path.isabs(directory):
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span> = os.path.abspath(directory)
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">filename_sans_ext</span> = filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]
-<span style="background-color: #282c34;"> </span>   subprocess.run([<span style="color: #98be65;">"pdfimages"</span>, <span style="color: #98be65;">"-png"</span>, pdf_filepath, filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]])
+
+<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">pdfimages outputs results to the current working directory</span>
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">with</span> working_dir(directory):
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   subprocess.run([<span style="color: #98be65;">"pdfimages"</span>, <span style="color: #98be65;">"-png"</span>, filename, filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]])
+
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image_filenames</span> = find_matching_files_in_dir(filename_sans_ext, directory)
 <span style="background-color: #282c34;"> </span>   logger.debug(
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #98be65;">"Converted {} into files:\n{}"</span>.<span style="color: #c678dd;">format</span>(pdf_filepath, <span style="color: #98be65;">"\n"</span>.join(image_filenames))
@@ -1471,19 +1585,29 @@ setuptools.setup(
 <span style="background-color: #282c34;"> </span>   ]
 <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> files
 
-<span style="color: #51afef;">def</span> <span style="color: #c678dd;">preprocess_img</span>(filepath):
-<span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""</span>
-<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Processing that involves running shell executables,</span>
+<span style="color: #51afef;">def</span> <span style="color: #c678dd;">preprocess_img</span>(filepath, tess_params=<span style="color: #a9a1e1;">None</span>):
+<span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""Processing that involves running shell executables,</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   like mogrify to rotate.</span>
+
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Uses tesseract to detect rotation.</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">  </span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Orientation and script detection is only available for legacy tesseract</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   (--oem 0). Some versions of tesseract will segfault if you let it run OSD</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   with the default oem (3).</span>
 <span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rotate</span> = get_rotate(filepath)
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> tess_params <span style="color: #51afef;">is</span> <span style="color: #a9a1e1;">None</span>:
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tess_params</span> = [<span style="color: #98be65;">"--psm"</span>, <span style="color: #98be65;">"0"</span>, <span style="color: #98be65;">"--oem"</span>, <span style="color: #98be65;">"0"</span>]
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rotate</span> = get_rotate(filepath, tess_params)
 <span style="background-color: #282c34;"> </span>   logger.debug(<span style="color: #98be65;">"Rotating {} by {}."</span>.<span style="color: #c678dd;">format</span>(filepath, rotate))
 <span style="background-color: #282c34;"> </span>   mogrify(filepath, rotate)
 
 
-<span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_rotate</span>(image_filepath):
+<span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_rotate</span>(image_filepath, tess_params):
+<span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tess_command</span> = [<span style="color: #98be65;">"tesseract"</span>] + tess_params + [image_filepath, <span style="color: #98be65;">"-"</span>]
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">output</span> = (
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   subprocess.check_output([<span style="color: #98be65;">"tesseract"</span>, <span style="color: #98be65;">"--psm"</span>, <span style="color: #98be65;">"0"</span>, image_filepath, <span style="color: #98be65;">"-"</span>])
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   subprocess.check_output(tess_command)
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   .decode(<span style="color: #98be65;">"utf-8"</span>)
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   .split(<span style="color: #98be65;">"\n"</span>)
 <span style="background-color: #282c34;"> </span>   )
@@ -1499,8 +1623,8 @@ setuptools.setup(
 </div>
 </div>
 
-<div id="outline-container-org1e4f0c6" class="outline-5">
-<h5 id="org1e4f0c6"><span class="section-number-5">5.2.3.2</span> table_ocr/pdf_to_images/__main__.py</h5>
+<div id="outline-container-org0064754" class="outline-5">
+<h5 id="org0064754"><span class="section-number-5">5.2.3.2</span> table_ocr/pdf_to_images/__main__.py</h5>
 <div class="outline-text-5" id="text-5-2-3-2">
 <p>
 Takes a variable number of pdf files and creates images out of each page of the
@@ -1520,7 +1644,7 @@ blank line.
 
 
 <div class="org-src-container">
-<pre class="src src-python" id="org2dead1b"><span style="color: #51afef;">import</span> argparse
+<pre class="src src-python" id="orgfe96fa6"><span style="color: #51afef;">import</span> argparse
 
 <span style="color: #51afef;">from</span> table_ocr.util <span style="color: #51afef;">import</span> working_dir, make_tempdir, get_logger
 <span style="color: #51afef;">from</span> table_ocr.pdf_to_images <span style="color: #51afef;">import</span> pdf_to_images, preprocess_img
@@ -1553,15 +1677,16 @@ parser.add_argument(<span style="color: #98be65;">"files"</span>, nargs=<span st
 </div>
 </div>
 
-<div id="outline-container-orgb1142c2" class="outline-4">
-<h4 id="orgb1142c2"><span class="section-number-4">5.2.4</span> table_ocr/extract_tables/</h4>
+<div id="outline-container-org03e58e9" class="outline-4">
+<h4 id="org03e58e9"><span class="section-number-4">5.2.4</span> table_ocr/extract_tables/</h4>
 <div class="outline-text-4" id="text-5-2-4">
 </div>
-<div id="outline-container-org4d226b5" class="outline-5">
-<h5 id="org4d226b5"><span class="section-number-5">5.2.4.1</span> table_ocr/extract_tables/__init__.py</h5>
+<div id="outline-container-orgfedc867" class="outline-5">
+<h5 id="orgfedc867"><span class="section-number-5">5.2.4.1</span> table_ocr/extract_tables/__init__.py</h5>
 <div class="outline-text-5" id="text-5-2-4-1">
 <div class="org-src-container">
-<pre class="src src-python" id="org578a61a"><span style="color: #51afef;">import</span> cv2
+<pre class="src src-python" id="org5f4485f"><span style="color: #51afef;">import</span> os
+<span style="color: #51afef;">import</span> cv2
 
 <span style="color: #51afef;">def</span> <span style="color: #c678dd;">find_tables</span>(image):
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
@@ -1610,13 +1735,35 @@ parser.add_argument(<span style="color: #98be65;">"files"</span>, nargs=<span st
 <span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Leaving that step as a future </span><span style="color: #ECBE7B; font-weight: bold;">TODO</span><span style="color: #5B6268;"> if it is ever necessary.</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">images</span> = [image[y:y+h, x:x+w] <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> bounding_rects]
 <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> images
+
+<span style="color: #51afef;">def</span> <span style="color: #c678dd;">main</span>(files):
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">results</span> = []
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> files:
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(f)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image</span> = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tables</span> = find_tables(image)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">files</span> = []
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">filename_sans_extension</span> = os.path.splitext(filename)[<span style="color: #da8548; font-weight: bold;">0</span>]
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> tables:
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=<span style="color: #a9a1e1;">True</span>)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> i, table <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(tables):
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">table_filename</span> = <span style="color: #98be65;">"table-{:03d}.png"</span>.<span style="color: #c678dd;">format</span>(i)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">table_filepath</span> = os.path.join(
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   directory, filename_sans_extension, table_filename
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   )
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   files.append(table_filepath)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cv2.imwrite(table_filepath, table)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> tables:
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   results.append((f, files))
+<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Results is [[&lt;input image&gt;, [&lt;images of detected tables&gt;]]]</span>
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> results
 </pre>
 </div>
 </div>
 </div>
 
-<div id="outline-container-org28b104f" class="outline-5">
-<h5 id="org28b104f"><span class="section-number-5">5.2.4.2</span> table_ocr/extract_tables/__main__.py</h5>
+<div id="outline-container-org82b2c3a" class="outline-5">
+<h5 id="org82b2c3a"><span class="section-number-5">5.2.4.2</span> table_ocr/extract_tables/__main__.py</h5>
 <div class="outline-text-5" id="text-5-2-4-2">
 <p>
 Takes 1 or more image paths as arguments.
@@ -1648,60 +1795,33 @@ For each image path given as an agument, outputs:
 </ol>
 
 <div class="org-src-container">
-<pre class="src src-python" id="org31f60b7"><span style="color: #51afef;">import</span> argparse
-<span style="color: #51afef;">import</span> os
-
-<span style="color: #51afef;">import</span> cv2
+<pre class="src src-python" id="org8c02fcb"><span style="color: #51afef;">import</span> argparse
 
-<span style="color: #51afef;">from</span> table_ocr.extract_tables <span style="color: #51afef;">import</span> find_tables
+<span style="color: #51afef;">from</span> table_ocr.extract_tables <span style="color: #51afef;">import</span> main
 
 <span style="color: #dcaeea;">parser</span> = argparse.ArgumentParser()
 parser.add_argument(<span style="color: #98be65;">"files"</span>, nargs=<span style="color: #98be65;">"+"</span>)
-
-
-<span style="color: #51afef;">def</span> <span style="color: #c678dd;">main</span>(files):
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">results</span> = []
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> files:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(f)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image</span> = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tables</span> = find_tables(image)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">files</span> = []
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">filename_sans_extension</span> = os.path.splitext(filename)[<span style="color: #da8548; font-weight: bold;">0</span>]
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> tables:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=<span style="color: #a9a1e1;">True</span>)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> i, table <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(tables):
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">table_filename</span> = <span style="color: #98be65;">"table-{:03d}.png"</span>.<span style="color: #c678dd;">format</span>(i)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">table_filepath</span> = os.path.join(
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   directory, filename_sans_extension, table_filename
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   )
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   files.append(table_filepath)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cv2.imwrite(table_filepath, table)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> tables:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   results.append((f, files))
-
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> image_filename, table_filenames <span style="color: #51afef;">in</span> results:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">print</span>(<span style="color: #98be65;">"\n"</span>.join(table_filenames))
-
-
-<span style="color: #51afef;">if</span> <span style="color: #c678dd;">__name__</span> == <span style="color: #98be65;">"__main__"</span>:
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">args</span> = parser.parse_args()
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">files</span> = args.files
-<span style="background-color: #282c34;"> </span>   main(files)
+<span style="color: #dcaeea;">args</span> = parser.parse_args()
+<span style="color: #dcaeea;">files</span> = args.files
+<span style="color: #dcaeea;">results</span> = main(files)
+<span style="color: #51afef;">for</span> image, tables <span style="color: #51afef;">in</span> results:
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">print</span>(<span style="color: #98be65;">"\n"</span>.join(tables))
 </pre>
 </div>
 </div>
 </div>
 </div>
 
-<div id="outline-container-org0d21747" class="outline-4">
-<h4 id="org0d21747"><span class="section-number-4">5.2.5</span> table_ocr/extract_cells/</h4>
+<div id="outline-container-org7ec79e9" class="outline-4">
+<h4 id="org7ec79e9"><span class="section-number-4">5.2.5</span> table_ocr/extract_cells/</h4>
 <div class="outline-text-4" id="text-5-2-5">
 </div>
-<div id="outline-container-org8556f38" class="outline-5">
-<h5 id="org8556f38"><span class="section-number-5">5.2.5.1</span> table_ocr/extract_cells/__init__.py</h5>
+<div id="outline-container-org6d6ddc7" class="outline-5">
+<h5 id="org6d6ddc7"><span class="section-number-5">5.2.5.1</span> table_ocr/extract_cells/__init__.py</h5>
 <div class="outline-text-5" id="text-5-2-5-1">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">import</span> cv2
+<span style="color: #51afef;">import</span> os
 
 <span style="color: #51afef;">def</span> <span style="color: #c678dd;">extract_cell_images_from_table</span>(image):
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
@@ -1798,13 +1918,29 @@ parser.add_argument(<span style="color: #98be65;">"files"</span>, nargs=<span st
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cell_images_row.append(image[y:y+h, x:x+w])
 <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cell_images_rows.append(cell_images_row)
 <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> cell_images_rows
+
+<span style="color: #51afef;">def</span> <span style="color: #c678dd;">main</span>(f):
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">results</span> = []
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(f)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">table</span> = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rows</span> = extract_cell_images_from_table(table)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cell_img_dir</span> = os.path.join(directory, <span style="color: #98be65;">"cells"</span>)
+<span style="background-color: #282c34;"> </span>   os.makedirs(cell_img_dir, exist_ok=<span style="color: #a9a1e1;">True</span>)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">paths</span> = []
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> i, row <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(rows):
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> j, cell <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(row):
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cell_filename</span> = <span style="color: #98be65;">"{:03d}-{:03d}.png"</span>.<span style="color: #c678dd;">format</span>(i, j)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">path</span> = os.path.join(cell_img_dir, cell_filename)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cv2.imwrite(path, cell)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   paths.append(path)
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> paths
 </pre>
 </div>
 </div>
 </div>
 
-<div id="outline-container-orgb988224" class="outline-5">
-<h5 id="orgb988224"><span class="section-number-5">5.2.5.2</span> table_ocr/extract_cells/__main__.py</h5>
+<div id="outline-container-orgd698866" class="outline-5">
+<h5 id="orgd698866"><span class="section-number-5">5.2.5.2</span> table_ocr/extract_cells/__main__.py</h5>
 <div class="outline-text-5" id="text-5-2-5-2">
 <p>
 Takes as a command line argument a path to an image of a table.
@@ -1827,146 +1963,61 @@ cells.
 </p>
 
 <div class="org-src-container">
-<pre class="src src-python"><span style="color: #51afef;">import</span> os
-<span style="color: #51afef;">import</span> sys
-
-<span style="color: #51afef;">import</span> cv2
-
-<span style="color: #51afef;">from</span> table_ocr.extract_cells <span style="color: #51afef;">import</span> extract_cell_images_from_table
+<pre class="src src-python"><span style="color: #51afef;">import</span> sys
 
-<span style="color: #51afef;">def</span> <span style="color: #c678dd;">main</span>(f):
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">results</span> = []
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(f)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">table</span> = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rows</span> = extract_cell_images_from_table(table)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cell_img_dir</span> = os.path.join(directory, <span style="color: #98be65;">"cells"</span>)
-<span style="background-color: #282c34;"> </span>   os.makedirs(cell_img_dir, exist_ok=<span style="color: #a9a1e1;">True</span>)
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> i, row <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(rows):
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> j, cell <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(row):
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cell_filename</span> = <span style="color: #98be65;">"{:03d}-{:03d}.png"</span>.<span style="color: #c678dd;">format</span>(i, j)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">path</span> = os.path.join(cell_img_dir, cell_filename)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cv2.imwrite(path, cell)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">print</span>(path)
-
-
-<span style="color: #51afef;">def</span> <span style="color: #c678dd;">extract_cell_images_from_table</span>(image):
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">STD_DEV_X_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">STD_DEV_Y_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">blurred</span> = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">MAX_COLOR_VAL</span> = <span style="color: #da8548; font-weight: bold;">255</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">BLOCK_SIZE</span> = <span style="color: #da8548; font-weight: bold;">15</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">SUBTRACT_FROM_MEAN</span> = -<span style="color: #da8548; font-weight: bold;">2</span>
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">img_bin</span> = cv2.adaptiveThreshold(
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   ~blurred,
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   MAX_COLOR_VAL,
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cv2.ADAPTIVE_THRESH_MEAN_C,
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cv2.THRESH_BINARY,
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   BLOCK_SIZE,
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   SUBTRACT_FROM_MEAN,
-<span style="background-color: #282c34;"> </span>   )
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">vertical</span> = <span style="color: #dcaeea;">horizontal</span> = img_bin.copy()
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">SCALE</span> = <span style="color: #da8548; font-weight: bold;">5</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image_width</span>, <span style="color: #dcaeea;">image_height</span> = horizontal.shape
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">horizontal_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #c678dd;">int</span>(image_width / SCALE), <span style="color: #da8548; font-weight: bold;">1</span>))
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">horizontally_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">vertical_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #c678dd;">int</span>(image_height / SCALE)))
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">vertically_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">horizontally_dilated</span> = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">40</span>, <span style="color: #da8548; font-weight: bold;">1</span>)))
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">vertically_dilated</span> = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #da8548; font-weight: bold;">60</span>)))
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">mask</span> = horizontally_dilated + vertically_dilated
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">contours</span>, <span style="color: #dcaeea;">heirarchy</span> = cv2.findContours(
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
-<span style="background-color: #282c34;"> </span>   )
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">perimeter_lengths</span> = [cv2.arcLength(c, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> contours]
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">epsilons</span> = [<span style="color: #da8548; font-weight: bold;">0.05</span> * p <span style="color: #51afef;">for</span> p <span style="color: #51afef;">in</span> perimeter_lengths]
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">approx_polys</span> = [cv2.approxPolyDP(c, e, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c, e <span style="color: #51afef;">in</span> <span style="color: #c678dd;">zip</span>(contours, epsilons)]
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Filter out contours that aren't rectangular. Those that aren't rectangular</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">are probably noise.</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">approx_rects</span> = [p <span style="color: #51afef;">for</span> p <span style="color: #51afef;">in</span> approx_polys <span style="color: #51afef;">if</span> <span style="color: #c678dd;">len</span>(p) == <span style="color: #da8548; font-weight: bold;">4</span>]
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">bounding_rects</span> = [cv2.boundingRect(a) <span style="color: #51afef;">for</span> a <span style="color: #51afef;">in</span> approx_polys]
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Filter out rectangles that are too narrow or too short.</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">MIN_RECT_WIDTH</span> = <span style="color: #da8548; font-weight: bold;">40</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">MIN_RECT_HEIGHT</span> = <span style="color: #da8548; font-weight: bold;">10</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">bounding_rects</span> = [
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   r <span style="color: #51afef;">for</span> r <span style="color: #51afef;">in</span> bounding_rects <span style="color: #51afef;">if</span> MIN_RECT_WIDTH &lt; r[<span style="color: #da8548; font-weight: bold;">2</span>] <span style="color: #51afef;">and</span> MIN_RECT_HEIGHT &lt; r[<span style="color: #da8548; font-weight: bold;">3</span>]
-<span style="background-color: #282c34;"> </span>   ]
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">The largest bounding rectangle is assumed to be the entire table.</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Remove it from the list. We don't want to accidentally try to OCR</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">the entire table.</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">largest_rect</span> = <span style="color: #c678dd;">max</span>(bounding_rects, key=<span style="color: #51afef;">lambda</span> r: r[<span style="color: #da8548; font-weight: bold;">2</span>] * r[<span style="color: #da8548; font-weight: bold;">3</span>])
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">bounding_rects</span> = [b <span style="color: #51afef;">for</span> b <span style="color: #51afef;">in</span> bounding_rects <span style="color: #51afef;">if</span> b <span style="color: #51afef;">is</span> <span style="color: #51afef;">not</span> largest_rect]
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cells</span> = [c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> bounding_rects]
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">def</span> <span style="color: #c678dd;">cell_in_same_row</span>(c1, c2):
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">c1_center</span> = c1[<span style="color: #da8548; font-weight: bold;">1</span>] + c1[<span style="color: #da8548; font-weight: bold;">3</span>] - c1[<span style="color: #da8548; font-weight: bold;">3</span>] / <span style="color: #da8548; font-weight: bold;">2</span>
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">c2_bottom</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>] + c2[<span style="color: #da8548; font-weight: bold;">3</span>]
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">c2_top</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>]
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> c2_top &lt; c1_center &lt; c2_bottom
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">orig_cells</span> = [c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> cells]
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rows</span> = []
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">while</span> cells:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">first</span> = cells[<span style="color: #da8548; font-weight: bold;">0</span>]
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">rest</span> = cells[<span style="color: #da8548; font-weight: bold;">1</span>:]
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cells_in_same_row</span> = <span style="color: #c678dd;">sorted</span>(
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   [
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> rest
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> cell_in_same_row(c, first)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   ],
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   key=<span style="color: #51afef;">lambda</span> c: c[<span style="color: #da8548; font-weight: bold;">0</span>]
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   )
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">row_cells</span> = <span style="color: #c678dd;">sorted</span>([first] + cells_in_same_row, key=<span style="color: #51afef;">lambda</span> c: c[<span style="color: #da8548; font-weight: bold;">0</span>])
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   rows.append(row_cells)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cells</span> = [
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> rest
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> cell_in_same_row(c, first)
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   ]
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Sort rows by average height of their center.</span>
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">def</span> <span style="color: #c678dd;">avg_height_of_center</span>(row):
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">centers</span> = [y + h - h / <span style="color: #da8548; font-weight: bold;">2</span> <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> row]
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> <span style="color: #c678dd;">sum</span>(centers) / <span style="color: #c678dd;">len</span>(centers)
-<span style="background-color: #282c34;"> </span>   
-<span style="background-color: #282c34;"> </span>   rows.sort(key=avg_height_of_center)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cell_images_rows</span> = []
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> row <span style="color: #51afef;">in</span> rows:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cell_images_row</span> = []
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> row:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cell_images_row.append(image[y:y+h, x:x+w])
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   cell_images_rows.append(cell_images_row)
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> cell_images_rows
+<span style="color: #51afef;">from</span> table_ocr.extract_cells <span style="color: #51afef;">import</span> main
 
-<span style="color: #51afef;">if</span> <span style="color: #c678dd;">__name__</span> == <span style="color: #98be65;">"__main__"</span>:
-<span style="background-color: #282c34;"> </span>   main(sys.argv[<span style="color: #da8548; font-weight: bold;">1</span>])
+<span style="color: #dcaeea;">paths</span> = main(sys.argv[<span style="color: #da8548; font-weight: bold;">1</span>])
+<span style="color: #51afef;">print</span>(<span style="color: #98be65;">"\n"</span>.join(paths))
 </pre>
 </div>
 </div>
 </div>
 </div>
 
-<div id="outline-container-org04c3633" class="outline-4">
-<h4 id="org04c3633"><span class="section-number-4">5.2.6</span> table_ocr/ocr_image/</h4>
+<div id="outline-container-org5ff2e40" class="outline-4">
+<h4 id="org5ff2e40"><span class="section-number-4">5.2.6</span> table_ocr/ocr_image/</h4>
 <div class="outline-text-4" id="text-5-2-6">
 </div>
-<div id="outline-container-orgd01e3e7" class="outline-5">
-<h5 id="orgd01e3e7"><span class="section-number-5">5.2.6.1</span> table_ocr/ocr_image/__init__.py</h5>
+<div id="outline-container-org1bc0eb3" class="outline-5">
+<h5 id="org1bc0eb3"><span class="section-number-5">5.2.6.1</span> table_ocr/ocr_image/__init__.py</h5>
 <div class="outline-text-5" id="text-5-2-6-1">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">import</span> math
+<span style="color: #51afef;">import</span> os
+<span style="color: #51afef;">import</span> sys
 
 <span style="color: #51afef;">import</span> cv2
 <span style="color: #51afef;">import</span> numpy <span style="color: #51afef;">as</span> np
 <span style="color: #51afef;">import</span> pytesseract
 
+<span style="color: #51afef;">def</span> <span style="color: #c678dd;">main</span>(image_file, tess_args):
+<span style="background-color: #282c34;"> </span>   <span style="color: #83898d;">"""</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   OCR the image and output the text to a file with an extension that is ready</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   to be used in Tesseract training (.gt.txt).</span>
+
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Tries to crop the image so that only the relevant text gets passed to Tesseract.</span>
+
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   Returns the name of the text file that contains the text.</span>
+<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;">   """</span>
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(image_file)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">filename_sans_ext</span>, <span style="color: #dcaeea;">ext</span> = os.path.splitext(filename)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image</span> = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cropped</span> = crop_to_text(image)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">ocr_data_dir</span> = os.path.join(directory, <span style="color: #98be65;">"ocr_data"</span>)
+<span style="background-color: #282c34;"> </span>   os.makedirs(ocr_data_dir, exist_ok=<span style="color: #a9a1e1;">True</span>)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">out_imagepath</span> = os.path.join(ocr_data_dir, filename)
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">out_txtpath</span> = os.path.join(ocr_data_dir, <span style="color: #98be65;">"{}.gt.txt"</span>.<span style="color: #c678dd;">format</span>(filename_sans_ext))
+<span style="background-color: #282c34;"> </span>   cv2.imwrite(out_imagepath, cropped)
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> tess_args:
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">d</span> = os.path.dirname(sys.modules[<span style="color: #98be65;">"table_ocr"</span>].__file__)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tessdata_dir</span> = os.path.join(d, <span style="color: #98be65;">"tessdata"</span>)
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">tess_args</span> = [<span style="color: #98be65;">"--psm"</span>, <span style="color: #98be65;">"7"</span>, <span style="color: #98be65;">"-l"</span>, <span style="color: #98be65;">"table-ocr"</span>, <span style="color: #98be65;">"--tessdata-dir"</span>, tessdata_dir]
+<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">txt</span> = ocr_image(cropped, <span style="color: #98be65;">" "</span>.join(tess_args))
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">with</span> <span style="color: #c678dd;">open</span>(out_txtpath, <span style="color: #98be65;">"w"</span>) <span style="color: #51afef;">as</span> txt_file:
+<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   txt_file.write(txt)
+<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">return</span> out_txtpath
+
 <span style="color: #51afef;">def</span> <span style="color: #c678dd;">crop_to_text</span>(image):
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">MAX_COLOR_VAL</span> = <span style="color: #da8548; font-weight: bold;">255</span>
 <span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">BLOCK_SIZE</span> = <span style="color: #da8548; font-weight: bold;">15</span>
@@ -2022,8 +2073,8 @@ cells.
 </div>
 </div>
 </div>
-<div id="outline-container-org2a58364" class="outline-5">
-<h5 id="org2a58364"><span class="section-number-5">5.2.6.2</span> table_ocr/ocr_image/__main__.py</h5>
+<div id="outline-container-org11f1d0c" class="outline-5">
+<h5 id="org11f1d0c"><span class="section-number-5">5.2.6.2</span> table_ocr/ocr_image/__main__.py</h5>
 <div class="outline-text-5" id="text-5-2-6-2">
 <p>
 This does a little bit of cleanup before sending it through tesseract.
@@ -2036,13 +2087,8 @@ Creates images and text files that can be used for training tesseract. See
 
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">import</span> argparse
-<span style="color: #51afef;">import</span> math
-<span style="color: #51afef;">import</span> os
-<span style="color: #51afef;">import</span> sys
-
-<span style="color: #51afef;">import</span> cv2
 
-<span style="color: #51afef;">from</span> table_ocr.ocr_image <span style="color: #51afef;">import</span> crop_to_text, ocr_image
+<span style="color: #51afef;">from</span> table_ocr.ocr_image <span style="color: #51afef;">import</span> main
 
 <span style="color: #dcaeea;">description</span>=<span style="color: #98be65;">"""Takes a single argument that is the image to OCR.</span>
 <span style="color: #98be65;">Remaining arguments are passed directly to Tesseract.</span>
@@ -2053,35 +2099,19 @@ Creates images and text files that can be used for training tesseract. See
 <span style="color: #dcaeea;">parser</span> = argparse.ArgumentParser(description=description)
 parser.add_argument(<span style="color: #98be65;">"image"</span>, <span style="color: #c678dd;">help</span>=<span style="color: #98be65;">"filepath of image to perform OCR"</span>)
 
-<span style="color: #51afef;">def</span> <span style="color: #c678dd;">main</span>(image_file, tess_args):
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(image_file)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">filename_sans_ext</span>, <span style="color: #dcaeea;">ext</span> = os.path.splitext(filename)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">image</span> = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">cropped</span> = crop_to_text(image)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">ocr_data_dir</span> = os.path.join(directory, <span style="color: #98be65;">"ocr_data"</span>)
-<span style="background-color: #282c34;"> </span>   os.makedirs(ocr_data_dir, exist_ok=<span style="color: #a9a1e1;">True</span>)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">out_imagepath</span> = os.path.join(ocr_data_dir, filename)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">out_txtpath</span> = os.path.join(ocr_data_dir, <span style="color: #98be65;">"{}.gt.txt"</span>.<span style="color: #c678dd;">format</span>(filename_sans_ext))
-<span style="background-color: #282c34;"> </span>   cv2.imwrite(out_imagepath, cropped)
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">txt</span> = ocr_image(cropped, <span style="color: #98be65;">" "</span>.join(tess_args))
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">print</span>(txt)
-<span style="background-color: #282c34;"> </span>   <span style="color: #51afef;">with</span> <span style="color: #c678dd;">open</span>(out_txtpath, <span style="color: #98be65;">"w"</span>) <span style="color: #51afef;">as</span> txt_file:
-<span style="background-color: #282c34;"> </span>   <span style="background-color: #282c34;"> </span>   txt_file.write(txt)
-
-<span style="color: #51afef;">if</span> <span style="color: #c678dd;">__name__</span> == <span style="color: #98be65;">"__main__"</span>:
-<span style="background-color: #282c34;"> </span>   <span style="color: #dcaeea;">args</span>, <span style="color: #dcaeea;">tess_args</span> = parser.parse_known_args()
-<span style="background-color: #282c34;"> </span>   main(args.image, tess_args)
+<span style="color: #dcaeea;">args</span>, <span style="color: #dcaeea;">tess_args</span> = parser.parse_known_args()
+<span style="color: #51afef;">print</span>(main(args.image, tess_args))
 </pre>
 </div>
 </div>
 </div>
 </div>
-<div id="outline-container-org2c74337" class="outline-4">
-<h4 id="org2c74337"><span class="section-number-4">5.2.7</span> table_ocr/ocr_to_csv/</h4>
+<div id="outline-container-org7612c04" class="outline-4">
+<h4 id="org7612c04"><span class="section-number-4">5.2.7</span> table_ocr/ocr_to_csv/</h4>
 <div class="outline-text-4" id="text-5-2-7">
 </div>
-<div id="outline-container-orgf6b3289" class="outline-5">
-<h5 id="orgf6b3289"><span class="section-number-5">5.2.7.1</span> table_ocr/ocr_to_csv/__init__.py</h5>
+<div id="outline-container-orgb76e923" class="outline-5">
+<h5 id="orgb76e923"><span class="section-number-5">5.2.7.1</span> table_ocr/ocr_to_csv/__init__.py</h5>
 <div class="outline-text-5" id="text-5-2-7-1">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">import</span> csv
@@ -2115,8 +2145,8 @@ parser.add_argument(<span style="color: #98be65;">"image"</span>, <span style="c
 </div>
 </div>
 </div>
-<div id="outline-container-org4befe04" class="outline-5">
-<h5 id="org4befe04"><span class="section-number-5">5.2.7.2</span> table_ocr/ocr_to_csv/__main__.py</h5>
+<div id="outline-container-orgb9ce258" class="outline-5">
+<h5 id="orgb9ce258"><span class="section-number-5">5.2.7.2</span> table_ocr/ocr_to_csv/__main__.py</h5>
 <div class="outline-text-5" id="text-5-2-7-2">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">import</span> argparse
@@ -2145,8 +2175,8 @@ parser.add_argument(<span style="color: #98be65;">"files"</span>, nargs=<span st
 </div>
 </div>
 
-<div id="outline-container-org487dcc8" class="outline-2">
-<h2 id="org487dcc8"><span class="section-number-2">6</span> Utils</h2>
+<div id="outline-container-org446b9ad" class="outline-2">
+<h2 id="org446b9ad"><span class="section-number-2">6</span> Utils</h2>
 <div class="outline-text-2" id="text-6">
 <p>
 The following code lets us specify a size for images when they are exported to
@@ -2173,7 +2203,7 @@ with <code>advice-add</code>.
 </p>
 
 <div class="org-src-container">
-<pre class="src src-emacs-lisp" id="org4050aca"><span style="color: #51afef;">(</span><span style="color: #a9a1e1;">concat</span> <span style="color: #98be65;">"#+ATTR_HTML: :width "</span> width <span style="color: #98be65;">" :height "</span> height <span style="color: #98be65;">"\n[[file:"</span> text <span style="color: #98be65;">"]]"</span><span style="color: #51afef;">)</span>
+<pre class="src src-emacs-lisp" id="orgb2f52a9"><span style="color: #51afef;">(</span><span style="color: #a9a1e1;">concat</span> <span style="color: #98be65;">"#+ATTR_HTML: :width "</span> width <span style="color: #98be65;">" :height "</span> height <span style="color: #98be65;">"\n[[file:"</span> text <span style="color: #98be65;">"]]"</span><span style="color: #51afef;">)</span>
 </pre>
 </div>
 
@@ -2195,8 +2225,8 @@ with <code>advice-add</code>.
 </div>
 </div>
 
-<div id="outline-container-org457567f" class="outline-3">
-<h3 id="org457567f"><span class="section-number-3">6.1</span> Logging</h3>
+<div id="outline-container-orgac512bd" class="outline-3">
+<h3 id="orgac512bd"><span class="section-number-3">6.1</span> Logging</h3>
 <div class="outline-text-3" id="text-6-1">
 <div class="org-src-container">
 <pre class="src src-python"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_logger</span>(name):
@@ -2217,7 +2247,7 @@ with <code>advice-add</code>.
 </div>
 <div id="postamble" class="status">
 <p class="author">Author: Eric Ihli</p>
-<p class="date">Created: 2020-04-25 Sat 12:20</p>
+<p class="date">Created: 2020-10-14 Wed 21:28</p>
 </div>
 </body>
 </html>
diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org
index 3d1587b..bc86c47 100644
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@@ -97,14 +97,15 @@ PDF=$1
 python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt
 cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {}  | grep table > /tmp/extracted-tables.txt
 cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt
-cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l table-ocr
-
+cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {}
 for image in $(cat /tmp/extracted-tables.txt); do
     dir=$(dirname $image)
     python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt")
 done
 #+END_SRC
 
+Any extra arguments you pass after the image path to ~python -m table_ocr.ocr_image~ will be passed directly to Tesseract as options. If you don't pass anything, reasonable English defaults are used.
+
 ** Possible improvements
 
 Detect text with the stroke-width-transform alogoritm. https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html
@@ -199,7 +200,7 @@ def preprocess_img(filepath, tess_params=None):
     like mogrify to rotate.
 
     Uses tesseract to detect rotation.
-   
+
     Orientation and script detection is only available for legacy tesseract
     (--oem 0). Some versions of tesseract will segfault if you let it run OSD
     with the default oem (3).