You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2254 lines
176 KiB
HTML
2254 lines
176 KiB
HTML
<?xml version="1.0" encoding="utf-8"?>
|
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
|
|
<head>
|
|
<!-- 2020-10-14 Wed 21:28 -->
|
|
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
<title>PDF Parsing</title>
|
|
<meta name="generator" content="Org mode" />
|
|
<meta name="author" content="Eric Ihli" />
|
|
<style type="text/css">
|
|
<!--/*--><![CDATA[/*><!--*/
|
|
.title { text-align: center;
|
|
margin-bottom: .2em; }
|
|
.subtitle { text-align: center;
|
|
font-size: medium;
|
|
font-weight: bold;
|
|
margin-top:0; }
|
|
.todo { font-family: monospace; color: red; }
|
|
.done { font-family: monospace; color: green; }
|
|
.priority { font-family: monospace; color: orange; }
|
|
.tag { background-color: #eee; font-family: monospace;
|
|
padding: 2px; font-size: 80%; font-weight: normal; }
|
|
.timestamp { color: #bebebe; }
|
|
.timestamp-kwd { color: #5f9ea0; }
|
|
.org-right { margin-left: auto; margin-right: 0px; text-align: right; }
|
|
.org-left { margin-left: 0px; margin-right: auto; text-align: left; }
|
|
.org-center { margin-left: auto; margin-right: auto; text-align: center; }
|
|
.underline { text-decoration: underline; }
|
|
#postamble p, #preamble p { font-size: 90%; margin: .2em; }
|
|
p.verse { margin-left: 3%; }
|
|
pre {
|
|
border: 1px solid #ccc;
|
|
box-shadow: 3px 3px 3px #eee;
|
|
padding: 8pt;
|
|
font-family: monospace;
|
|
overflow: auto;
|
|
margin: 1.2em;
|
|
}
|
|
pre.src {
|
|
position: relative;
|
|
overflow: visible;
|
|
padding-top: 1.2em;
|
|
}
|
|
pre.src:before {
|
|
display: none;
|
|
position: absolute;
|
|
background-color: white;
|
|
top: -10px;
|
|
right: 10px;
|
|
padding: 3px;
|
|
border: 1px solid black;
|
|
}
|
|
pre.src:hover:before { display: inline;}
|
|
/* Languages per Org manual */
|
|
pre.src-asymptote:before { content: 'Asymptote'; }
|
|
pre.src-awk:before { content: 'Awk'; }
|
|
pre.src-C:before { content: 'C'; }
|
|
/* pre.src-C++ doesn't work in CSS */
|
|
pre.src-clojure:before { content: 'Clojure'; }
|
|
pre.src-css:before { content: 'CSS'; }
|
|
pre.src-D:before { content: 'D'; }
|
|
pre.src-ditaa:before { content: 'ditaa'; }
|
|
pre.src-dot:before { content: 'Graphviz'; }
|
|
pre.src-calc:before { content: 'Emacs Calc'; }
|
|
pre.src-emacs-lisp:before { content: 'Emacs Lisp'; }
|
|
pre.src-fortran:before { content: 'Fortran'; }
|
|
pre.src-gnuplot:before { content: 'gnuplot'; }
|
|
pre.src-haskell:before { content: 'Haskell'; }
|
|
pre.src-hledger:before { content: 'hledger'; }
|
|
pre.src-java:before { content: 'Java'; }
|
|
pre.src-js:before { content: 'Javascript'; }
|
|
pre.src-latex:before { content: 'LaTeX'; }
|
|
pre.src-ledger:before { content: 'Ledger'; }
|
|
pre.src-lisp:before { content: 'Lisp'; }
|
|
pre.src-lilypond:before { content: 'Lilypond'; }
|
|
pre.src-lua:before { content: 'Lua'; }
|
|
pre.src-matlab:before { content: 'MATLAB'; }
|
|
pre.src-mscgen:before { content: 'Mscgen'; }
|
|
pre.src-ocaml:before { content: 'Objective Caml'; }
|
|
pre.src-octave:before { content: 'Octave'; }
|
|
pre.src-org:before { content: 'Org mode'; }
|
|
pre.src-oz:before { content: 'OZ'; }
|
|
pre.src-plantuml:before { content: 'Plantuml'; }
|
|
pre.src-processing:before { content: 'Processing.js'; }
|
|
pre.src-python:before { content: 'Python'; }
|
|
pre.src-R:before { content: 'R'; }
|
|
pre.src-ruby:before { content: 'Ruby'; }
|
|
pre.src-sass:before { content: 'Sass'; }
|
|
pre.src-scheme:before { content: 'Scheme'; }
|
|
pre.src-screen:before { content: 'Gnu Screen'; }
|
|
pre.src-sed:before { content: 'Sed'; }
|
|
pre.src-sh:before { content: 'shell'; }
|
|
pre.src-sql:before { content: 'SQL'; }
|
|
pre.src-sqlite:before { content: 'SQLite'; }
|
|
/* additional languages in org.el's org-babel-load-languages alist */
|
|
pre.src-forth:before { content: 'Forth'; }
|
|
pre.src-io:before { content: 'IO'; }
|
|
pre.src-J:before { content: 'J'; }
|
|
pre.src-makefile:before { content: 'Makefile'; }
|
|
pre.src-maxima:before { content: 'Maxima'; }
|
|
pre.src-perl:before { content: 'Perl'; }
|
|
pre.src-picolisp:before { content: 'Pico Lisp'; }
|
|
pre.src-scala:before { content: 'Scala'; }
|
|
pre.src-shell:before { content: 'Shell Script'; }
|
|
pre.src-ebnf2ps:before { content: 'ebnf2ps'; }
|
|
/* additional language identifiers per "defun org-babel-execute"
|
|
in ob-*.el */
|
|
pre.src-cpp:before { content: 'C++'; }
|
|
pre.src-abc:before { content: 'ABC'; }
|
|
pre.src-coq:before { content: 'Coq'; }
|
|
pre.src-groovy:before { content: 'Groovy'; }
|
|
/* additional language identifiers from org-babel-shell-names in
|
|
ob-shell.el: ob-shell is the only babel language using a lambda to put
|
|
the execution function name together. */
|
|
pre.src-bash:before { content: 'bash'; }
|
|
pre.src-csh:before { content: 'csh'; }
|
|
pre.src-ash:before { content: 'ash'; }
|
|
pre.src-dash:before { content: 'dash'; }
|
|
pre.src-ksh:before { content: 'ksh'; }
|
|
pre.src-mksh:before { content: 'mksh'; }
|
|
pre.src-posh:before { content: 'posh'; }
|
|
/* Additional Emacs modes also supported by the LaTeX listings package */
|
|
pre.src-ada:before { content: 'Ada'; }
|
|
pre.src-asm:before { content: 'Assembler'; }
|
|
pre.src-caml:before { content: 'Caml'; }
|
|
pre.src-delphi:before { content: 'Delphi'; }
|
|
pre.src-html:before { content: 'HTML'; }
|
|
pre.src-idl:before { content: 'IDL'; }
|
|
pre.src-mercury:before { content: 'Mercury'; }
|
|
pre.src-metapost:before { content: 'MetaPost'; }
|
|
pre.src-modula-2:before { content: 'Modula-2'; }
|
|
pre.src-pascal:before { content: 'Pascal'; }
|
|
pre.src-ps:before { content: 'PostScript'; }
|
|
pre.src-prolog:before { content: 'Prolog'; }
|
|
pre.src-simula:before { content: 'Simula'; }
|
|
pre.src-tcl:before { content: 'tcl'; }
|
|
pre.src-tex:before { content: 'TeX'; }
|
|
pre.src-plain-tex:before { content: 'Plain TeX'; }
|
|
pre.src-verilog:before { content: 'Verilog'; }
|
|
pre.src-vhdl:before { content: 'VHDL'; }
|
|
pre.src-xml:before { content: 'XML'; }
|
|
pre.src-nxml:before { content: 'XML'; }
|
|
/* add a generic configuration mode; LaTeX export needs an additional
|
|
(add-to-list 'org-latex-listings-langs '(conf " ")) in .emacs */
|
|
pre.src-conf:before { content: 'Configuration File'; }
|
|
|
|
table { border-collapse:collapse; }
|
|
caption.t-above { caption-side: top; }
|
|
caption.t-bottom { caption-side: bottom; }
|
|
td, th { vertical-align:top; }
|
|
th.org-right { text-align: center; }
|
|
th.org-left { text-align: center; }
|
|
th.org-center { text-align: center; }
|
|
td.org-right { text-align: right; }
|
|
td.org-left { text-align: left; }
|
|
td.org-center { text-align: center; }
|
|
dt { font-weight: bold; }
|
|
.footpara { display: inline; }
|
|
.footdef { margin-bottom: 1em; }
|
|
.figure { padding: 1em; }
|
|
.figure p { text-align: center; }
|
|
.equation-container {
|
|
display: table;
|
|
text-align: center;
|
|
width: 100%;
|
|
}
|
|
.equation {
|
|
vertical-align: middle;
|
|
}
|
|
.equation-label {
|
|
display: table-cell;
|
|
text-align: right;
|
|
vertical-align: middle;
|
|
}
|
|
.inlinetask {
|
|
padding: 10px;
|
|
border: 2px solid gray;
|
|
margin: 10px;
|
|
background: #ffffcc;
|
|
}
|
|
#org-div-home-and-up
|
|
{ text-align: right; font-size: 70%; white-space: nowrap; }
|
|
textarea { overflow-x: auto; }
|
|
.linenr { font-size: smaller }
|
|
.code-highlighted { background-color: #ffff00; }
|
|
.org-info-js_info-navigation { border-style: none; }
|
|
#org-info-js_console-label
|
|
{ font-size: 10px; font-weight: bold; white-space: nowrap; }
|
|
.org-info-js_search-highlight
|
|
{ background-color: #ffff00; color: #000000; font-weight: bold; }
|
|
.org-svg { width: 90%; }
|
|
/*]]>*/-->
|
|
</style>
|
|
<script type="text/javascript">
|
|
// @license magnet:?xt=urn:btih:1f739d935676111cfff4b4693e3816e664797050&dn=gpl-3.0.txt GPL-v3-or-Later
|
|
<!--/*--><![CDATA[/*><!--*/
|
|
/**
 * Highlight a code reference together with the element it refers to.
 *
 * Adds the "code-highlighted" class (styled in this page's stylesheet) to
 * both elements. Existing classes are kept, and calling this repeatedly is
 * harmless because adding an already-present class is a no-op.
 *
 * @param {Element} elem - The element that triggered the highlight.
 * @param {string} id - The id of the element to highlight alongside it.
 *   If no element with this id exists, nothing is highlighted.
 */
function CodeHighlightOn(elem, id)
{
  var target = document.getElementById(id);
  if (null != target) {
    elem.classList.add("code-highlighted");
    target.classList.add("code-highlighted");
  }
}

/**
 * Undo the highlight applied by CodeHighlightOn.
 *
 * Removes only the "code-highlighted" class, so whatever classes the
 * elements had before (including none at all) are left exactly as they were.
 *
 * @param {Element} elem - The element that triggered the highlight.
 * @param {string} id - The id of the element highlighted alongside it.
 *   A missing element is tolerated.
 */
function CodeHighlightOff(elem, id)
{
  var target = document.getElementById(id);
  // Always clear the trigger element, even if the target has since vanished.
  elem.classList.remove("code-highlighted");
  if (null != target) {
    target.classList.remove("code-highlighted");
  }
}
|
|
/*]]>*///-->
|
|
// @license-end
|
|
</script>
|
|
</head>
|
|
<body>
|
|
<div id="content">
|
|
<h1 class="title">PDF Parsing</h1>
|
|
<div id="table-of-contents">
|
|
<h2>Table of Contents</h2>
|
|
<div id="text-table-of-contents">
|
|
<ul>
|
|
<li><a href="#org3fab902">1. Overview</a>
|
|
<ul>
|
|
<li><a href="#orgaf477b8">1.1. Requirements</a>
|
|
<ul>
|
|
<li><a href="#org43dd3dc">1.1.1. Python packages</a></li>
|
|
<li><a href="#org8927075">1.1.2. External</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org14c36da">1.2. Contributing</a></li>
|
|
<li><a href="#org8aef2ca">1.3. Example usage</a></li>
|
|
<li><a href="#org7e5cd11">1.4. Possible improvements</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org51af43b">2. Preparing data</a>
|
|
<ul>
|
|
<li><a href="#orga4dde96">2.1. Converting PDFs to images</a></li>
|
|
<li><a href="#org6c75ffa">2.2. Detecting image orientation and applying rotation.</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#orgc195fec">3. Detecting tables</a>
|
|
<ul>
|
|
<li><a href="#org9dd75a6">3.1. Improving accuracy</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org904debc">4. OCR tables</a>
|
|
<ul>
|
|
<li><a href="#orgb03a965">4.1. Training Tesseract</a>
|
|
<ul>
|
|
<li><a href="#org5adeb27">4.1.1. Training tips</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org152ead5">4.2. Blur</a></li>
|
|
<li><a href="#org858fb89">4.3. Threshold</a></li>
|
|
<li><a href="#orgcf17042">4.4. Finding the vertical and horizontal lines of the table</a></li>
|
|
<li><a href="#org94f71b3">4.5. Finding the contours</a></li>
|
|
<li><a href="#orgc64b6ef">4.6. Sorting the bounding rectangles</a></li>
|
|
<li><a href="#orgd4dc4cc">4.7. Cropping each cell to the text</a></li>
|
|
<li><a href="#org22a3e7b">4.8. OCR each cell</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org9d0b21d">5. Files</a>
|
|
<ul>
|
|
<li><a href="#orgd57e56a">5.1. setup.py</a></li>
|
|
<li><a href="#org4b36161">5.2. table_ocr</a>
|
|
<ul>
|
|
<li><a href="#orgbdd2fc0">5.2.1. table_ocr/__init__.py</a></li>
|
|
<li><a href="#org09e5a07">5.2.2. table_ocr/util.py</a></li>
|
|
<li><a href="#org5a371bd">5.2.3. table_ocr/pdf_to_images/</a>
|
|
<ul>
|
|
<li><a href="#orgd777fae">5.2.3.1. table_ocr/pdf_to_images/__init__.py</a></li>
|
|
<li><a href="#org0064754">5.2.3.2. table_ocr/pdf_to_images/__main__.py</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org03e58e9">5.2.4. table_ocr/extract_tables/</a>
|
|
<ul>
|
|
<li><a href="#orgfedc867">5.2.4.1. table_ocr/extract_tables/__init__.py</a></li>
|
|
<li><a href="#org82b2c3a">5.2.4.2. table_ocr/extract_tables/__main__.py</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org7ec79e9">5.2.5. table_ocr/extract_cells/</a>
|
|
<ul>
|
|
<li><a href="#org6d6ddc7">5.2.5.1. table_ocr/extract_cells/__init__.py</a></li>
|
|
<li><a href="#orgd698866">5.2.5.2. table_ocr/extract_cells/__main__.py</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org5ff2e40">5.2.6. table_ocr/ocr_image/</a>
|
|
<ul>
|
|
<li><a href="#org1bc0eb3">5.2.6.1. table_ocr/ocr_image/__init__.py</a></li>
|
|
<li><a href="#org11f1d0c">5.2.6.2. table_ocr/ocr_image/__main__.py</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org7612c04">5.2.7. table_ocr/ocr_to_csv/</a>
|
|
<ul>
|
|
<li><a href="#orgb76e923">5.2.7.1. table_ocr/ocr_to_csv/__init__.py</a></li>
|
|
<li><a href="#orgb9ce258">5.2.7.2. table_ocr/ocr_to_csv/__main__.py</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#org446b9ad">6. Utils</a>
|
|
<ul>
|
|
<li><a href="#orgac512bd">6.1. Logging</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org3fab902" class="outline-2">
|
|
<h2 id="org3fab902"><span class="section-number-2">1</span> Overview</h2>
|
|
<div class="outline-text-2" id="text-1">
|
|
<p>
|
|
This Python package provides utilities for extracting tabular data from PDF
|
|
files and images of tables.
|
|
</p>
|
|
|
|
<p>
|
|
Given an image that contains a table…
|
|
</p>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-page.png" alt="Example page image containing a table" width="25%" />
|
|
</p>
|
|
</div>
|
|
|
|
<p>
|
|
Extract the text into a CSV format…
|
|
</p>
|
|
|
|
<pre class="example">
|
|
PRIZE,ODDS 1 IN:,# OF WINNERS*
|
|
$3,9.09,"282,447"
|
|
$5,16.66,"154,097"
|
|
$7,40.01,"64,169"
|
|
$10,26.67,"96,283"
|
|
$20,100.00,"25,677"
|
|
$30,290.83,"8,829"
|
|
$50,239.66,"10,714"
|
|
$100,919.66,"2,792"
|
|
$500,"6,652.07",386
|
|
"$40,000","855,899.99",3
|
|
1,i223,
|
|
Toa,,
|
|
,,
|
|
,,"* Based upon 2,567,700"
|
|
</pre>
|
|
|
|
<p>
|
|
The package is split into modules with narrow focuses.
|
|
</p>
|
|
|
|
<ul class="org-ul">
|
|
<li><code>pdf_to_images</code> uses Poppler and ImageMagick to extract images from a PDF.</li>
|
|
<li><code>extract_tables</code> finds and extracts table-looking things from an image.</li>
|
|
<li><code>extract_cells</code> extracts and orders cells from a table.</li>
|
|
<li><code>ocr_image</code> uses Tesseract to OCR the text from an image of a cell.</li>
|
|
<li><code>ocr_to_csv</code> converts into a CSV the directory structure that <code>ocr_image</code> outputs.</li>
|
|
</ul>
|
|
</div>
|
|
|
|
<div id="outline-container-orgaf477b8" class="outline-3">
|
|
<h3 id="orgaf477b8"><span class="section-number-3">1.1</span> Requirements</h3>
|
|
<div class="outline-text-3" id="text-1-1">
|
|
</div>
|
|
<div id="outline-container-org43dd3dc" class="outline-4">
|
|
<h4 id="org43dd3dc"><span class="section-number-4">1.1.1</span> Python packages</h4>
|
|
<div class="outline-text-4" id="text-1-1-1">
|
|
<ul class="org-ul">
|
|
<li>numpy</li>
|
|
<li>opencv-python</li>
|
|
<li>pytesseract</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org8927075" class="outline-4">
|
|
<h4 id="org8927075"><span class="section-number-4">1.1.2</span> External</h4>
|
|
<div class="outline-text-4" id="text-1-1-2">
|
|
<ul class="org-ul">
|
|
<li><code>pdfimages</code> from Poppler</li>
|
|
<li>Tesseract</li>
|
|
<li><code>mogrify</code> from ImageMagick</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org14c36da" class="outline-3">
|
|
<h3 id="org14c36da"><span class="section-number-3">1.2</span> Contributing</h3>
|
|
<div class="outline-text-3" id="text-1-2">
|
|
<p>
|
|
This package was created in a <a href="https://en.wikipedia.org/wiki/Literate_programming">literate programming</a> style with the help of <a href="https://orgmode.org/worg/org-contrib/babel/intro.html">Babel</a>.
|
|
</p>
|
|
|
|
<p>
|
|
The unfortunate downside is the obscurity of the tooling. It creates a bit of a
|
|
barrier for contributors who aren’t already familiar with Emacs and Babel.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org8aef2ca" class="outline-3">
|
|
<h3 id="org8aef2ca"><span class="section-number-3">1.3</span> Example usage</h3>
|
|
<div class="outline-text-3" id="text-1-3">
|
|
<p>
|
|
Here is an example of a shell script that uses each module to turn a pdf with a
|
|
table into CSV output.
|
|
</p>
|
|
|
|
<p>
|
|
Depending on your needs, you may not need all of these steps. If you already
|
|
have an image of a table, you can jump straight to extracting the cells.
|
|
</p>
|
|
|
|
<p>
|
|
Each piece is its own python module, so you can also simply import the pieces
|
|
you need into your own python projects and use them as needed.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-shell" id="org5f5c842"><span style="color: #5B6268;">#</span><span style="color: #5B6268;">!/bin/</span><span style="color: #51afef;">sh</span>
|
|
|
|
<span style="color: #dcaeea;">PDF</span>=$<span style="color: #da8548; font-weight: bold;">1</span>
|
|
|
|
python -m table_ocr.pdf_to_images $<span style="color: #dcaeea;">PDF</span> | <span style="color: #ECBE7B;">grep</span> .png > /tmp/pdf-images.txt
|
|
<span style="color: #ECBE7B;">cat</span> /tmp/pdf-images.txt | xargs -I<span style="color: #51afef;">{}</span> python -m table_ocr.extract_tables <span style="color: #51afef;">{}</span> | <span style="color: #ECBE7B;">grep</span> table > /tmp/extracted-tables.txt
|
|
<span style="color: #ECBE7B;">cat</span> /tmp/extracted-tables.txt | xargs -I<span style="color: #51afef;">{}</span> python -m table_ocr.extract_cells <span style="color: #51afef;">{}</span> | <span style="color: #ECBE7B;">grep</span> cells > /tmp/extracted-cells.txt
|
|
<span style="color: #ECBE7B;">cat</span> /tmp/extracted-cells.txt | xargs -I<span style="color: #51afef;">{}</span> python -m table_ocr.ocr_image <span style="color: #51afef;">{}</span>
|
|
<span style="color: #51afef;">for</span> image<span style="color: #51afef;"> in</span> $<span style="color: #51afef;">(</span><span style="color: #ECBE7B;">cat</span> /tmp/extracted-tables.txt<span style="color: #51afef;">)</span>; <span style="color: #51afef;">do</span>
|
|
<span style="color: #dcaeea;">dir</span>=$<span style="color: #51afef;">(</span>dirname $<span style="color: #dcaeea;">image</span><span style="color: #51afef;">)</span>
|
|
python -m table_ocr.ocr_to_csv $<span style="color: #51afef;">(</span><span style="color: #ECBE7B;">find</span> $<span style="color: #dcaeea;">dir</span>/cells -name <span style="color: #98be65;">"*.txt"</span><span style="color: #51afef;">)</span>
|
|
<span style="color: #51afef;">done</span>
|
|
</pre>
|
|
</div>
|
|
|
|
<p>
|
|
Any extra args you pass after the image path to <code>python -m table_ocr.ocr_image</code> will be passed directly to tesseract as options. If you don’t pass anything, reasonable English defaults are used.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org7e5cd11" class="outline-3">
|
|
<h3 id="org7e5cd11"><span class="section-number-3">1.4</span> Possible improvements</h3>
|
|
<div class="outline-text-3" id="text-1-4">
|
|
<p>
|
|
Detect text with the stroke-width-transform algorithm. <a href="https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html">https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html</a>
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org51af43b" class="outline-2">
|
|
<h2 id="org51af43b"><span class="section-number-2">2</span> Preparing data</h2>
|
|
<div class="outline-text-2" id="text-2">
|
|
<p>
|
|
Not all pdfs need to be sent through OCR to extract the text content. If you can
|
|
click and drag to highlight text in the pdf, then the tools in this library
|
|
probably aren’t necessary.
|
|
</p>
|
|
</div>
|
|
|
|
<div id="outline-container-orga4dde96" class="outline-3">
|
|
<h3 id="orga4dde96"><span class="section-number-3">2.1</span> Converting PDFs to images</h3>
|
|
<div class="outline-text-3" id="text-2-1">
|
|
<p>
|
|
This code calls out to <a href="https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html">pdfimages</a> from <a href="https://poppler.freedesktop.org/">Poppler</a>.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python" id="orgdb8901b"><span style="color: #5B6268;"># </span><span style="color: #5B6268;">Wrapper around the Poppler command line utility "pdfimages" and helpers for</span>
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">finding the output files of that command.</span>
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdf_to_images</span>(pdf_filepath):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Turn a pdf into images</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Returns the filenames of the created images sorted lexicographically.</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">image_filenames</span> = pdfimages(pdf_filepath)
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Since pdfimages creates a number of files named each for their page number</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">and doesn't return us the list that it created</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> <span style="color: #c678dd;">sorted</span>([os.path.join(directory, f) <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> image_filenames])
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdfimages</span>(pdf_filepath):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Uses the `pdfimages` utility from Poppler</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> (https://poppler.freedesktop.org/). Creates images out of each page. Images</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> are prefixed by their name sans extension and suffixed by their page number.</span>
|
|
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> This should work up to pdfs with 999 pages since find matching files in dir</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> uses 3 digits in its regex.</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> os.path.isabs(directory):
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">directory</span> = os.path.abspath(directory)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">filename_sans_ext</span> = filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">pdfimages outputs results to the current working directory</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">with</span> working_dir(directory):
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> subprocess.run([<span style="color: #98be65;">"pdfimages"</span>, <span style="color: #98be65;">"-png"</span>, filename, filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]])
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">image_filenames</span> = find_matching_files_in_dir(filename_sans_ext, directory)
|
|
<span style="background-color: #282c34;"> </span> logger.debug(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #98be65;">"Converted {} into files:\n{}"</span>.<span style="color: #c678dd;">format</span>(pdf_filepath, <span style="color: #98be65;">"\n"</span>.join(image_filenames))
|
|
<span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> image_filenames
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">find_matching_files_in_dir</span>(file_prefix, directory):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">files</span> = [
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> filename
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">for</span> filename <span style="color: #51afef;">in</span> os.listdir(directory)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> re.match(r<span style="color: #98be65;">"{}-\d{{3}}.*\.png"</span>.<span style="color: #c678dd;">format</span>(re.escape(file_prefix)), filename)
|
|
<span style="background-color: #282c34;"> </span> ]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> files
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org6c75ffa" class="outline-3">
|
|
<h3 id="org6c75ffa"><span class="section-number-3">2.2</span> Detecting image orientation and applying rotation.</h3>
|
|
<div class="outline-text-3" id="text-2-2">
|
|
<p>
|
|
Tesseract can detect orientation and we can then use <a href="https://www.imagemagick.org/script/mogrify.php">ImageMagick’s mogrify</a> to
|
|
rotate the image.
|
|
</p>
|
|
|
|
<p>
|
|
Here’s an example of the output we get from orientation detection with
|
|
Tesseract.
|
|
</p>
|
|
|
|
<pre class="example">
|
|
➜ example/ tesseract --psm 0 example-000.png -
|
|
Page number: 0
|
|
Orientation in degrees: 90
|
|
Rotate: 270
|
|
Orientation confidence: 26.86
|
|
Script: Latin
|
|
Script confidence: 2.44
|
|
</pre>
|
|
|
|
<p>
|
|
The following are some helpers to detect orientation of the images that Poppler
|
|
extracted and, if the images are rotated or skewed, use ImageMagick’s <code>mogrify</code>
|
|
to correct the rotation. This makes OCR more straightforward.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python" id="org44f8315"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">preprocess_img</span>(filepath, tess_params=<span style="color: #a9a1e1;">None</span>):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""Processing that involves running shell executables,</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> like mogrify to rotate.</span>
|
|
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Uses tesseract to detect rotation.</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> </span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Orientation and script detection is only available for legacy tesseract</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> (--oem 0). Some versions of tesseract will segfault if you let it run OSD</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> with the default oem (3).</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> tess_params <span style="color: #51afef;">is</span> <span style="color: #a9a1e1;">None</span>:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">tess_params</span> = [<span style="color: #98be65;">"--psm"</span>, <span style="color: #98be65;">"0"</span>, <span style="color: #98be65;">"--oem"</span>, <span style="color: #98be65;">"0"</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">rotate</span> = get_rotate(filepath, tess_params)
|
|
<span style="background-color: #282c34;"> </span> logger.debug(<span style="color: #98be65;">"Rotating {} by {}."</span>.<span style="color: #c678dd;">format</span>(filepath, rotate))
|
|
<span style="background-color: #282c34;"> </span> mogrify(filepath, rotate)
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_rotate</span>(image_filepath, tess_params):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">tess_command</span> = [<span style="color: #98be65;">"tesseract"</span>] + tess_params + [image_filepath, <span style="color: #98be65;">"-"</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">output</span> = (
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> subprocess.check_output(tess_command)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> .decode(<span style="color: #98be65;">"utf-8"</span>)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> .split(<span style="color: #98be65;">"\n"</span>)
|
|
<span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">output</span> = <span style="color: #c678dd;">next</span>(l <span style="color: #51afef;">for</span> l <span style="color: #51afef;">in</span> output <span style="color: #51afef;">if</span> <span style="color: #98be65;">"Rotate: "</span> <span style="color: #51afef;">in</span> l)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">output</span> = output.split(<span style="color: #98be65;">": "</span>)[<span style="color: #da8548; font-weight: bold;">1</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> output
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">mogrify</span>(image_filepath, rotate):
|
|
<span style="background-color: #282c34;"> </span> subprocess.run([<span style="color: #98be65;">"mogrify"</span>, <span style="color: #98be65;">"-rotate"</span>, rotate, image_filepath])
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-orgc195fec" class="outline-2">
|
|
<h2 id="orgc195fec"><span class="section-number-2">3</span> Detecting tables</h2>
|
|
<div class="outline-text-2" id="text-3">
|
|
<p>
|
|
This answer from opencv.org was heavily referenced while writing the code around
|
|
table detection:
|
|
<a href="https://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/">https://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/</a>.
|
|
</p>
|
|
|
|
<p>
|
|
It’s much easier to OCR a table when the table is the only thing in the image.
|
|
This code detects tables in an image and returns a list of images of just the
|
|
tables, no surrounding text or noise.
|
|
</p>
|
|
|
|
<p>
|
|
The blurring, thresholding, and line detection are used here as well as later on
|
|
for cell extraction. They are good techniques for cleaning an image up in a way
|
|
that makes things like shape detection more accurate.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python" id="orgd821c1d"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">find_tables</span>(image):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">STD_DEV_X_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">STD_DEV_Y_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">blurred</span> = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">MAX_COLOR_VAL</span> = <span style="color: #da8548; font-weight: bold;">255</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">BLOCK_SIZE</span> = <span style="color: #da8548; font-weight: bold;">15</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">SUBTRACT_FROM_MEAN</span> = -<span style="color: #da8548; font-weight: bold;">2</span>
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">img_bin</span> = cv2.adaptiveThreshold(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> ~blurred,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> MAX_COLOR_VAL,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.ADAPTIVE_THRESH_MEAN_C,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.THRESH_BINARY,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> BLOCK_SIZE,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> SUBTRACT_FROM_MEAN,
|
|
<span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertical</span> = <span style="color: #dcaeea;">horizontal</span> = img_bin.copy()
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">SCALE</span> = <span style="color: #da8548; font-weight: bold;">5</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">image_width</span>, <span style="color: #dcaeea;">image_height</span> = horizontal.shape
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontal_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #c678dd;">int</span>(image_width / SCALE), <span style="color: #da8548; font-weight: bold;">1</span>))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontally_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertical_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #c678dd;">int</span>(image_height / SCALE)))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertically_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontally_dilated</span> = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">40</span>, <span style="color: #da8548; font-weight: bold;">1</span>)))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertically_dilated</span> = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #da8548; font-weight: bold;">60</span>)))
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">mask</span> = horizontally_dilated + vertically_dilated
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">contours</span>, <span style="color: #dcaeea;">heirarchy</span> = cv2.findContours(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
|
<span style="background-color: #282c34;"> </span> )
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">MIN_TABLE_AREA</span> = 1e5
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">contours</span> = [c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> contours <span style="color: #51afef;">if</span> cv2.contourArea(c) > MIN_TABLE_AREA]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">perimeter_lengths</span> = [cv2.arcLength(c, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> contours]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">epsilons</span> = [<span style="color: #da8548; font-weight: bold;">0.1</span> * p <span style="color: #51afef;">for</span> p <span style="color: #51afef;">in</span> perimeter_lengths]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">approx_polys</span> = [cv2.approxPolyDP(c, e, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c, e <span style="color: #51afef;">in</span> <span style="color: #c678dd;">zip</span>(contours, epsilons)]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">bounding_rects</span> = [cv2.boundingRect(a) <span style="color: #51afef;">for</span> a <span style="color: #51afef;">in</span> approx_polys]
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">The link where a lot of this code was borrowed from recommends an</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">additional step to check the number of "joints" inside this bounding rectangle.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">A table should have a lot of intersections. We might have a rectangular image</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">here though which would only have 4 intersections, 1 at each corner.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Leaving that step as a future </span><span style="color: #ECBE7B; font-weight: bold;">TODO</span><span style="color: #5B6268;"> if it is ever necessary.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">images</span> = [image[y:y+h, x:x+w] <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> bounding_rects]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> images
|
|
</pre>
|
|
</div>
|
|
|
|
<p>
|
|
Here is an example of the result of the <code>find_tables</code> function.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">import</span> cv2
|
|
|
|
<span style="color: #dcaeea;">image_filename</span> = <span style="color: #98be65;">"resources/examples/example-page.png"</span>
|
|
<span style="color: #dcaeea;">image</span> = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE)
|
|
<span style="color: #dcaeea;">image</span> = find_tables(image)[<span style="color: #da8548; font-weight: bold;">0</span>]
|
|
cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table.png"</span>, image)
|
|
</pre>
|
|
</div>
|
|
|
|
<div class="org-center">
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-page.png" alt="example-page.png" width="250px" />
|
|
</p>
|
|
</div>
|
|
|
|
<p>
|
|
↓
|
|
</p>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-table.png" alt="example-table.png" width="250px" />
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org9dd75a6" class="outline-3">
|
|
<h3 id="org9dd75a6"><span class="section-number-3">3.1</span> Improving accuracy</h3>
|
|
<div class="outline-text-3" id="text-3-1">
|
|
<p>
|
|
It’s likely that some images will contain tables that aren’t accurately
|
|
recognized by the code above. The code will then need to be made more robust.
|
|
But how will we know that changes to the code don’t break the detection of
|
|
tables that were previously detected?
|
|
</p>
|
|
|
|
<p>
|
|
It might be good to add some type of test suite in the future that contains a
|
|
spec that matches a pdf with the pages and pixel coordinates of the detected
|
|
tables. The coordinates would need to have a range. Something like
|
|
“example-1.pdf, page-2.png, [450:470, 200:210, 800:820, 1270:1290]” where the
|
|
elements of the list are valid x, y, w, h ranges. So the test will pass if
|
|
the x, y, width and height are anywhere in that range.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org904debc" class="outline-2">
|
|
<h2 id="org904debc"><span class="section-number-2">4</span> OCR tables</h2>
|
|
<div class="outline-text-2" id="text-4">
|
|
<p>
|
|
Tesseract does not perform well when run on images of tables. It performs best
|
|
when given a single line of text with no extra noise.
|
|
</p>
|
|
|
|
<p>
|
|
Therefore, our next task is to find and extract the bounding box of each cell in
|
|
the table. Run tesseract on each cell. Print a comma-separated output.
|
|
</p>
|
|
|
|
<p>
|
|
We’ll start with an image shown at the end of the previous section.
|
|
</p>
|
|
</div>
|
|
|
|
<div id="outline-container-orgb03a965" class="outline-3">
|
|
<h3 id="orgb03a965"><span class="section-number-3">4.1</span> Training Tesseract</h3>
|
|
<div class="outline-text-3" id="text-4-1">
|
|
<p>
|
|
Tesseract is used for recognizing characters. It is not involved in extracting the tables from an image or in extracting cells from the table.
|
|
</p>
|
|
|
|
<p>
|
|
It’s a very good idea to train tesseract. Accuracy will improve tremendously.
|
|
</p>
|
|
|
|
<p>
|
|
Clone the tesstrain repo at <a href="https://github.com/tesseract-ocr/tesstrain">https://github.com/tesseract-ocr/tesstrain</a>.
|
|
</p>
|
|
|
|
<p>
|
|
Run the <a href="#org5f5c842"><code>ocr_tables</code></a> script on a few pdfs to generate some training data. That
|
|
script outputs pairs of <code>.png</code> and <code>.gt.txt</code> files that can be used by
|
|
tesstrain.
|
|
</p>
|
|
|
|
<p>
|
|
Make sure the <code>.gt.txt</code> files contain an accurate recognition of the
|
|
corresponding image. Since the first few runs will be untrained, you’ll probably
|
|
need to fix up a few of the text files.
|
|
</p>
|
|
|
|
<p>
|
|
Once they are accurate, move them to a new subdirectory of the tesstrain repo;
|
|
<code>tesstrain/data/&lt;model-name&gt;-ground-truth/</code>.
|
|
</p>
|
|
|
|
<p>
|
|
You’ll also need to clone the <code>tessdata_best</code> repo,
|
|
<a href="https://github.com/tesseract-ocr/tessdata_best">https://github.com/tesseract-ocr/tessdata_best</a> and the
|
|
<a href="https://github.com/tesseract-ocr/langdata">https://github.com/tesseract-ocr/langdata</a> to use as the start of the
|
|
training model.
|
|
</p>
|
|
|
|
<p>
|
|
I’m actually not sure how much the punctuation and numbers from <code>langdata</code> help.
|
|
I didn’t keep accurate records while playing with the training, I don’t
|
|
thoroughly understand it, and it’s not profitable for me to explore it at the
|
|
moment. It worked for my purposes and that has been good enough.
|
|
</p>
|
|
|
|
<pre class="example">
|
|
make training MODEL_NAME=table-ocr START_MODEL=eng TESSDATA=~/src/tessdata_best PUNC_FILE=~/src/langdata/eng/eng.punc NUMBERS_FILE=~/src/langdata/eng/eng.numbers
|
|
</pre>
|
|
|
|
<p>
|
|
Once the training is complete, there will be a new file
|
|
<code>tesstrain/data/&lt;model-name&gt;.traineddata</code>. Copy that file to the directory
|
|
Tesseract searches for models. On my machine, it was <code>/usr/local/share/tessdata/</code>.
|
|
</p>
|
|
</div>
|
|
|
|
<div id="outline-container-org5adeb27" class="outline-4">
|
|
<h4 id="org5adeb27"><span class="section-number-4">4.1.1</span> Training tips</h4>
|
|
<div class="outline-text-4" id="text-4-1-1">
|
|
<p>
|
|
Here is a tip for quickly creating training data.
|
|
</p>
|
|
|
|
<p>
|
|
The output of the <code>ocr_cells</code> script will be a directory named <code>ocr_data</code> that
|
|
will have two files for each cell. One file is the image of the cell and the
|
|
other file is the OCR text.
|
|
</p>
|
|
|
|
<p>
|
|
You’ll want to compare each image to its OCR text to check for accuracy. If
|
|
the text doesn’t match, you’ll want to update the text and add the image to the
|
|
training data.
|
|
</p>
|
|
|
|
<p>
|
|
The fastest way to do this is with <code>feh</code>.
|
|
</p>
|
|
|
|
<p>
|
|
<code>feh</code> lets you view an image and a caption at the same time and lets you edit
|
|
the caption from within <code>feh</code>.
|
|
</p>
|
|
|
|
<p>
|
|
<code>feh</code> expects the captions to be named <code>&lt;image-name&gt;.txt</code>, so use a little
|
|
shell-fu to do a quick rename.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-shell"><span style="color: #51afef;">for</span> f<span style="color: #51afef;"> in</span> *.txt; <span style="color: #51afef;">do</span> <span style="color: #dcaeea;">f1</span>=$<span style="color: #51afef;">(</span>cut -d<span style="color: #98be65;">"."</span> -f1 <<span style="color: #c678dd;">(</span><span style="color: #ECBE7B;">echo</span> $<span style="color: #dcaeea;">f</span><span style="color: #c678dd;">)</span><span style="color: #51afef;">)</span>; <span style="color: #ECBE7B;">mv</span> $<span style="color: #dcaeea;">f</span> $<span style="color: #51afef;">{</span><span style="color: #dcaeea;">f1</span><span style="color: #51afef;">}</span>.png.txt; <span style="color: #51afef;">done</span>
|
|
</pre>
|
|
</div>
|
|
|
|
<p>
|
|
Then run <code>feh -K .</code> to specify the current directory as the caption directory.
|
|
This will open a window with the first image in the directory and its caption.
|
|
</p>
|
|
|
|
<p>
|
|
Press <code>c</code> to edit the caption (if needed) and <code>n</code>/<code>p</code> to move to the
|
|
next/previous images. Press <code>q</code> to quit.
|
|
</p>
|
|
|
|
<p>
|
|
When finished, rename the files back to the filename structure that Tesseract
|
|
looks for in training.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-shell"><span style="color: #51afef;">for</span> f<span style="color: #51afef;"> in</span> *.txt; <span style="color: #51afef;">do</span> <span style="color: #dcaeea;">f1</span>=$<span style="color: #51afef;">(</span>cut -d<span style="color: #98be65;">"."</span> -f1 <<span style="color: #c678dd;">(</span><span style="color: #ECBE7B;">echo</span> $<span style="color: #dcaeea;">f</span><span style="color: #c678dd;">)</span><span style="color: #51afef;">)</span>; <span style="color: #ECBE7B;">mv</span> $<span style="color: #dcaeea;">f</span> $<span style="color: #51afef;">{</span><span style="color: #dcaeea;">f1</span><span style="color: #51afef;">}</span>.gt.txt; <span style="color: #51afef;">done</span>
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org152ead5" class="outline-3">
|
|
<h3 id="org152ead5"><span class="section-number-3">4.2</span> Blur</h3>
|
|
<div class="outline-text-3" id="text-4-2">
|
|
<p>
|
|
Blurring helps to make noise less noisy so that the overall structure of an
|
|
image is more detectable.
|
|
</p>
|
|
|
|
<p>
|
|
That gray row at the bottom is kind of noisy. If we don’t somehow clean it up,
|
|
OpenCV will detect all sorts of odd shapes in there and it will throw off our
|
|
cell detection.
|
|
</p>
|
|
|
|
<p>
|
|
Cleanup can be accomplished with a blur followed by some thresholding.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
|
|
<span style="color: #dcaeea;">STD_DEV_X_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
|
|
<span style="color: #dcaeea;">STD_DEV_Y_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
|
|
<span style="color: #dcaeea;">blurred</span> = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
|
|
</pre>
|
|
</div>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #dcaeea;">image</span> = ~cv2.imread(<span style="color: #98be65;">"resources/examples/example-table.png"</span>, cv2.IMREAD_GRAYSCALE)
|
|
&lt;&lt;blur&gt;&gt;
|
|
cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-blurred.png"</span>, blurred)
|
|
</pre>
|
|
</div>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-table-blurred.png" alt="example-table-blurred.png" width="500px" height="100%" />
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org858fb89" class="outline-3">
|
|
<h3 id="org858fb89"><span class="section-number-3">4.3</span> Threshold</h3>
|
|
<div class="outline-text-3" id="text-4-3">
|
|
<p>
|
|
We’ve got a bunch of pixels that are gray. Thresholding will turn them all
|
|
either black or white. Having all black or white pixels lets us do morphological
|
|
transformations like erosion and dilation.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #dcaeea;">MAX_COLOR_VAL</span> = <span style="color: #da8548; font-weight: bold;">255</span>
|
|
<span style="color: #dcaeea;">BLOCK_SIZE</span> = <span style="color: #da8548; font-weight: bold;">15</span>
|
|
<span style="color: #dcaeea;">SUBTRACT_FROM_MEAN</span> = -<span style="color: #da8548; font-weight: bold;">2</span>
|
|
|
|
<span style="color: #dcaeea;">img_bin</span> = cv2.adaptiveThreshold(
|
|
<span style="background-color: #282c34;"> </span> ~blurred,
|
|
<span style="background-color: #282c34;"> </span> MAX_COLOR_VAL,
|
|
<span style="background-color: #282c34;"> </span> cv2.ADAPTIVE_THRESH_MEAN_C,
|
|
<span style="background-color: #282c34;"> </span> cv2.THRESH_BINARY,
|
|
<span style="background-color: #282c34;"> </span> BLOCK_SIZE,
|
|
<span style="background-color: #282c34;"> </span> SUBTRACT_FROM_MEAN,
|
|
)
|
|
</pre>
|
|
</div>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python">&lt;&lt;threshold&gt;&gt;
|
|
cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-thresholded.png"</span>, img_bin)
|
|
</pre>
|
|
</div>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-table-thresholded.png" alt="example-table-thresholded.png" width="500px" height="100%" />
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-orgcf17042" class="outline-3">
|
|
<h3 id="orgcf17042"><span class="section-number-3">4.4</span> Finding the vertical and horizontal lines of the table</h3>
|
|
<div class="outline-text-3" id="text-4-4">
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #dcaeea;">vertical</span> = <span style="color: #dcaeea;">horizontal</span> = img_bin.copy()
|
|
<span style="color: #dcaeea;">SCALE</span> = <span style="color: #da8548; font-weight: bold;">5</span>
|
|
<span style="color: #dcaeea;">image_width</span>, <span style="color: #dcaeea;">image_height</span> = horizontal.shape
|
|
<span style="color: #dcaeea;">horizontal_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #c678dd;">int</span>(image_width / SCALE), <span style="color: #da8548; font-weight: bold;">1</span>))
|
|
<span style="color: #dcaeea;">horizontally_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
|
|
<span style="color: #dcaeea;">vertical_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #c678dd;">int</span>(image_height / SCALE)))
|
|
<span style="color: #dcaeea;">vertically_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
|
|
|
|
<span style="color: #dcaeea;">horizontally_dilated</span> = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">40</span>, <span style="color: #da8548; font-weight: bold;">1</span>)))
|
|
<span style="color: #dcaeea;">vertically_dilated</span> = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #da8548; font-weight: bold;">60</span>)))
|
|
|
|
<span style="color: #dcaeea;">mask</span> = horizontally_dilated + vertically_dilated
|
|
</pre>
|
|
</div>
|
|
|
|
<p>
|
|
Note: There’s a weird issue with the results of the example below when it’s
|
|
evaluated as part of an export or a full-buffer evaluation. If you evaluate the
|
|
example by itself, it looks the way it’s intended. If you evaluate it as part of
|
|
an entire buffer evaluation, like during export, it’s distorted.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python">&lt;&lt;lines-of-table&gt;&gt;
|
|
cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-lines.png"</span>, mask)
|
|
</pre>
|
|
</div>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-table-lines.png" alt="example-table-lines.png" width="500px" />
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org94f71b3" class="outline-3">
|
|
<h3 id="org94f71b3"><span class="section-number-3">4.5</span> Finding the contours</h3>
|
|
<div class="outline-text-3" id="text-4-5">
|
|
<p>
|
|
Blurring and thresholding allow us to find the lines. Opening the lines allows
|
|
us to find the contours.
|
|
</p>
|
|
|
|
<p>
|
|
An “Opening” is an erosion followed by a dilation. Great examples and
|
|
descriptions of each morphological operation can be found at
|
|
<a href="https://docs.opencv.org/trunk/d9/d61/tutorial_py_morphological_ops.html">https://docs.opencv.org/trunk/d9/d61/tutorial_py_morphological_ops.html</a>.
|
|
</p>
|
|
|
|
<blockquote>
|
|
<p>
|
|
Contours can be explained simply as a curve joining all the continuous points
|
|
(along the boundary), having same color or intensity. The contours are a useful
|
|
tool for shape analysis and object detection and recognition.
|
|
</p>
|
|
</blockquote>
|
|
|
|
<p>
|
|
We can search those contours to find rectangles of certain size.
|
|
</p>
|
|
|
|
<p>
|
|
To do that, we can use OpenCV’s <code>approxPolyDP</code> function. It takes as arguments
|
|
the contour (list of contiguous points), and a number representing how different
|
|
the polygon perimeter length can be from the true perimeter length of the
|
|
contour. <code>0.1</code> (10%) seems to be a good value. The difference in perimeter
|
|
length between a 4-sided polygon and a 3-sided polygon is greater than 10% and
|
|
the difference between a 5+ sided polygon and a 4-sided polygon is less than
|
|
10%. So a 4-sided polygon is the polygon with the fewest sides that leaves the
|
|
difference in perimeter length within our 10% threshold.
|
|
</p>
|
|
|
|
<p>
|
|
Then we just get the bounding rectangle of that polygon and we have our cells!
|
|
</p>
|
|
|
|
<p>
|
|
We might need to do a little more filtering of those rectangles though. We might
|
|
have accidentally found some noise such as another image on the page or a title
|
|
header bar or something. If we know our cells are all within a certain size (by
|
|
area of pixels) then we can filter out the junk cells by removing cells
|
|
above/below certain sizes.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python" id="orgf486a5a"><span style="color: #dcaeea;">contours</span>, <span style="color: #dcaeea;">heirarchy</span> = cv2.findContours(
|
|
<span style="background-color: #282c34;"> </span> mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
|
|
)
|
|
|
|
<span style="color: #dcaeea;">perimeter_lengths</span> = [cv2.arcLength(c, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> contours]
|
|
<span style="color: #dcaeea;">epsilons</span> = [<span style="color: #da8548; font-weight: bold;">0.05</span> * p <span style="color: #51afef;">for</span> p <span style="color: #51afef;">in</span> perimeter_lengths]
|
|
<span style="color: #dcaeea;">approx_polys</span> = [cv2.approxPolyDP(c, e, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c, e <span style="color: #51afef;">in</span> <span style="color: #c678dd;">zip</span>(contours, epsilons)]
|
|
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">Filter out contours that aren't rectangular. Those that aren't rectangular</span>
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">are probably noise.</span>
|
|
<span style="color: #dcaeea;">approx_rects</span> = [p <span style="color: #51afef;">for</span> p <span style="color: #51afef;">in</span> approx_polys <span style="color: #51afef;">if</span> <span style="color: #c678dd;">len</span>(p) == <span style="color: #da8548; font-weight: bold;">4</span>]
|
|
<span style="color: #dcaeea;">bounding_rects</span> = [cv2.boundingRect(a) <span style="color: #51afef;">for</span> a <span style="color: #51afef;">in</span> approx_polys]
|
|
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">Filter out rectangles that are too narrow or too short.</span>
|
|
<span style="color: #dcaeea;">MIN_RECT_WIDTH</span> = <span style="color: #da8548; font-weight: bold;">40</span>
|
|
<span style="color: #dcaeea;">MIN_RECT_HEIGHT</span> = <span style="color: #da8548; font-weight: bold;">10</span>
|
|
<span style="color: #dcaeea;">bounding_rects</span> = [
|
|
<span style="background-color: #282c34;"> </span> r <span style="color: #51afef;">for</span> r <span style="color: #51afef;">in</span> bounding_rects <span style="color: #51afef;">if</span> MIN_RECT_WIDTH < r[<span style="color: #da8548; font-weight: bold;">2</span>] <span style="color: #51afef;">and</span> MIN_RECT_HEIGHT < r[<span style="color: #da8548; font-weight: bold;">3</span>]
|
|
]
|
|
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">The largest bounding rectangle is assumed to be the entire table.</span>
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">Remove it from the list. We don't want to accidentally try to OCR</span>
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">the entire table.</span>
|
|
<span style="color: #dcaeea;">largest_rect</span> = <span style="color: #c678dd;">max</span>(bounding_rects, key=<span style="color: #51afef;">lambda</span> r: r[<span style="color: #da8548; font-weight: bold;">2</span>] * r[<span style="color: #da8548; font-weight: bold;">3</span>])
|
|
<span style="color: #dcaeea;">bounding_rects</span> = [b <span style="color: #51afef;">for</span> b <span style="color: #51afef;">in</span> bounding_rects <span style="color: #51afef;">if</span> b <span style="color: #51afef;">is</span> <span style="color: #51afef;">not</span> largest_rect]
|
|
|
|
<span style="color: #dcaeea;">cells</span> = [c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> bounding_rects]
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-orgc64b6ef" class="outline-3">
|
|
<h3 id="orgc64b6ef"><span class="section-number-3">4.6</span> Sorting the bounding rectangles</h3>
|
|
<div class="outline-text-3" id="text-4-6">
|
|
<p>
|
|
We want to process these from left-to-right, top-to-bottom.
|
|
</p>
|
|
|
|
<p>
|
|
I’ve thought of a straightforward algorithm for it, but it could probably be
|
|
made more efficient.
|
|
</p>
|
|
|
|
<p>
|
|
We’ll find the rectangle with the most top-left corner. Then we’ll find all
|
|
of the rectangles that have a center that is within the top-y and bottom-y
|
|
values of that top-left rectangle. Then we’ll sort those rectangles by the x
|
|
value of their center. We’ll remove those rectangles from the list and repeat.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python" id="org30980d9"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">cell_in_same_row</span>(c1, c2):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">c1_center</span> = c1[<span style="color: #da8548; font-weight: bold;">1</span>] + c1[<span style="color: #da8548; font-weight: bold;">3</span>] - c1[<span style="color: #da8548; font-weight: bold;">3</span>] / <span style="color: #da8548; font-weight: bold;">2</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">c2_bottom</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>] + c2[<span style="color: #da8548; font-weight: bold;">3</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">c2_top</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> c2_top < c1_center < c2_bottom
|
|
|
|
<span style="color: #dcaeea;">orig_cells</span> = [c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> cells]
|
|
<span style="color: #dcaeea;">rows</span> = []
|
|
<span style="color: #51afef;">while</span> cells:
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">first</span> = cells[<span style="color: #da8548; font-weight: bold;">0</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">rest</span> = cells[<span style="color: #da8548; font-weight: bold;">1</span>:]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cells_in_same_row</span> = <span style="color: #c678dd;">sorted</span>(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> [
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> rest
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> cell_in_same_row(c, first)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> ],
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> key=<span style="color: #51afef;">lambda</span> c: c[<span style="color: #da8548; font-weight: bold;">0</span>]
|
|
<span style="background-color: #282c34;"> </span> )
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">row_cells</span> = <span style="color: #c678dd;">sorted</span>([first] + cells_in_same_row, key=<span style="color: #51afef;">lambda</span> c: c[<span style="color: #da8548; font-weight: bold;">0</span>])
|
|
<span style="background-color: #282c34;"> </span> rows.append(row_cells)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cells</span> = [
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> rest
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> cell_in_same_row(c, first)
|
|
<span style="background-color: #282c34;"> </span> ]
|
|
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">Sort rows by average height of their center.</span>
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">avg_height_of_center</span>(row):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">centers</span> = [y + h - h / <span style="color: #da8548; font-weight: bold;">2</span> <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> row]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> <span style="color: #c678dd;">sum</span>(centers) / <span style="color: #c678dd;">len</span>(centers)
|
|
|
|
rows.sort(key=avg_height_of_center)
|
|
</pre>
|
|
</div>
|
|
|
|
<p>
|
|
To test if this code works, let’s try sorting the bounding rectangles and
|
|
numbering them from left to right, top to bottom.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">import</span> cv2
|
|
<span style="color: #dcaeea;">image</span> = cv2.imread(<span style="color: #98be65;">"resources/examples/example-table.png"</span>, cv2.IMREAD_GRAYSCALE)
|
|
<<blur>>
|
|
<<threshold>>
|
|
<<lines-of-table>>
|
|
<<bounding-rects>>
|
|
<<sort-contours>>
|
|
|
|
<span style="color: #dcaeea;">FONT_SCALE</span> = <span style="color: #da8548; font-weight: bold;">0.7</span>
|
|
<span style="color: #dcaeea;">FONT_COLOR</span> = (<span style="color: #da8548; font-weight: bold;">127</span>, <span style="color: #da8548; font-weight: bold;">127</span>, <span style="color: #da8548; font-weight: bold;">127</span>)
|
|
<span style="color: #51afef;">for</span> i, row <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(rows):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">for</span> j, cell <span style="color: #51afef;">in</span> <span style="color: #c678dd;">enumerate</span>(row):
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">x</span>, <span style="color: #dcaeea;">y</span>, <span style="color: #dcaeea;">w</span>, <span style="color: #dcaeea;">h</span> = cell
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.putText(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> image,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #98be65;">"{},{}"</span>.<span style="color: #c678dd;">format</span>(i, j),
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> (<span style="color: #c678dd;">int</span>(x + w - w / <span style="color: #da8548; font-weight: bold;">2</span>), <span style="color: #c678dd;">int</span>(y + h - h / <span style="color: #da8548; font-weight: bold;">2</span>)),
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.FONT_HERSHEY_SIMPLEX,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> FONT_SCALE,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> FONT_COLOR,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #da8548; font-weight: bold;">2</span>,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> )
|
|
cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-cells-numbered.png"</span>, image)
|
|
</pre>
|
|
</div>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-table-cells-numbered.png" alt="example-table-cells-numbered.png" width="500px" height="100%" />
|
|
</p>
|
|
</div>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python" id="org74e59e6"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">extract_cell_images_from_table</span>(image):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">BLUR_KERNEL_SIZE</span> = (<span style="color: #da8548; font-weight: bold;">17</span>, <span style="color: #da8548; font-weight: bold;">17</span>)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">STD_DEV_X_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">STD_DEV_Y_DIRECTION</span> = <span style="color: #da8548; font-weight: bold;">0</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">blurred</span> = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">MAX_COLOR_VAL</span> = <span style="color: #da8548; font-weight: bold;">255</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">BLOCK_SIZE</span> = <span style="color: #da8548; font-weight: bold;">15</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">SUBTRACT_FROM_MEAN</span> = -<span style="color: #da8548; font-weight: bold;">2</span>
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">img_bin</span> = cv2.adaptiveThreshold(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> ~blurred,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> MAX_COLOR_VAL,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.ADAPTIVE_THRESH_MEAN_C,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.THRESH_BINARY,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> BLOCK_SIZE,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> SUBTRACT_FROM_MEAN,
|
|
<span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertical</span> = <span style="color: #dcaeea;">horizontal</span> = img_bin.copy()
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">SCALE</span> = <span style="color: #da8548; font-weight: bold;">5</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">image_width</span>, <span style="color: #dcaeea;">image_height</span> = horizontal.shape
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontal_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #c678dd;">int</span>(image_width / SCALE), <span style="color: #da8548; font-weight: bold;">1</span>))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontally_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertical_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #c678dd;">int</span>(image_height / SCALE)))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertically_opened</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontally_dilated</span> = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">40</span>, <span style="color: #da8548; font-weight: bold;">1</span>)))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertically_dilated</span> = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #da8548; font-weight: bold;">60</span>)))
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">mask</span> = horizontally_dilated + vertically_dilated
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">contours</span>, <span style="color: #dcaeea;">heirarchy</span> = cv2.findContours(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
|
|
<span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">perimeter_lengths</span> = [cv2.arcLength(c, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> contours]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">epsilons</span> = [<span style="color: #da8548; font-weight: bold;">0.05</span> * p <span style="color: #51afef;">for</span> p <span style="color: #51afef;">in</span> perimeter_lengths]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">approx_polys</span> = [cv2.approxPolyDP(c, e, <span style="color: #a9a1e1;">True</span>) <span style="color: #51afef;">for</span> c, e <span style="color: #51afef;">in</span> <span style="color: #c678dd;">zip</span>(contours, epsilons)]
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Filter out contours that aren't rectangular. Those that aren't rectangular</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">are probably noise.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">approx_rects</span> = [p <span style="color: #51afef;">for</span> p <span style="color: #51afef;">in</span> approx_polys <span style="color: #51afef;">if</span> <span style="color: #c678dd;">len</span>(p) == <span style="color: #da8548; font-weight: bold;">4</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">bounding_rects</span> = [cv2.boundingRect(a) <span style="color: #51afef;">for</span> a <span style="color: #51afef;">in</span> approx_polys]
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Filter out rectangles that are too narrow or too short.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">MIN_RECT_WIDTH</span> = <span style="color: #da8548; font-weight: bold;">40</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">MIN_RECT_HEIGHT</span> = <span style="color: #da8548; font-weight: bold;">10</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">bounding_rects</span> = [
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> r <span style="color: #51afef;">for</span> r <span style="color: #51afef;">in</span> bounding_rects <span style="color: #51afef;">if</span> MIN_RECT_WIDTH < r[<span style="color: #da8548; font-weight: bold;">2</span>] <span style="color: #51afef;">and</span> MIN_RECT_HEIGHT < r[<span style="color: #da8548; font-weight: bold;">3</span>]
|
|
<span style="background-color: #282c34;"> </span> ]
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">The largest bounding rectangle is assumed to be the entire table.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Remove it from the list. We don't want to accidentally try to OCR</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">the entire table.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">largest_rect</span> = <span style="color: #c678dd;">max</span>(bounding_rects, key=<span style="color: #51afef;">lambda</span> r: r[<span style="color: #da8548; font-weight: bold;">2</span>] * r[<span style="color: #da8548; font-weight: bold;">3</span>])
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">bounding_rects</span> = [b <span style="color: #51afef;">for</span> b <span style="color: #51afef;">in</span> bounding_rects <span style="color: #51afef;">if</span> b <span style="color: #51afef;">is</span> <span style="color: #51afef;">not</span> largest_rect]
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cells</span> = [c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> bounding_rects]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">def</span> <span style="color: #c678dd;">cell_in_same_row</span>(c1, c2):
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">c1_center</span> = c1[<span style="color: #da8548; font-weight: bold;">1</span>] + c1[<span style="color: #da8548; font-weight: bold;">3</span>] - c1[<span style="color: #da8548; font-weight: bold;">3</span>] / <span style="color: #da8548; font-weight: bold;">2</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">c2_bottom</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>] + c2[<span style="color: #da8548; font-weight: bold;">3</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">c2_top</span> = c2[<span style="color: #da8548; font-weight: bold;">1</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> c2_top < c1_center < c2_bottom
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">orig_cells</span> = [c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> cells]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">rows</span> = []
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">while</span> cells:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">first</span> = cells[<span style="color: #da8548; font-weight: bold;">0</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">rest</span> = cells[<span style="color: #da8548; font-weight: bold;">1</span>:]
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cells_in_same_row</span> = <span style="color: #c678dd;">sorted</span>(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> [
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> rest
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> cell_in_same_row(c, first)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> ],
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> key=<span style="color: #51afef;">lambda</span> c: c[<span style="color: #da8548; font-weight: bold;">0</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">row_cells</span> = <span style="color: #c678dd;">sorted</span>([first] + cells_in_same_row, key=<span style="color: #51afef;">lambda</span> c: c[<span style="color: #da8548; font-weight: bold;">0</span>])
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> rows.append(row_cells)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cells</span> = [
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> c <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> rest
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> cell_in_same_row(c, first)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> ]
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Sort rows by average height of their center.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">def</span> <span style="color: #c678dd;">avg_height_of_center</span>(row):
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">centers</span> = [y + h - h / <span style="color: #da8548; font-weight: bold;">2</span> <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> row]
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> <span style="color: #c678dd;">sum</span>(centers) / <span style="color: #c678dd;">len</span>(centers)
|
|
<span style="background-color: #282c34;"> </span>
|
|
<span style="background-color: #282c34;"> </span> rows.sort(key=avg_height_of_center)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cell_images_rows</span> = []
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">for</span> row <span style="color: #51afef;">in</span> rows:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cell_images_row</span> = []
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> row:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cell_images_row.append(image[y:y+h, x:x+w])
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cell_images_rows.append(cell_images_row)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> cell_images_rows
|
|
</pre>
|
|
</div>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><<extract-cells-<span style="color: #51afef;">from</span>-table>>
|
|
<span style="color: #dcaeea;">image</span> = cv2.imread(<span style="color: #98be65;">"resources/examples/example-table.png"</span>, cv2.IMREAD_GRAYSCALE)
|
|
<span style="color: #dcaeea;">cell_images_rows</span> = extract_cell_images_from_table(image)
|
|
cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-cell-1-1.png"</span>, cell_images_rows[<span style="color: #da8548; font-weight: bold;">1</span>][<span style="color: #da8548; font-weight: bold;">1</span>])
|
|
</pre>
|
|
</div>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-table-cell-1-1.png" alt="example-table-cell-1-1.png" width="200px" height="100%" />
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-orgd4dc4cc" class="outline-3">
|
|
<h3 id="orgd4dc4cc"><span class="section-number-3">4.7</span> Cropping each cell to the text</h3>
|
|
<div class="outline-text-3" id="text-4-7">
|
|
<p>
|
|
OCR with Tesseract works best when there are about 10 pixels of white border
|
|
around the text.
|
|
</p>
|
|
|
|
<p>
|
|
Our bounding rectangles may have picked up some stray pixels from the horizontal
|
|
and vertical lines of the cells in the table. It’s probably just a few pixels,
|
|
much fewer than the width of the text. If that’s the case, then we can remove
|
|
that noise with a simple open morph.
|
|
</p>
|
|
|
|
<p>
|
|
Once the stray border pixels have been removed, we can expand our border using
|
|
<code>copyMakeBorder</code>.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">crop_to_text</span>(image):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">MAX_COLOR_VAL</span> = <span style="color: #da8548; font-weight: bold;">255</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">BLOCK_SIZE</span> = <span style="color: #da8548; font-weight: bold;">15</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">SUBTRACT_FROM_MEAN</span> = -<span style="color: #da8548; font-weight: bold;">2</span>
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">img_bin</span> = cv2.adaptiveThreshold(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> ~image,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> MAX_COLOR_VAL,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.ADAPTIVE_THRESH_MEAN_C,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> cv2.THRESH_BINARY,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> BLOCK_SIZE,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> SUBTRACT_FROM_MEAN,
|
|
<span style="background-color: #282c34;"> </span> )
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">img_h</span>, <span style="color: #dcaeea;">img_w</span> = image.shape
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontal_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #c678dd;">int</span>(img_w * <span style="color: #da8548; font-weight: bold;">0.5</span>), <span style="color: #da8548; font-weight: bold;">1</span>))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertical_kernel</span> = cv2.getStructuringElement(cv2.MORPH_RECT, (<span style="color: #da8548; font-weight: bold;">1</span>, <span style="color: #c678dd;">int</span>(img_h * <span style="color: #da8548; font-weight: bold;">0.7</span>)))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">horizontal_lines</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">vertical_lines</span> = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">both</span> = horizontal_lines + vertical_lines
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cleaned</span> = img_bin - both
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Get rid of little noise.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">kernel</span> = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (<span style="color: #da8548; font-weight: bold;">3</span>, <span style="color: #da8548; font-weight: bold;">3</span>))
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">opened</span> = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">opened</span> = cv2.dilate(opened, kernel)
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">contours</span>, <span style="color: #dcaeea;">hierarchy</span> = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">bounding_rects</span> = [cv2.boundingRect(c) <span style="color: #51afef;">for</span> c <span style="color: #51afef;">in</span> contours]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">NUM_PX_COMMA</span> = <span style="color: #da8548; font-weight: bold;">6</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">MIN_CHAR_AREA</span> = <span style="color: #da8548; font-weight: bold;">5</span> * <span style="color: #da8548; font-weight: bold;">9</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">char_sized_bounding_rects</span> = [(x, y, w, h) <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> bounding_rects <span style="color: #51afef;">if</span> w * h > MIN_CHAR_AREA]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> char_sized_bounding_rects:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">minx</span>, <span style="color: #dcaeea;">miny</span>, <span style="color: #dcaeea;">maxx</span>, <span style="color: #dcaeea;">maxy</span> = math.inf, math.inf, <span style="color: #da8548; font-weight: bold;">0</span>, <span style="color: #da8548; font-weight: bold;">0</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">for</span> x, y, w, h <span style="color: #51afef;">in</span> char_sized_bounding_rects:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">minx</span> = <span style="color: #c678dd;">min</span>(minx, x)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">miny</span> = <span style="color: #c678dd;">min</span>(miny, y)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">maxx</span> = <span style="color: #c678dd;">max</span>(maxx, x + w)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">maxy</span> = <span style="color: #c678dd;">max</span>(maxy, y + h)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">x</span>, <span style="color: #dcaeea;">y</span>, <span style="color: #dcaeea;">w</span>, <span style="color: #dcaeea;">h</span> = minx, miny, maxx - minx, maxy - miny
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cropped</span> = image[y:<span style="color: #c678dd;">min</span>(img_h, y+h+NUM_PX_COMMA), x:<span style="color: #c678dd;">min</span>(img_w, x+w)]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">else</span>:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">If we morphed out all of the text, assume an empty image.</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">cropped</span> = MAX_COLOR_VAL * np.ones(shape=(<span style="color: #da8548; font-weight: bold;">20</span>, <span style="color: #da8548; font-weight: bold;">100</span>), dtype=np.uint8)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">bordered</span> = cv2.copyMakeBorder(cropped, <span style="color: #da8548; font-weight: bold;">5</span>, <span style="color: #da8548; font-weight: bold;">5</span>, <span style="color: #da8548; font-weight: bold;">5</span>, <span style="color: #da8548; font-weight: bold;">5</span>, cv2.BORDER_CONSTANT, <span style="color: #a9a1e1;">None</span>, <span style="color: #da8548; font-weight: bold;">255</span>)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> bordered
|
|
</pre>
|
|
</div>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">import</span> cv2
|
|
<span style="color: #51afef;">import</span> numpy <span style="color: #51afef;">as</span> np
|
|
&lt;&lt;crop-to-text&gt;&gt;
|
|
<span style="color: #dcaeea;">image</span> = cv2.imread(<span style="color: #98be65;">"resources/examples/example-table-cell-1-1.png"</span>, cv2.IMREAD_GRAYSCALE)
|
|
<span style="color: #dcaeea;">image</span> = crop_to_text(image)
|
|
cv2.imwrite(<span style="color: #98be65;">"resources/examples/example-table-cell-1-1-cropped.png"</span>, image)
|
|
</pre>
|
|
</div>
|
|
|
|
|
|
<div class="figure">
|
|
<p><img src="resources/examples/example-table-cell-1-1-cropped.png" alt="Example table cell after cropping to its text" width="200" height="100%" />
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org22a3e7b" class="outline-3">
|
|
<h3 id="org22a3e7b"><span class="section-number-3">4.8</span> OCR each cell</h3>
|
|
<div class="outline-text-3" id="text-4-8">
|
|
<p>
|
|
If we cleaned up the images well enough, we might get some accurate OCR!
|
|
</p>
|
|
|
|
<p>
|
|
There is plenty that could have gone wrong along the way.
|
|
</p>
|
|
|
|
<p>
|
|
The first step to troubleshooting is to view the intermediate images and see if
|
|
there’s something about your image that is obviously abnormal, like some really
|
|
thick noise or a wrongly detected table.
|
|
</p>
|
|
|
|
<p>
|
|
If everything looks reasonable but the OCR is doing something like turning a
|
|
period into a comma, then you might need to do some custom Tesseract training.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">def</span> <span style="color: #c678dd;">ocr_image</span>(image, config):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> pytesseract.image_to_string(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> image,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> config=config
|
|
<span style="background-color: #282c34;"> </span> )
|
|
</pre>
|
|
</div>
|
|
|
|
<p>
|
|
The second argument passed to <code>ocr_image</code> is a string of the command line arguments passed directly to <code>tesseract</code>. You can view the available options at <a href="https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#options">https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#options</a>
|
|
</p>
|
|
|
|
<p>
|
|
If no options are passed to <code>tesseract</code>, then the language defaults to English. This means <code>tesseract</code> needs to be able to find a file named <code>eng.traineddata</code> on whatever path it searches for languages.
|
|
</p>
|
|
|
|
<p>
|
|
This python package comes with <code>eng.traineddata</code> and <code>table-ocr.traineddata</code>. <code>table-ocr.traineddata</code> is a personal model that I’ve found to be more accurate for my use case. You should train your own to maximize accuracy.
|
|
</p>
|
|
|
|
<p>
|
|
When you <code>pip install</code> this package, the traineddata gets copied to a <code>tessdata</code> folder in the same directory in which <code>pip</code> installs the package.
|
|
</p>
|
|
|
|
<p>
|
|
The <code>ocr_image</code> package in this repo defaults to pointing the <code>--tessdata-dir</code> option at the package’s <code>tessdata</code> directory in the package install location and setting the <code>-l</code> option to the <code>table-ocr</code> language.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">import</span> pytesseract
|
|
<span style="color: #51afef;">import</span> cv2
|
|
<span style="color: #51afef;">import</span> numpy <span style="color: #51afef;">as</span> np
|
|
<span style="color: #51afef;">import</span> math
|
|
<span style="color: #dcaeea;">image</span> = cv2.imread(<span style="color: #98be65;">"resources/examples/example-table-cell-1-1.png"</span>, cv2.IMREAD_GRAYSCALE)
|
|
&lt;&lt;crop-to-text&gt;&gt;
|
|
&lt;&lt;ocr-image&gt;&gt;
|
|
<span style="color: #dcaeea;">image</span> = crop_to_text(image)
|
|
ocr_image(image, <span style="color: #98be65;">"--psm 7"</span>)
|
|
</pre>
|
|
</div>
|
|
|
|
<pre class="example">
|
|
9.09
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org9d0b21d" class="outline-2">
|
|
<h2 id="org9d0b21d"><span class="section-number-2">5</span> Files</h2>
|
|
<div class="outline-text-2" id="text-5">
|
|
<div class="org-src-container">
|
|
<pre class="src src-python">
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-orgd57e56a" class="outline-3">
|
|
<h3 id="orgd57e56a"><span class="section-number-3">5.1</span> setup.py</h3>
|
|
<div class="outline-text-3" id="text-5-1">
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">import</span> setuptools
|
|
|
|
<span style="color: #dcaeea;">long_description</span> = <span style="color: #98be65;">"""</span>
|
|
<span style="color: #98be65;">Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.</span>
|
|
|
|
<span style="color: #98be65;">Requires binaries for tesseract, ImageMagick, and pdfimages (from Poppler).</span>
|
|
<span style="color: #98be65;">"""</span>
|
|
setuptools.setup(
|
|
<span style="background-color: #282c34;"> </span> name=<span style="color: #98be65;">"table_ocr"</span>,
|
|
<span style="background-color: #282c34;"> </span> version=<span style="color: #98be65;">"0.2.0"</span>,
|
|
<span style="background-color: #282c34;"> </span> author=<span style="color: #98be65;">"Eric Ihli"</span>,
|
|
<span style="background-color: #282c34;"> </span> author_email=<span style="color: #98be65;">"eihli@owoga.com"</span>,
|
|
<span style="background-color: #282c34;"> </span> description=<span style="color: #98be65;">"Extract text from tables in images."</span>,
|
|
<span style="background-color: #282c34;"> </span> long_description=long_description,
|
|
<span style="background-color: #282c34;"> </span> long_description_content_type=<span style="color: #98be65;">"text/plain"</span>,
|
|
<span style="background-color: #282c34;"> </span> url=<span style="color: #98be65;">"https://github.com/eihli/image-table-ocr"</span>,
|
|
<span style="background-color: #282c34;"> </span> packages=setuptools.find_packages(),
|
|
<span style="background-color: #282c34;"> </span> package_data={
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #98be65;">"table_ocr"</span>: [<span style="color: #98be65;">"tessdata/table-ocr.traineddata"</span>, <span style="color: #98be65;">"tessdata/eng.traineddata"</span>]
|
|
<span style="background-color: #282c34;"> </span> },
|
|
<span style="background-color: #282c34;"> </span> classifiers=[
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #98be65;">"Programming Language :: Python :: 3"</span>,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #98be65;">"License :: OSI Approved :: MIT License"</span>,
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #98be65;">"Operating System :: OS Independent"</span>,
|
|
<span style="background-color: #282c34;"> </span> ],
|
|
<span style="background-color: #282c34;"> </span> install_requires=[<span style="color: #98be65;">"pytesseract~=0.3"</span>, <span style="color: #98be65;">"opencv-python~=4.2"</span>,],
|
|
<span style="background-color: #282c34;"> </span> python_requires=<span style="color: #98be65;">">=3.6"</span>,
|
|
)
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org4b36161" class="outline-3">
|
|
<h3 id="org4b36161"><span class="section-number-3">5.2</span> table_ocr</h3>
|
|
<div class="outline-text-3" id="text-5-2">
|
|
</div>
|
|
<div id="outline-container-orgbdd2fc0" class="outline-4">
|
|
<h4 id="orgbdd2fc0"><span class="section-number-4">5.2.1</span> table_ocr/__init__.py</h4>
|
|
</div>
|
|
<div id="outline-container-org09e5a07" class="outline-4">
|
|
<h4 id="org09e5a07"><span class="section-number-4">5.2.2</span> table_ocr/util.py</h4>
|
|
<div class="outline-text-4" id="text-5-2-2">
|
|
<div class="org-src-container">
|
|
<pre class="src src-python"><span style="color: #51afef;">from</span> contextlib <span style="color: #51afef;">import</span> contextmanager
|
|
<span style="color: #51afef;">import</span> functools
|
|
<span style="color: #51afef;">import</span> logging
|
|
<span style="color: #51afef;">import</span> os
|
|
<span style="color: #51afef;">import</span> tempfile
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_logger</span>(name):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">logger</span> = logging.getLogger(name)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">lvl</span> = os.environ.get(<span style="color: #98be65;">"PY_LOG_LVL"</span>, <span style="color: #98be65;">"info"</span>).upper()
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">handler</span> = logging.StreamHandler()
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">formatter</span> = logging.Formatter(logging.BASIC_FORMAT)
|
|
<span style="background-color: #282c34;"> </span> handler.setFormatter(formatter)
|
|
<span style="background-color: #282c34;"> </span> logger.addHandler(handler)
|
|
<span style="background-color: #282c34;"> </span> handler.setLevel(lvl)
|
|
<span style="background-color: #282c34;"> </span> logger.setLevel(lvl)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> logger
|
|
|
|
<span style="color: #ECBE7B;">@contextmanager</span>
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">working_dir</span>(directory):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">original_working_dir</span> = os.getcwd()
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">try</span>:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> os.chdir(directory)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">yield</span> directory
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">finally</span>:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> os.chdir(original_working_dir)
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">make_tempdir</span>(identifier):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> tempfile.mkdtemp(prefix=<span style="color: #98be65;">"{}_"</span>.<span style="color: #c678dd;">format</span>(identifier))
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org5a371bd" class="outline-4">
|
|
<h4 id="org5a371bd"><span class="section-number-4">5.2.3</span> table_ocr/pdf_to_images/</h4>
|
|
<div class="outline-text-4" id="text-5-2-3">
|
|
</div>
|
|
<div id="outline-container-orgd777fae" class="outline-5">
|
|
<h5 id="orgd777fae"><span class="section-number-5">5.2.3.1</span> table_ocr/pdf_to_images/__init__.py</h5>
|
|
<div class="outline-text-5" id="text-5-2-3-1">
|
|
<div class="org-src-container">
|
|
<pre class="src src-python" id="orgdf64015"><span style="color: #51afef;">import</span> os
|
|
<span style="color: #51afef;">import</span> re
|
|
<span style="color: #51afef;">import</span> subprocess
|
|
|
|
<span style="color: #51afef;">from</span> table_ocr.util <span style="color: #51afef;">import</span> get_logger, working_dir
|
|
|
|
<span style="color: #dcaeea;">logger</span> = get_logger(<span style="color: #c678dd;">__name__</span>)
|
|
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">Wrapper around the Poppler command line utility "pdfimages" and helpers for</span>
|
|
<span style="color: #5B6268;"># </span><span style="color: #5B6268;">finding the output files of that command.</span>
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdf_to_images</span>(pdf_filepath):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Turn a pdf into images</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Returns the filenames of the created images sorted lexicographically.</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">image_filenames</span> = pdfimages(pdf_filepath)
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">Since pdfimages creates a number of files named each for their page number</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">and doesn't return us the list that it created</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> <span style="color: #c678dd;">sorted</span>([os.path.join(directory, f) <span style="color: #51afef;">for</span> f <span style="color: #51afef;">in</span> image_filenames])
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">pdfimages</span>(pdf_filepath):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Uses the `pdfimages` utility from Poppler</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> (https://poppler.freedesktop.org/). Creates images out of each page. Images</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> are prefixed by their name sans extension and suffixed by their page number.</span>
|
|
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> This should work up to pdfs with 999 pages since find matching files in dir</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> uses 3 digits in its regex.</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">directory</span>, <span style="color: #dcaeea;">filename</span> = os.path.split(pdf_filepath)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> <span style="color: #51afef;">not</span> os.path.isabs(directory):
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">directory</span> = os.path.abspath(directory)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">filename_sans_ext</span> = filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #5B6268;"># </span><span style="color: #5B6268;">pdfimages outputs results to the current working directory</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">with</span> working_dir(directory):
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> subprocess.run([<span style="color: #98be65;">"pdfimages"</span>, <span style="color: #98be65;">"-png"</span>, filename, filename.split(<span style="color: #98be65;">".pdf"</span>)[<span style="color: #da8548; font-weight: bold;">0</span>]])
|
|
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">image_filenames</span> = find_matching_files_in_dir(filename_sans_ext, directory)
|
|
<span style="background-color: #282c34;"> </span> logger.debug(
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #98be65;">"Converted {} into files:\n{}"</span>.<span style="color: #c678dd;">format</span>(pdf_filepath, <span style="color: #98be65;">"\n"</span>.join(image_filenames))
|
|
<span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> image_filenames
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">find_matching_files_in_dir</span>(file_prefix, directory):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">files</span> = [
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> filename
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">for</span> filename <span style="color: #51afef;">in</span> os.listdir(directory)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> re.match(r<span style="color: #98be65;">"{}-\d{{3}}.*\.png"</span>.<span style="color: #c678dd;">format</span>(re.escape(file_prefix)), filename)
|
|
<span style="background-color: #282c34;"> </span> ]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> files
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">preprocess_img</span>(filepath, tess_params=<span style="color: #a9a1e1;">None</span>):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""Processing that involves running shell executables,</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> like mogrify to rotate.</span>
|
|
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Uses tesseract to detect rotation.</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> </span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> Orientation and script detection is only available for legacy tesseract</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> (--oem 0). Some versions of tesseract will segfault if you let it run OSD</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> with the default oem (3).</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">if</span> tess_params <span style="color: #51afef;">is</span> <span style="color: #a9a1e1;">None</span>:
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">tess_params</span> = [<span style="color: #98be65;">"--psm"</span>, <span style="color: #98be65;">"0"</span>, <span style="color: #98be65;">"--oem"</span>, <span style="color: #98be65;">"0"</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">rotate</span> = get_rotate(filepath, tess_params)
|
|
<span style="background-color: #282c34;"> </span> logger.debug(<span style="color: #98be65;">"Rotating {} by {}."</span>.<span style="color: #c678dd;">format</span>(filepath, rotate))
|
|
<span style="background-color: #282c34;"> </span> mogrify(filepath, rotate)
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">get_rotate</span>(image_filepath, tess_params):
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #83898d;">"""</span>
|
|
<span style="color: #83898d; background-color: #282c34;"> </span><span style="color: #83898d;"> """</span>
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">tess_command</span> = [<span style="color: #98be65;">"tesseract"</span>] + tess_params + [image_filepath, <span style="color: #98be65;">"-"</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">output</span> = (
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> subprocess.check_output(tess_command)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> .decode(<span style="color: #98be65;">"utf-8"</span>)
|
|
<span style="background-color: #282c34;"> </span> <span style="background-color: #282c34;"> </span> .split(<span style="color: #98be65;">"\n"</span>)
|
|
<span style="background-color: #282c34;"> </span> )
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">output</span> = <span style="color: #c678dd;">next</span>(l <span style="color: #51afef;">for</span> l <span style="color: #51afef;">in</span> output <span style="color: #51afef;">if</span> <span style="color: #98be65;">"Rotate: "</span> <span style="color: #51afef;">in</span> l)
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #dcaeea;">output</span> = output.split(<span style="color: #98be65;">": "</span>)[<span style="color: #da8548; font-weight: bold;">1</span>]
|
|
<span style="background-color: #282c34;"> </span> <span style="color: #51afef;">return</span> output
|
|
|
|
|
|
<span style="color: #51afef;">def</span> <span style="color: #c678dd;">mogrify</span>(image_filepath, rotate):
|
|
<span style="background-color: #282c34;"> </span> subprocess.run([<span style="color: #98be65;">"mogrify"</span>, <span style="color: #98be65;">"-rotate"</span>, rotate, image_filepath])
|
|
</pre>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div id="outline-container-org0064754" class="outline-5">
|
|
<h5 id="org0064754"><span class="section-number-5">5.2.3.2</span> table_ocr/pdf_to_images/__main__.py</h5>
|
|
<div class="outline-text-5" id="text-5-2-3-2">
|
|
<p>
|
|
Takes a variable number of pdf files and creates images out of each page of the
|
|
file using <code>pdfimages</code> from Poppler. Images are created in the same directory
|
|
that contains the pdf.
|
|
</p>
|
|
|
|
<p>
|
|
Prints each pdf followed by the images extracted from that pdf followed by a
|
|
blank line.
|
|
</p>
|
|
|
|
<div class="org-src-container">
|
|
<pre class="src src-shell">python -m table_ocr.pdf_to_images /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
|