From 7e5516eb5da7f19ff625b4266be6f9932fee12cc Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Sat, 17 Oct 2020 05:36:40 -0700 Subject: [PATCH] Update README in setup.py --- README.md | 60 ++++++++++++++++++++++++++++++++ pdf_table_extraction_and_ocr.org | 8 ++--- setup.py | 8 ++--- 3 files changed, 68 insertions(+), 8 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..043edfe --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ + +# Table of Contents + + + +This Python package contains modules to help with finding and extracting tabular +data from a PDF or image into a CSV format. + +Given an image that contains a table… + +![img](resources/examples/example-page.png) + +Extract the text into a CSV format… + + PRIZE,ODDS 1 IN:,# OF WINNERS* + $3,9.09,"282,447" + $5,16.66,"154,097" + $7,40.01,"64,169" + $10,26.67,"96,283" + $20,100.00,"25,677" + $30,290.83,"8,829" + $50,239.66,"10,714" + $100,919.66,"2,792" + $500,"6,652.07",386 + "$40,000","855,899.99",3 + 1,i223, + Toa,, + ,, + ,,"* Based upon 2,567,700" + +The package is split into modules with narrow focuses. + +- `pdf_to_images` uses Poppler and ImageMagick to extract images from a PDF. +- `extract_tables` finds and extracts table-looking things from an image. +- `extract_cells` extracts and orders cells from a table. +- `ocr_image` uses Tesseract to OCR the text from an image of a cell. +- `ocr_to_csv` converts into a CSV the directory structure that `ocr_image` outputs. + +The outputs of a previous module can be used by a subsequent module so that they +can be chained together to create the entire workflow, as demonstrated by the +following shell script. 
+ + #!/bin/sh + + PDF=$1 + + python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt + cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} | grep table > /tmp/extracted-tables.txt + cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt + cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} + + for image in $(cat /tmp/extracted-tables.txt); do + dir=$(dirname $image) + python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt") + done + +The package was written in a [literate programming](https://en.wikipedia.org/wiki/Literate_programming) style. The source code at +`pdf_table_extraction_and_ocr.org` is +meant to act as the documentation and reference material. + diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index a86c5f4..7b5e966 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -785,13 +785,13 @@ ocr_image(image, "--psm 7") ** setup.py #+BEGIN_SRC python :tangle setup.py :results none +import os import setuptools -long_description = """ -Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV. +this_dir = os.path.abspath(os.path.dirname(__file__)) +with open(os.path.join(this_dir, "README.md"), encoding="utf-8") as f: + long_description = f.read() -Requires binaries for tesseract, ImageMagick, and pdfimages (from Poppler). 
-""" setuptools.setup( name="table_ocr", version="0.2.1",