From 0420f97bd625da30b3af03c8b1225a8d5f6ae4de Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Sat, 25 Apr 2020 12:21:10 -0700 Subject: [PATCH] Update exported html --- pdf_table_extraction_and_ocr.html | 1219 +++++++++++++---- .../example-table-cell-1-1-cropped.png | Bin 1067 -> 1129 bytes 2 files changed, 974 insertions(+), 245 deletions(-) diff --git a/pdf_table_extraction_and_ocr.html b/pdf_table_extraction_and_ocr.html index 703f770..be38955 100644 --- a/pdf_table_extraction_and_ocr.html +++ b/pdf_table_extraction_and_ocr.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - + PDF Parsing @@ -225,66 +225,254 @@

Table of Contents

-
-

1 Preparing our data

+
+

1 Overview

+

+This Python package provides utilities for extracting tabular data from PDF +files and images of tables. +

+ +

+Given an image that contains a table… +

+ + +
+

example-page.png +

+
+ +

+Extract the text into a CSV format…
+

+ +
+PRIZE,ODDS 1 IN:,# OF WINNERS*
+$3,9.09,"282,447"
+$5,16.66,"154,097"
+$7,40.01,"64,169"
+$10,26.67,"96,283"
+$20,100.00,"25,677"
+$30,290.83,"8,829"
+$50,239.66,"10,714"
+$100,919.66,"2,792"
+$500,"6,652.07",386
+"$40,000","855,899.99",3
+1,i223,
+Toa,,
+,,
+,,"* Based upon 2,567,700"
+
+ +

+The package is split into modules with narrow focuses. +

+ +
  • pdf_to_images uses Poppler and ImageMagick to extract images from a PDF.
  • extract_tables finds and extracts table-looking things from an image.
  • extract_cells extracts and orders cells from a table.
  • ocr_image uses Tesseract to OCR the text from an image of a cell.
  • ocr_to_csv converts into a CSV the directory structure that ocr_image outputs.
-
-

1.1 Converting PDFs to images

+ +
+

1.1 Requirements

+
+
+

1.1.1 Python packages

+
+
  • numpy
  • opencv-python
  • pytesseract
+
+
+ +
+

1.1.2 External

+
+
  • pdfimages from Poppler
  • Tesseract
  • mogrify from ImageMagick
+
+
+
+ +
+

1.2 Contributing

+
+

+This package was created in a literate programming style with the help of Babel. +

+ +

+The unfortunate downside is the obscurity of the tooling. It creates a bit of a +barrier for contributors who aren’t already familiar with Emacs and Babel. +

+
+
+ +
+

1.3 Example usage

+
+

+Here is an example of a shell script that uses each module to turn a pdf with a +table into CSV output. +

+ +

+Depending on your needs, you may not need all of these steps. If you already
+have an image of a table, you can jump straight to extracting the cells.
+

+ +

+Each piece is its own Python module, so you can also simply import the pieces
+you need into your own Python projects and use them as needed. A rough sketch
+of that programmatic route follows the shell script below.
+

+ +
+
#!/bin/sh
+
+PDF=$1
+
+python -m table_ocr.pdf_to_images $PDF | grep .png > /tmp/pdf-images.txt
+cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {}  | grep table > /tmp/extracted-tables.txt
+cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells {} | grep cells > /tmp/extracted-cells.txt
+cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l table-ocr
+
+for image in $(cat /tmp/extracted-tables.txt); do
+    dir=$(dirname $image)
+    python -m table_ocr.ocr_to_csv $(find $dir/cells -name "*.txt")
+done
+
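As a rough sketch of the programmatic route mentioned above (illustrative only,
not part of the package: it assumes the example image from the repo and that at
least one table is detected), the same pipeline can be driven from Python:

import cv2

from table_ocr.extract_tables import find_tables
from table_ocr.extract_cells import extract_cell_images_from_table
from table_ocr.ocr_image import crop_to_text, ocr_image

# Find table-looking regions on a page image.
page = cv2.imread("resources/examples/example-page.png", cv2.IMREAD_GRAYSCALE)
tables = find_tables(page)

# Split the first table into rows of cell images, then OCR each cell.
for row in extract_cell_images_from_table(tables[0]):
    print(",".join(ocr_image(crop_to_text(cell), "--psm 7").strip() for cell in row))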
+
+
+
+ +
+

1.4 Possible improvements

+
+

+Detect text with the stroke-width-transform algorithm. https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html
+

+
+
+
+ +
+

2 Preparing data

+

Not all pdfs need to be sent through OCR to extract the text content. If you can click and drag to highlight text in the pdf, then the tools in this library probably aren’t necessary.

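One quick way to check from Python, as a sketch (has_text_layer is a
hypothetical helper, and it assumes Poppler’s pdftotext binary is installed;
the package doesn’t otherwise require it): if pdftotext recovers non-empty
text, the pdf likely has an embedded text layer and OCR may be unnecessary.

import subprocess

def has_text_layer(pdf_filepath):
    # "-" tells pdftotext to write the extracted text to stdout.
    result = subprocess.run(["pdftotext", pdf_filepath, "-"], stdout=subprocess.PIPE)
    return bool(result.stdout.strip())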
+
+
+

2.1 Converting PDFs to images

+

This code calls out to pdfimages from Poppler.

-
def pdf_to_images(pdf_filepath):
+
# Wrapper around the Poppler command line utility "pdfimages" and helpers for
+# finding the output files of that command.
+def pdf_to_images(pdf_filepath):
     """
     Turn a pdf into images
     """
@@ -302,12 +490,17 @@ This code calls out to     Uses the `pdfimages` utility from Poppler
     (https://poppler.freedesktop.org/). Creates images out of each page. Images
     are prefixed by their name sans extension and suffixed by their page number.
+
+    This should work for PDFs with up to 999 pages, since
+    `find_matching_files_in_dir` matches three digits in its regex.
     """
     directory, filename = os.path.split(pdf_filepath)
     filename_sans_ext = filename.split(".pdf")[0]
     subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
     image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
-    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    logger.debug(
+        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
+    )
     return image_filenames
 
 
@@ -315,7 +508,7 @@ This code calls out to     files = [
         filename
         for filename in os.listdir(directory)
-        if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename)
+        if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename)
     ]
     return files
 
@@ -323,9 +516,9 @@ This code calls out to -

1.2 Detecting image orientation and applying rotation.

-
+

+The following are some helpers to detect the orientation of the images that
+Poppler extracted and, if the images are rotated or skewed, to use ImageMagick’s
+`mogrify` to correct the rotation. This makes OCR more straightforward.
+

+
-
def preprocess_img(filepath):
+
def preprocess_img(filepath):
     """
     Processing that involves running shell executables,
     like mogrify to rotate.
@@ -376,9 +575,9 @@ Script confidence: 2.44
 
-
-

2 Detecting tables

-
+
+

3 Detecting tables

+

This answer from opencv.org was heavily referenced while writing the code around table detection: @@ -398,10 +597,35 @@ that makes things like shape detection more accurate.

-
def find_tables(image):
-    <<blur>>
-    <<threshold>>
-    <<lines-of-table>>
+
def find_tables(image):
+    BLUR_KERNEL_SIZE = (17, 17)
+    STD_DEV_X_DIRECTION = 0
+    STD_DEV_Y_DIRECTION = 0
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+    
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+    vertical = horizontal = img_bin.copy()
+    SCALE = 5
+    image_height, image_width = horizontal.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+    
+    mask = horizontally_dilated + vertically_dilated
     contours, heirarchy = cv2.findContours(
         mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
     )
@@ -423,32 +647,72 @@ that makes things like shape detection more accurate.
 
+

+Here is an example of the result of the find_tables function.
+

+
import cv2
 
-<<detect-table>>
-
 image_filename = "resources/examples/example-page.png"
 image = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE)
 image = find_tables(image)[0]
 cv2.imwrite("resources/examples/example-table.png", image)
-"resources/examples/example-table.png"
 
+
+ +
+

example-page.png +

+
+ +

+↓ +

+ +
-

example-table.png +

example-table.png

-
-

3 OCR tables

-
+
+

3.1 Improving accuracy

+
+

+It’s likely that some images will contain tables that aren’t accurately +recognized by the code above. The code will then need to be made more robust. +But how will we know that changes to the code don’t break the detection of +tables that were previously detected? +

+ +

+It might be good to add some type of test suite in the future that contains a
+spec that matches a pdf with the pages and pixel coordinates of the detected
+tables. The coordinates would need to have a range. Something like
+“example-1.pdf, page-2.png, [450:470, 200:210, 800:820, 1270:1290]” where the
+elements of the list are valid x, y, w, h ranges. So the test will pass if
+the x, y, width and height are anywhere in that range.
+

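A minimal sketch of what such a check could look like (hypothetical; none of
this exists in the package yet, and the spec format is just the one floated
above, rewritten as (low, high) pairs):

# Spec: pdf, page image, and (low, high) ranges for each of x, y, w, h.
SPEC = ("example-1.pdf", "page-2.png", [(450, 470), (200, 210), (800, 820), (1270, 1290)])

def rect_within_ranges(rect, ranges):
    # rect is an (x, y, w, h) tuple, e.g. from cv2.boundingRect.
    return all(low <= value <= high for value, (low, high) in zip(rect, ranges))

def test_detects_known_table(detected_rects):
    pdf, page, ranges = SPEC
    assert any(rect_within_ranges(r, ranges) for r in detected_rects)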
+
+
+
+ +
+

4 OCR tables

+

-Find the bounding box of each cell in the table. Run tesseract on each cell. -Print a comma seperated output. +Tesseract does not perform well when run on images of tables. It performs best +when given a single line of text with no extra noise. +

+ +

+Therefore, our next task is to find and extract the bounding box of each cell in
+the table, run Tesseract on each cell, and print comma-separated output.

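For reference, the eventual per-cell call is a sketch like the following
(--psm 7 is Tesseract’s page segmentation mode for a single line of text; the
cell image is one that the later sections produce):

import cv2
import pytesseract

cell = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
# --psm 7: treat the image as a single text line.
print(pytesseract.image_to_string(cell, config="--psm 7"))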
@@ -456,9 +720,63 @@ We’ll start with an image shown at the end of the previous section.

-
-

3.0.1 Blur

-
+
+

4.1 Training Tesseract

+
+

+It’s a very good idea to train Tesseract. Accuracy will improve tremendously.
+

+ +

+Clone the tesstrain repo at https://github.com/tesseract-ocr/tesstrain. +

+ +

+Run the ocr_tables script on a few pdfs to generate some training data. That +script outputs pairs of .png and .gt.txt files that can be used by +tesstrain. +

+ +

+Make sure the .gt.txt files contain an accurate recognition of the +corresponding image. Since the first few runs will be untrained, you’ll probably +need to fix up a few of the text files. +

+ +

+Once they are accurate, move them to a new subdirectory of the tesstrain repo; +tesstrain/data/<model-name>-ground-truth/. +

+ +

+You’ll also need to clone the tessdata_best repo
+(https://github.com/tesseract-ocr/tessdata_best) and the langdata repo
+(https://github.com/tesseract-ocr/langdata) to use as the starting point for
+the training model.
+

+ +

+I’m actually not sure how much the punctuation and numbers from langdata help. +I didn’t keep accurate records while playing with the training, I don’t +thoroughly understand it, and it’s not profitable for me to explore it at the +moment. It worked for my purposes and that has been good enough. +

+ +
+make training MODEL_NAME=table-ocr START_MODEL=eng TESSDATA=~/src/tessdata_best PUNC_FILE=~/src/langdata/eng/eng.punc NUMBERS_FILE=~/src/langdata/eng/eng.numbers
+
+ +

+Once the training is complete, there will be a new file +tesstrain/data/<model-name>.traineddata. Copy that file to the directory +Tesseract searches for models. On my machine, it was /usr/local/share/tessdata/. +

+
+
+ +
+

4.2 Blur

+

Blurring helps to make noise less noisy so that the overall structure of an image is more detectable. @@ -486,7 +804,6 @@ Cleanup can be accomplished with a blur followed by some thresholding.

image = ~cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE)
 <<blur>>
 cv2.imwrite("resources/examples/example-table-blurred.png", blurred)
-"resources/examples/example-table-blurred.png"
 
@@ -498,9 +815,9 @@ cv2.imwrite("resources/examples/example-table-blur
-
-

3.0.2 Threshold

-
+
+

4.3 Threshold

+

We’ve got a bunch of pixels that are gray. Thresholding will turn them all either black or white. Having all black or white pixels lets us do morphological @@ -526,7 +843,6 @@ transformations like erosion and dilation.

<<threshold>>
 cv2.imwrite("resources/examples/example-table-thresholded.png", img_bin)
-"resources/examples/example-table-thresholded.png"
 
@@ -538,16 +854,9 @@ cv2.imwrite("resources/examples/example-table-thre
-
-

3.0.3 Finding the vertical and horizontal lines of the table

-
-

-Note: There’s a wierd issue with the results of the example below when it’s -evaluated as part of an export or a full-buffer evaluation. If you evaluate the -example by itself, it looks the way it’s intended. If you evaluate it as part of -an entire buffer evaluation, it’s distorted. -

- +
+

4.4 Finding the vertical and horizontal lines of the table

+
vertical = horizontal = img_bin.copy()
 SCALE = 5
@@ -564,24 +873,30 @@ an entire buffer evaluation, it’s distorted.
 
+

+Note: There’s a weird issue with the results of the example below when it’s
+evaluated as part of an export or a full-buffer evaluation. If you evaluate the
+example by itself, it looks the way it’s intended. If you evaluate it as part of
+an entire buffer evaluation, like during export, it’s distorted.
+

+
<<lines-of-table>>
 cv2.imwrite("resources/examples/example-table-lines.png", mask)
-"resources/examples/example-table-lines.png"
 
-

example-table-lines.png +

example-table-lines.png

-
-

3.0.4 Finding the contours

-
+
+

4.5 Finding the contours

+

Blurring and thresholding allow us to find the lines. Opening the lines allows us to find the contours. @@ -629,7 +944,7 @@ above/below certain sizes.

-
contours, heirarchy = cv2.findContours(
+
contours, heirarchy = cv2.findContours(
     mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
 )
 
@@ -661,9 +976,9 @@ above/below certain sizes.
 
-
-

3.0.5 Sorting the bounding rectangles

-
+
+

4.6 Sorting the bounding rectangles

+

We want to process these from left-to-right, top-to-bottom.

@@ -681,7 +996,7 @@ value of their center. We’ll remove those rectangles from the list and rep

-
def cell_in_same_row(c1, c2):
+
def cell_in_same_row(c1, c2):
     c1_center = c1[1] + c1[3] - c1[3] / 2
     c2_bottom = c2[1] + c2[3]
     c2_top = c2[1]
@@ -745,7 +1060,6 @@ numbering them from right to left, top to bottom.
             2,
         )
 cv2.imwrite("resources/examples/example-table-cells-numbered.png", image)
-"resources/examples/example-table-cells-numbered.png"
 
@@ -756,7 +1070,7 @@ cv2.imwrite("resources/examples/example-table-cell
-
def extract_cell_images_from_table(image):
+
def extract_cell_images_from_table(image):
     BLUR_KERNEL_SIZE = (17, 17)
     STD_DEV_X_DIRECTION = 0
     STD_DEV_Y_DIRECTION = 0
@@ -859,7 +1173,6 @@ cv2.imwrite("resources/examples/example-table-cell
 image = cv2.imread("resources/examples/example-table.png", cv2.IMREAD_GRAYSCALE)
 cell_images_rows = extract_cell_images_from_table(image)
 cv2.imwrite("resources/examples/example-table-cell-1-1.png", cell_images_rows[1][1])
-"resources/examples/example-table-cell-1-1.png"
 
@@ -871,9 +1184,9 @@ cv2.imwrite("resources/examples/example-table-cell
-
-

3.0.6 Cropping each cell to the text

-
+
+

4.7 Cropping each cell to the text

+

OCR with Tesseract works best when there is about 10 pixels of white border around the text. @@ -888,20 +1201,54 @@ that noise with a simple open morph.

Once the stray border pixels have been removed, we can expand our border using -openMakeBorder. +copyMakeBorder.

def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    img_h, img_w = image.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
+    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    both = horizontal_lines + vertical_lines
+    cleaned = img_bin - both
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
+    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
+    opened = cv2.dilate(opened, kernel)
 
     contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
+    NUM_PX_COMMA = 6
+    MIN_CHAR_AREA = 5 * 9
+    char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
+    if char_sized_bounding_rects:
+        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
+        for x, y, w, h in char_sized_bounding_rects:
+            minx = min(minx, x)
+            miny = min(miny, y)
+            maxx = max(maxx, x + w)
+            maxy = max(maxy, y + h)
+        x, y, w, h = minx, miny, maxx - minx, maxy - miny
+        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
+    else:
+        # If we morphed out all of the text, assume an empty image.
+        cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
     bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
     return bordered
 
@@ -909,11 +1256,11 @@ Once the stray border pixels have been removed, we can expand our border using
import cv2
+import numpy as np
 <<crop-to-text>>
 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
 image = crop_to_text(image)
 cv2.imwrite("resources/examples/example-table-cell-1-1-cropped.png", image)
-"resources/examples/example-table-cell-1-1-cropped.png"
 
@@ -925,9 +1272,9 @@ cv2.imwrite("resources/examples/example-table-cell
-
-

3.0.7 OCR each cell

-
+
+

4.8 OCR each cell

+

If we cleaned up the images well enough, we might get some accurate OCR!

@@ -948,23 +1295,9 @@ period into a comma, then you might need to do some custom Tesseract training.

-
def crop_to_text(image):
-    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
-    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
-
-    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
-    bounding_rects = [cv2.boundingRect(c) for c in contours]
-    # The largest contour is certainly the text that we're looking for.
-    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
-    x, y, w, h = largest_rect
-    cropped = image[y:y+h, x:x+w]
-    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
-    return bordered
-
-def ocr_image(image, config):
-    cropped = crop_to_text(image)
+
def ocr_image(image, config):
     return pytesseract.image_to_string(
-        ~cropped,
+        image,
         config=config
     )
 
@@ -973,8 +1306,11 @@ period into a comma, then you might need to do some custom Tesseract training.
import pytesseract
 import cv2
+import numpy as np
 image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
+<<crop-to-text>>
 <<ocr-image>>
+image = crop_to_text(image)
 ocr_image(image, "--psm 7")
 
@@ -986,39 +1322,45 @@ ocr_image(image, "--psm 7")
-
-

4 Files

-
+
+

5 Files

+
 
-
-

4.1 setup.py

-
+
+

5.1 setup.py

+
import setuptools
 
-with open("README.md", "r") as fh:
-    long_description = fh.read()
+long_description = """
+Utilities for turning images of tables into CSV data. Uses Tesseract and OpenCV.
 
+Requires binaries for tesseract and pdfimages (from Poppler).
+"""
 setuptools.setup(
-    name="example-pkg-YOUR-USERNAME-HERE", # Replace with your own username
+    name="table_ocr",
     version="0.0.1",
-    author="Example Author",
-    author_email="author@example.com",
-    description="A small example package",
+    author="Eric Ihli",
+    author_email="eihli@owoga.com",
+    description="Turn images of tables into CSV data.",
     long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/pypa/sampleproject",
+    long_description_content_type="text/plain",
+    url="https://github.com/eihli/image-table-ocr",
     packages=setuptools.find_packages(),
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
+    install_requires=[
+        "pytesseract~=0.3",
+        "opencv-python~=4.2",
+    ],
     python_requires='>=3.6',
 )
 
@@ -1026,23 +1368,16 @@ setuptools.setup(
-
-

4.2 table_image_ocr

-
-
-
-

4.2.1 table_image_ocr/__init__.py

-
-
-
-
-
+
+

5.2 table_ocr

+
+
+

5.2.1 table_ocr/__init__.py

- -
-

4.2.2 table_image_ocr/util.py

-
+
+

5.2.2 table_ocr/util.py

+
from contextlib import contextmanager
 import functools
@@ -1050,16 +1385,16 @@ setuptools.setup(
 import os
 import tempfile
 
-from bs4 import BeautifulSoup as bs
-import requests
-
-
-
-
-logger = get_logger()
-
-
-
+def get_logger(name):
+    logger = logging.getLogger(name)
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
 
 @contextmanager
 def working_dir(directory):
@@ -1071,13 +1406,6 @@ setuptools.setup(
         os.chdir(original_working_dir)
 
 
-def download(url, filepath):
-    response = request_get(url)
-    data = response.content
-    with open(filepath, "wb") as f:
-        f.write(data)
-
-
 def make_tempdir(identifier):
     return tempfile.mkdtemp(prefix="{}_".format(identifier))
 
@@ -1085,57 +1413,24 @@ setuptools.setup(
-
-

4.2.3 table_image_ocr/prepare_pdfs.py

-
-

-Takes a variable number of pdf files and creates images out of each page of the -file using pdfimages from Poppler. Images are created in the same directory -that contains the pdf. -

- -

-Prints each pdf followed by the images extracted from that pdf followed by a -blank line. -

- +
+

5.2.3 table_ocr/pdf_to_images/

+
+
+
+
5.2.3.1 table_ocr/pdf_to_images/__init__.py
+
-
python -m pdf.prepare_pdfs /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
-
-
- - -
-
import argparse
-import logging
-import os
+
import os
 import re
 import subprocess
-import sys
-
-from pdf.util import request_get, working_dir, download, make_tempdir
-
 
+from table_ocr.util import get_logger, working_dir
 
+logger = get_logger(__name__)
 
-logger = get_logger()
-
-parser = argparse.ArgumentParser()
-parser.add_argument("files", nargs="+")
-
-def main(files):
-    pdf_images = []
-    for f in files:
-        pdf_images.append((f, pdf_to_images(f)))
-
-    for pdf, images in pdf_images:
-        for image in images:
-            preprocess_img(image)
-
-    for pdf, images in pdf_images:
-        print("{}\n{}\n".format(pdf, "\n".join(images)))
-
-
+# Wrapper around the Poppler command line utility "pdfimages" and helpers for
+# finding the output files of that command.
 def pdf_to_images(pdf_filepath):
     """
     Turn a pdf into images
@@ -1154,12 +1449,17 @@ parser.add_argument("files", nargs=    Uses the `pdfimages` utility from Poppler
     (https://poppler.freedesktop.org/). Creates images out of each page. Images
     are prefixed by their name sans extension and suffixed by their page number.
+
+    This should work for PDFs with up to 999 pages, since
+    `find_matching_files_in_dir` matches three digits in its regex.
     """
     directory, filename = os.path.split(pdf_filepath)
     filename_sans_ext = filename.split(".pdf")[0]
     subprocess.run(["pdfimages", "-png", pdf_filepath, filename.split(".pdf")[0]])
     image_filenames = find_matching_files_in_dir(filename_sans_ext, directory)
-    logger.debug("Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames)))
+    logger.debug(
+        "Converted {} into files:\n{}".format(pdf_filepath, "\n".join(image_filenames))
+    )
     return image_filenames
 
 
@@ -1167,9 +1467,10 @@ parser.add_argument("files", nargs=    files = [
         filename
         for filename in os.listdir(directory)
-        if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename)
+        if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename)
     ]
     return files
+
 def preprocess_img(filepath):
     """
     Processing that involves running shell executables,
@@ -1193,51 +1494,74 @@ parser.add_argument("files", nargs=def mogrify(image_filepath, rotate):
     subprocess.run(["mogrify", "-rotate", rotate, image_filepath])
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    main(args.files)
 
-
-

4.2.4 table_image_ocr/extract_tables.py

-
+
+
5.2.3.2 table_ocr/pdf_to_images/__main__.py
+
+

+Takes a variable number of pdf files and creates images out of each page of the +file using pdfimages from Poppler. Images are created in the same directory +that contains the pdf. +

+ +

+Prints each pdf followed by the images extracted from that pdf followed by a +blank line. +

+
-
. ~/.virtualenvs/lotto_odds/bin/activate
-python -m pdf.extract_tables "resources/examples/example-page.png"
+
python -m table_ocr.pdf_to_images /tmp/file1/file1.pdf /tmp/file2/file2.pdf ...
 
+
-
import argparse
-import os
+
import argparse
 
-import cv2
+from table_ocr.util import working_dir, make_tempdir, get_logger
+from table_ocr.pdf_to_images import pdf_to_images, preprocess_img
+
+logger = get_logger(__name__)
 
 parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")
 
 
 def main(files):
-    results = []
+    pdf_images = []
     for f in files:
-        directory, filename = os.path.split(f)
-        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
-        tables = find_tables(image)
-        files = []
-        for i, table in enumerate(tables):
-            filename_sans_extension = os.path.splitext(filename)[0]
-            table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i)
-            table_filepath = os.path.join(directory, table_filename)
-            files.append(table_filepath)
-            cv2.imwrite(table_filepath, table)
-        results.append((f, files))
+        pdf_images.append((f, pdf_to_images(f)))
+
+    for pdf, images in pdf_images:
+        for image in images:
+            preprocess_img(image)
+
+    for pdf, images in pdf_images:
+        print("{}\n{}\n".format(pdf, "\n".join(images)))
 
-    for image_filename, table_filenames in results:
-        print("{}\n{}\n".format(image_filename, "\n".join(table_filenames)))
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.files)
+
+
+
+
+
+ +
+

5.2.4 table_ocr/extract_tables/

+
+
+
+
5.2.4.1 table_ocr/extract_tables/__init__.py
+
+
+
import cv2
 
 def find_tables(image):
     BLUR_KERNEL_SIZE = (17, 17)
@@ -1286,6 +1610,78 @@ parser.add_argument("files", nargs=    # Leaving that step as a future TODO if it is ever necessary.
     images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
     return images
+
+
+
+
+ +
+
5.2.4.2 table_ocr/extract_tables/__main__.py
+
+

+Takes 1 or more image paths as arguments. +

+ +

+Images are opened and read with OpenCV. +

+ +

+Tables are detected and extracted to a new subdirectory in the same directory
+as the given image. The subdirectory is named after the image’s filename sans
+the extension. The tables inside that directory are named table-000.png,
+table-001.png, and so on.
+

+ +

+If you want to do something with the output, like pipe the paths of the +extracted tables into some other utility, here is a description of the output. +

+ +

+For each image path given as an argument, outputs:
+

+ +
  1. The given image path
  2. Paths of extracted tables, separated by newlines
  3. An empty line
+ +
+
import argparse
+import os
+
+import cv2
+
+from table_ocr.extract_tables import find_tables
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+
+def main(files):
+    results = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+        tables = find_tables(image)
+        files = []
+        filename_sans_extension = os.path.splitext(filename)[0]
+        if tables:
+            os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
+        for i, table in enumerate(tables):
+            table_filename = "table-{:03d}.png".format(i)
+            table_filepath = os.path.join(
+                directory, filename_sans_extension, table_filename
+            )
+            files.append(table_filepath)
+            cv2.imwrite(table_filepath, table)
+        if tables:
+            results.append((f, files))
+
+    for image_filename, table_filenames in results:
+        print("\n".join(table_filenames))
+
 
 if __name__ == "__main__":
     args = parser.parse_args()
@@ -1295,22 +1691,148 @@ parser.add_argument("files", nargs=
 
+
-
-

4.2.5 table_image_ocr/extract_cells_from_table.py

-
+
+

5.2.5 table_ocr/extract_cells/

+
+
+
+
5.2.5.1 table_ocr/extract_cells/__init__.py
+
-
. ~/.virtualenvs/lotto_odds/bin/activate
-python -m pdf.extract_cells_from_table "resources/examples/example-table.png"
+
import cv2
+
+def extract_cell_images_from_table(image):
+    BLUR_KERNEL_SIZE = (17, 17)
+    STD_DEV_X_DIRECTION = 0
+    STD_DEV_Y_DIRECTION = 0
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+    
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+    vertical = horizontal = img_bin.copy()
+    SCALE = 5
+    image_height, image_width = horizontal.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+    
+    mask = horizontally_dilated + vertically_dilated
+    contours, heirarchy = cv2.findContours(
+        mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
+    )
+    
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.05 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    
+    # Filter out contours that aren't rectangular. Those that aren't rectangular
+    # are probably noise.
+    approx_rects = [p for p in approx_polys if len(p) == 4]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+    
+    # Filter out rectangles that are too narrow or too short.
+    MIN_RECT_WIDTH = 40
+    MIN_RECT_HEIGHT = 10
+    bounding_rects = [
+        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
+    ]
+    
+    # The largest bounding rectangle is assumed to be the entire table.
+    # Remove it from the list. We don't want to accidentally try to OCR
+    # the entire table.
+    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
+    bounding_rects = [b for b in bounding_rects if b is not largest_rect]
+    
+    cells = [c for c in bounding_rects]
+    def cell_in_same_row(c1, c2):
+        c1_center = c1[1] + c1[3] - c1[3] / 2
+        c2_bottom = c2[1] + c2[3]
+        c2_top = c2[1]
+        return c2_top < c1_center < c2_bottom
+    
+    orig_cells = [c for c in cells]
+    rows = []
+    while cells:
+        first = cells[0]
+        rest = cells[1:]
+        cells_in_same_row = sorted(
+            [
+                c for c in rest
+                if cell_in_same_row(c, first)
+            ],
+            key=lambda c: c[0]
+        )
+    
+        row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
+        rows.append(row_cells)
+        cells = [
+            c for c in rest
+            if not cell_in_same_row(c, first)
+        ]
+    
+    # Sort rows by average height of their center.
+    def avg_height_of_center(row):
+        centers = [y + h - h / 2 for x, y, w, h in row]
+        return sum(centers) / len(centers)
+    
+    rows.sort(key=avg_height_of_center)
+    cell_images_rows = []
+    for row in rows:
+        cell_images_row = []
+        for x, y, w, h in row:
+            cell_images_row.append(image[y:y+h, x:x+w])
+        cell_images_rows.append(cell_images_row)
+    return cell_images_rows
 
+
+
+ +
+
5.2.5.2 table_ocr/extract_cells/__main__.py
+
+

+Takes as a command line argument a path to an image of a table. +

+ +

+Detects cells in the table and extracts each cell to an image file in a new
+cells/ subdirectory in the same directory as the given image.
+

+ +

+Each cell filename is suffixed with <row>-<column> so that the filenames can +be sorted lexicographically and will align with reading the cells from +left-to-right, top-to-bottom. +

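For instance, a quick illustration with made-up filenames:

sorted(["001-000.png", "000-001.png", "000-000.png"])
# ['000-000.png', '000-001.png', '001-000.png'], i.e. reading order.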
+ +

+Prints to stdout the lexicographically sorted list of filenames of the extracted +cells. +

import os
 import sys
 
 import cv2
-import pytesseract
+
+from table_ocr.extract_cells import extract_cell_images_from_table
 
 def main(f):
     results = []
@@ -1324,7 +1846,7 @@ python -m pdf.extract_cells_from_table "resources/
             cell_filename = "{:03d}-{:03d}.png".format(i, j)
             path = os.path.join(cell_img_dir, cell_filename)
             cv2.imwrite(path, cell)
-            print(cell_filename)
+            print(path)
 
 
 def extract_cell_images_from_table(image):
@@ -1430,11 +1952,202 @@ python -m pdf.extract_cells_from_table "resources/
 
+ +
+

5.2.6 table_ocr/ocr_image/

+
+
+
5.2.6.1 table_ocr/ocr_image/__init__.py
+
+
+
import math
 
-
-

5 Utils

-
+import cv2
+import numpy as np
+import pytesseract
+
+def crop_to_text(image):
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    img_bin = cv2.adaptiveThreshold(
+        ~image,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+
+    img_h, img_w = image.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
+    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+    both = horizontal_lines + vertical_lines
+    cleaned = img_bin - both
+
+    # Get rid of little noise.
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
+    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
+    opened = cv2.dilate(opened, kernel)
+
+    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    bounding_rects = [cv2.boundingRect(c) for c in contours]
+    NUM_PX_COMMA = 6
+    MIN_CHAR_AREA = 5 * 9
+    char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
+    if char_sized_bounding_rects:
+        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
+        for x, y, w, h in char_sized_bounding_rects:
+            minx = min(minx, x)
+            miny = min(miny, y)
+            maxx = max(maxx, x + w)
+            maxy = max(maxy, y + h)
+        x, y, w, h = minx, miny, maxx - minx, maxy - miny
+        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
+    else:
+        # If we morphed out all of the text, assume an empty image.
+        cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
+    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
+    return bordered
+
+def ocr_image(image, config):
+    return pytesseract.image_to_string(
+        image,
+        config=config
+    )
+
+
+
+
+
+
5.2.6.2 table_ocr/ocr_image/__main__.py
+
+

+This does a little bit of cleanup before sending it through tesseract. +

+ +

+Creates images and text files that can be used for training tesseract. See +https://github.com/tesseract-ocr/tesstrain. +

+ +
+
import argparse
+import math
+import os
+import sys
+
+import cv2
+
+from table_ocr.ocr_image import crop_to_text, ocr_image
+
+description="""Takes a single argument that is the image to OCR.
+Remaining arguments are passed directly to Tesseract.
+
+Attempts to make OCR more accurate by performing some modifications on the image.
+Saves the modified image and the OCR text in an `ocr_data` directory.
+Filenames are of the format for training with tesstrain."""
+parser = argparse.ArgumentParser(description=description)
+parser.add_argument("image", help="filepath of image to perform OCR")
+
+def main(image_file, tess_args):
+    directory, filename = os.path.split(image_file)
+    filename_sans_ext, ext = os.path.splitext(filename)
+    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
+    cropped = crop_to_text(image)
+    ocr_data_dir = os.path.join(directory, "ocr_data")
+    os.makedirs(ocr_data_dir, exist_ok=True)
+    out_imagepath = os.path.join(ocr_data_dir, filename)
+    out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
+    cv2.imwrite(out_imagepath, cropped)
+    txt = ocr_image(cropped, " ".join(tess_args))
+    print(txt)
+    with open(out_txtpath, "w") as txt_file:
+        txt_file.write(txt)
+
+if __name__ == "__main__":
+    args, tess_args = parser.parse_known_args()
+    main(args.image, tess_args)
+
+
+
+
+
+
+

5.2.7 table_ocr/ocr_to_csv/

+
+
+
+
5.2.7.1 table_ocr/ocr_to_csv/__init__.py
+
+
+
import csv
+import io
+import os
+
+
+def text_files_to_csv(files):
+    """Files must be sorted lexicographically
+    Filenames must be <row>-<column>.txt.
+    000-000.txt
+    000-001.txt
+    001-000.txt
+    etc...
+    """
+    rows = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        with open(f) as of:
+            txt = of.read()
+        row, column = map(int, filename.split(".")[0].split("-"))
+        if row == len(rows):
+            rows.append([])
+        rows[row].append(txt)
+
+    csv_file = io.StringIO()
+    writer = csv.writer(csv_file)
+    writer.writerows(rows)
+    return csv_file.getvalue()
+
+
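A quick illustration of the expected input and output (hypothetical filenames
and cell contents; the real .txt files are the ocr_data output of ocr_image):

# Suppose each file holds the OCR'd text of one cell:
#   000-000.txt -> "PRIZE"    000-001.txt -> "ODDS 1 IN:"
#   001-000.txt -> "$3"       001-001.txt -> "9.09"
files = sorted(["000-000.txt", "000-001.txt", "001-000.txt", "001-001.txt"])
print(text_files_to_csv(files))
# PRIZE,ODDS 1 IN:
# $3,9.09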
+
+
+
+
5.2.7.2 table_ocr/ocr_to_csv/__main__.py
+
+
+
import argparse
+import os
+
+from table_ocr.ocr_to_csv import text_files_to_csv
+
+parser = argparse.ArgumentParser()
+parser.add_argument("files", nargs="+")
+
+
+def main(files):
+    print(text_files_to_csv(files))
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    files = args.files
+    files.sort()
+    main(files)
+
+
+
+
+
+
+
+ +
+

6 Utils

+

The following code lets us specify a size for images when they are exported to html. @@ -1460,12 +2173,9 @@ with advice-add.

-
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
+
(concat "#+ATTR_HTML: :width " width " :height " height "\n[[file:" text "]]")
 
-

- -

(defun remove-attributes-from-src-block-result (&rest args)
@@ -1484,11 +2194,30 @@ with advice-add.
 
+ +
+

6.1 Logging

+
+
+
def get_logger(name):
+    logger = logging.getLogger(name)
+    lvl = os.environ.get("PY_LOG_LVL", "info").upper()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(logging.BASIC_FORMAT)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler.setLevel(lvl)
+    logger.setLevel(lvl)
+    return logger
+
+
+
+

Author: Eric Ihli

-

Created: 2020-04-10 Fri 14:10

+

Created: 2020-04-25 Sat 12:20

diff --git a/resources/examples/example-table-cell-1-1-cropped.png b/resources/examples/example-table-cell-1-1-cropped.png index 5bbcbe9cfd30cfc8df5dff8a854d8781bc7bbc8c..540b7d64fb4ecdfba6243a27cc3a17e5ebb63358 100644 GIT binary patch literal 1129 zcmV-v1eW`WP)i*4T7fy4u|H)7?Gy-1GG5ocSM|*NYfo5F-p?gh7ljh!F-c z0_nIb1}7LoB};Y{Pck*n(L4$1ct%8+dWs#H$%vx2AQOj=7y(z@{B`Y+W72e2@gz7h zfPpLd^ZJ`xwxB&^i!2y6vDV)$~}yR|d6rSE@Lx^sS>ZmEt8eE$CM!hP*qJ6}040o`lc*3Hfh z9-n=%G>HH)e0*l(i04WDe=UtD%_XucGSS|O?!*sIixXRvw_nH1LB1~owMRC-CuI1D z;qy*xUvU~n^x6wOa~o86lC5-3jnuvYTz%eJd&(&C7Fi2yMI2luY7gX5{YKd!pK1i)=s zH1&o@)c4fD;(8Lv!d8*f4(%H&|0M|@F#=z|nfRjw&hMPusBJu1*o!`S zO_R5%0RdtJhR^Tlc`{FRFUCq*SP)BIdsXr-F=5T+RI@&?r0cPC+ETkpw{Jp#7(S+v zZS|r^UU{h{!JVRFrv4t!hmRNmPs+sLNc|PLN3J_Ed>!Rx+ayR>(U|<( zVFfUv*mGd|p8^451PrngMifo+^d&#C$N9ZWiW?z)-{|YG26uawYopTiU zh~dlCZGL##SxM|TIPpw{V@1<(?kTgB-=^g*ta|8~dbj3_l3Q_Ctp2(BCr^Tp7`~yg z840s)nep1G16vgNW_@+JqE&QwExFhShxPrN2RAil#=4UycNM&|O33gLBal`Ww%aj_ zrd-Z<{+*TT+P{AC==uhVMF!pTOW;u1-B@0KsyFLt7)Oi;j-8$o|0i2~8sLu(7ap3U zOGjJF(<43YH5)9|-g<4R)N$>TDp|7?_=w>fPP(HhPwFj@kkfM7kgCZe3Akb?HKq$7 zD^!yp3rB`26!?fSZZzMJ6@nRot-z7tvlYMqj4=KO_^;v+kt;Vt00000NkvXXu0mjfavB$> literal 1067 zcmV+`1l0S9P)Nkl z|7+BB90%~{>+}A6xzF9UyQ^&uR!%3W%i7vBr)e!zB!*wXNa+Vjau}Exl&I)nM5WEJ z%&eg#%)V>|qZCbS!a$hPEL&@AIxU@T?)$gfeeUjapU>yj`}QB)<3Wt{5hH!XNFOoM zM~w6lLvqFK-!_jsahmUHo&rY&FnT>}=}1S{E=@~nQ{3SnxEI={Vg@m0RCf0}c~jvB zfCptN%hJc+Kb6-PblpGa#eHI5ovR2teAkB=#7Oz5dcmH^qi+lMFUv4(-BE!vpG>T% z>Dk@)=F!J6u(^9nNn)(7q_!{wi5SxP?USCTjQqXEr*-71u88`FN(X{JzaYNerG4-Y z7LM^F6{tS3{bK_XF{Cql*EPWM-+Xm&NwW@5Nosv_ebw&*H(s$fpR;nkiX1cf?5;3Ij-V2^CGtu^v{!B~4uP~u| zVWZ-K70O@Uam_hd1&J6lZshNzTl;-p;>w1{Uu<%#&gR|8bj8MBD}F%=F{FnJ zg7L<+*4yWf?$XrTO%=tOQQGTuCIX)xH;?Qb+tC`G9thPR$o_DH0f`t=L|f654A?a1 zT9)&0QMmue)`F?6%@pvBxtCYNp`!bNjFIqA%rlTej2WjI=4AdGo7DjD=f`u7EjE>t zUB!{f!JevZww~O5bB)q_^UHG8KngKZ+^Mjq3}q`w7ztxsH`K`>TrnP=HU$t9dYqsN zM@8HRi5MwUS(d61;yxrbI4Y#12Dl<-f+-qEYGAT;%a0UdNE0CdNe!@s#;nHK^r8V4 zp6x?I_XLu{Q(~qP4MZ{_5hKM*8bI8~1Sw|1@;fRL23!HKZ7?B{=|aHPArT{e#7G}8 l(npN+5hH!XNFOoM_dl9Qy%42^wD14`002ovPDHLkV1g%@3ON7(