Refactor table extraction into module

5 years ago · b9f088cf92
parent 98ef6ffd85
commit b9f088cf92
5 changed files with 140 additions and 80 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@ -66,29 +66,10 @@ probably aren't necessary.

 This code calls out to [[https://manpages.debian.org/testing/poppler-utils/pdfimages.1.en.html][pdfimages]] from [[https://poppler.freedesktop.org/][Poppler]].

-#+NAME: pdf-to-images-overview
-#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
-#+BEGIN_SRC python :noweb strip-export :results none
-import os
-import re
-import subprocess
-
-from table_ocr.util import get_logger, working_dir
-
-logger = get_logger(__name__)
-
-# Wrapper around the Poppler command line utility "pdfimages" and helpers for
-# finding the output files of that command.
-<<pdf-to-images>>
-
-# Helpers to detect orientation of the images that Poppler extracted and if the
-# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
-# rotation. (Makes OCR more straightforward.)
-<<fix-orientation>>
-#+END_SRC
-
 #+NAME: pdf-to-images
 #+BEGIN_SRC python :results none
+# Wrapper around the Poppler command line utility "pdfimages" and helpers for
+# finding the output files of that command.
 def pdf_to_images(pdf_filepath):
    """
    Turn a pdf into images
@ -148,6 +129,10 @@ Script: Latin
 Script confidence: 2.44
 #+END_EXAMPLE

+The following are some helpers to detect orientation of the images that Poppler
+extracted and, if the images are rotated or skewed, use ImageMagick's `mogrify`
+to correct the rotation. This makes OCR more straightforward.
+
 #+NAME: fix-orientation
 #+BEGIN_SRC python :results none
 def preprocess_img(filepath):
@ -189,7 +174,8 @@ The blurring, thresholding, and line detection is used here as well as later on
 for cell extraction. They are good techniques for cleaning an image up in a way
 that makes things like shape detection more accurate.

-#+BEGIN_SRC python :noweb-ref detect-table :results none :noweb no-export
+#+NAME: detect-tables
+#+BEGIN_SRC python :results none :noweb yes
 def find_tables(image):
    <<blur>>
    <<threshold>>
@ -215,10 +201,10 @@ def find_tables(image):
 #+END_SRC

 #+HEADER: :post html-image-size(text=*this*, width="500px")
-#+BEGIN_SRC python :noweb-ref test-detect-table :noweb no-export :results raw
+#+BEGIN_SRC python :noweb-ref test-detect-table :noweb strip-export :results raw
 import cv2

-<<detect-table>>
+<<detect-tables>>

 image_filename = "resources/examples/example-page.png"
 image = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE)
@ -701,8 +687,25 @@ def working_dir(directory):
 def make_tempdir(identifier):
    return tempfile.mkdtemp(prefix="{}_".format(identifier))
 #+END_SRC
+*** table_ocr/pdf_to_images/
+**** table_ocr/pdf_to_images/__init__.py
+#+NAME: pdf_to_images/__init__.py
+#+HEADER: :mkdirp yes :tangle table_ocr/pdf_to_images/__init__.py
+#+BEGIN_SRC python :noweb strip-export :results none
+import os
+import re
+import subprocess
+
+from table_ocr.util import get_logger, working_dir
+
+logger = get_logger(__name__)
+
+<<pdf-to-images>>

-*** table_ocr/pdf_to_images/__main__.py
+<<fix-orientation>>
+#+END_SRC
+
+**** table_ocr/pdf_to_images/__main__.py

 Takes a variable number of pdf files and creates images out of each page of the
 file using ~pdfimages~ from Poppler. Images are created in the same directory
@ -748,7 +751,18 @@ if __name__ == "__main__":
    main(args.files)
 #+END_SRC

-*** table_ocr/extract_tables.py
+*** table_ocr/extract_tables/
+**** table_ocr/extract_tables/__init__.py
+
+#+NAME: extract_tables/__init__.py
+#+HEADER: :tangle table_ocr/extract_tables/__init__.py
+#+BEGIN_SRC python
+import cv2
+
+<<detect-tables>>
+#+END_SRC
+
+**** table_ocr/extract_tables/__main__.py

 #+BEGIN_SRC shell
 . ~/.virtualenvs/lotto_odds/bin/activate
@ -759,12 +773,15 @@ python -m pdf.extract_tables "resources/examples/example-page.png"
 | resources/examples/example-page.png           |
 | resources/examples/example-page-table-000.png |

-#+BEGIN_SRC python :noweb yes :tangle table_ocr/extract_tables.py :results none
+#+NAME: extract_tables/__main__.py
+#+BEGIN_SRC python :tangle table_ocr/extract_tables/__main__.py :results none
 import argparse
 import os

 import cv2

+from table_ocr.extract_tables import find_tables
+
 parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")

@ -781,7 +798,9 @@ def main(files):
            os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
        for i, table in enumerate(tables):
            table_filename = "table-{:03d}.png".format(i)
-            table_filepath = os.path.join(directory, filename_sans_extension, table_filename)
+            table_filepath = os.path.join(
+                directory, filename_sans_extension, table_filename
+            )
            files.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        if tables:
@ -790,7 +809,6 @@ def main(files):
    for image_filename, table_filenames in results:
        print("\n".join(table_filenames))

-<<detect-table>>

 if __name__ == "__main__":
    args = parser.parse_args()
--- a/table_ocr/extract_tables.py
+++ b/table_ocr/extract_tables.py
@ -3,6 +3,8 @@ import os

 import cv2

+from table_ocr.extract_tables import find_tables
+
 parser = argparse.ArgumentParser()
 parser.add_argument("files", nargs="+")

@ -19,7 +21,9 @@ def main(files):
            os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
        for i, table in enumerate(tables):
            table_filename = "table-{:03d}.png".format(i)
-            table_filepath = os.path.join(directory, filename_sans_extension, table_filename)
+            table_filepath = os.path.join(
+                directory, filename_sans_extension, table_filename
+            )
            files.append(table_filepath)
            cv2.imwrite(table_filepath, table)
        if tables:
@ -28,53 +32,6 @@ def main(files):
    for image_filename, table_filenames in results:
        print("\n".join(table_filenames))

-def find_tables(image):
-    BLUR_KERNEL_SIZE = (17, 17)
-    STD_DEV_X_DIRECTION = 0
-    STD_DEV_Y_DIRECTION = 0
-    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
-    MAX_COLOR_VAL = 255
-    BLOCK_SIZE = 15
-    SUBTRACT_FROM_MEAN = -2
-    
-    img_bin = cv2.adaptiveThreshold(
-        ~blurred,
-        MAX_COLOR_VAL,
-        cv2.ADAPTIVE_THRESH_MEAN_C,
-        cv2.THRESH_BINARY,
-        BLOCK_SIZE,
-        SUBTRACT_FROM_MEAN,
-    )
-    vertical = horizontal = img_bin.copy()
-    SCALE = 5
-    image_width, image_height = horizontal.shape
-    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
-    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
-    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
-    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
-    
-    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
-    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
-    
-    mask = horizontally_dilated + vertically_dilated
-    contours, heirarchy = cv2.findContours(
-        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
-    )
-
-    MIN_TABLE_AREA = 1e5
-    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
-    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
-    epsilons = [0.1 * p for p in perimeter_lengths]
-    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
-    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
-
-    # The link where a lot of this code was borrowed from recommends an
-    # additional step to check the number of "joints" inside this bounding rectangle.
-    # A table should have a lot of intersections. We might have a rectangular image
-    # here though which would only have 4 intersections, 1 at each corner.
-    # Leaving that step as a future TODO if it is ever necessary.
-    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
-    return images

 if __name__ == "__main__":
    args = parser.parse_args()
--- a/table_ocr/extract_tables/__init__.py
+++ b/table_ocr/extract_tables/__init__.py
@ -0,0 +1,49 @@
+import cv2
+
+def find_tables(image):
+    """
+    Find table-like regions in an image and return a crop of each one.
+
+    The image is blurred and adaptively thresholded, long horizontal and
+    vertical runs are isolated with morphological opening, and the bounding
+    rectangle of every sufficiently large external contour of the combined
+    line mask is sliced out of the input.
+
+    `image` must be a 2-D array indexed as image[y, x] (the callers in this
+    package load it with cv2.IMREAD_GRAYSCALE).
+
+    Returns a list with one slice of `image` per detected region. The list is
+    empty when no contour is larger than MIN_TABLE_AREA.
+    """
+    BLUR_KERNEL_SIZE = (17, 17)
+    STD_DEV_X_DIRECTION = 0
+    STD_DEV_Y_DIRECTION = 0
+    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
+    MAX_COLOR_VAL = 255
+    BLOCK_SIZE = 15
+    SUBTRACT_FROM_MEAN = -2
+
+    # The blurred image is inverted (~) before thresholding.
+    img_bin = cv2.adaptiveThreshold(
+        ~blurred,
+        MAX_COLOR_VAL,
+        cv2.ADAPTIVE_THRESH_MEAN_C,
+        cv2.THRESH_BINARY,
+        BLOCK_SIZE,
+        SUBTRACT_FROM_MEAN,
+    )
+    SCALE = 5
+    # ndarray.shape is (rows, cols), i.e. (height, width), whereas
+    # getStructuringElement takes its size as (width, height).
+    image_height, image_width = img_bin.shape
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
+    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
+    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
+
+    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
+    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
+
+    mask = horizontally_dilated + vertically_dilated
+    # Only the contours are needed; the second return value is ignored.
+    contours, _hierarchy = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
+    )
+
+    MIN_TABLE_AREA = 1e5
+    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
+    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
+    epsilons = [0.1 * p for p in perimeter_lengths]
+    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
+    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
+
+    # The link where a lot of this code was borrowed from recommends an
+    # additional step to check the number of "joints" inside this bounding rectangle.
+    # A table should have a lot of intersections. We might have a rectangular image
+    # here though which would only have 4 intersections, 1 at each corner.
+    # Leaving that step as a future TODO if it is ever necessary.
+    images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
+    return images
--- a/table_ocr/extract_tables/__main__.py
+++ b/table_ocr/extract_tables/__main__.py
@ -0,0 +1,39 @@
+import argparse
+import os
+
+import cv2
+
+from table_ocr.extract_tables import find_tables
+
+# Command-line interface: one or more image files to search for tables.
+parser = argparse.ArgumentParser(
+    description="Find tables in images and save each table as its own PNG."
+)
+parser.add_argument("files", nargs="+", help="image files to search for tables")
+
+
+def main(files):
+    """
+    Extract the tables found in each image file and save them as PNGs.
+
+    For an input path <dir>/<name>.<ext>, every table returned by
+    find_tables is written to <dir>/<name>/table-NNN.png. The directory is
+    only created when at least one table was found. The path of each written
+    file is printed, one per line.
+
+    `files` is an iterable of image file paths.
+    """
+    results = []
+    for f in files:
+        directory, filename = os.path.split(f)
+        image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
+        tables = find_tables(image)
+        if not tables:
+            continue
+
+        filename_sans_extension = os.path.splitext(filename)[0]
+        output_directory = os.path.join(directory, filename_sans_extension)
+        os.makedirs(output_directory, exist_ok=True)
+
+        # Collected under its own name so the `files` parameter being
+        # iterated is never rebound.
+        table_filepaths = []
+        for i, table in enumerate(tables):
+            table_filename = "table-{:03d}.png".format(i)
+            table_filepath = os.path.join(output_directory, table_filename)
+            table_filepaths.append(table_filepath)
+            cv2.imwrite(table_filepath, table)
+        results.append((f, table_filepaths))
+
+    for _image_filename, table_filenames in results:
+        print("\n".join(table_filenames))
+
+
+if __name__ == "__main__":
+    # Parse the command line and pass the listed files straight to main().
+    main(parser.parse_args().files)
--- a/table_ocr/pdf_to_images/__init__.py
+++ b/table_ocr/pdf_to_images/__init__.py
@ -48,9 +48,6 @@ def find_matching_files_in_dir(file_prefix, directory):
    ]
    return files

-# Helpers to detect orientation of the images that Poppler extracted and if the
-# images are rotated or skewed, use ImageMagick's `mogrify` to correct the
-# rotation. (Makes OCR more straightforward.)
 def preprocess_img(filepath):
    """
    Processing that involves running shell executables,