Fix bug picking up noise in detecting contours

6 years ago · bea192678e
parent 54511b9a1f
commit bea192678e
2 changed files with 8 additions and 2 deletions
--- a/pdf_table_extraction_and_ocr.org
+++ b/pdf_table_extraction_and_ocr.org
@ -53,6 +53,10 @@ for image in $(cat /tmp/extracted-tables.txt); do
 done
 #+END_SRC

+** Possible improvements
+
+Detect text with the stroke-width-transform algorithm. https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html
+
 * Preparing data
 ** Converting PDFs to images

@ -538,9 +542,10 @@ def crop_to_text(image):
    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    NUM_PX_COMMA = 6
+    MIN_CHAR_AREA = 5 * 9
    if bounding_rects:
        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
-        for x, y, w, h in bounding_rects:
+        for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
            minx = min(minx, x)
            miny = min(miny, y)
            maxx = max(maxx, x + w)
--- a/table_ocr/ocr_image.py
+++ b/table_ocr/ocr_image.py
@ -44,9 +44,10 @@ def crop_to_text(image):
    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    NUM_PX_COMMA = 6
+    MIN_CHAR_AREA = 5 * 9
    if bounding_rects:
        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
-        for x, y, w, h in bounding_rects:
+        for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
            minx = min(minx, x)
            miny = min(miny, y)
            maxx = max(maxx, x + w)