Fix bug picking up noise in detecting contours

main
Eric Ihli 5 years ago
parent 54511b9a1f
commit bea192678e

@ -53,6 +53,10 @@ for image in $(cat /tmp/extracted-tables.txt); do
done done
#+END_SRC #+END_SRC
** Possible improvements
Detect text with the stroke-width-transform algorithm. https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html
* Preparing data * Preparing data
** Converting PDFs to images ** Converting PDFs to images
@ -538,9 +542,10 @@ def crop_to_text(image):
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6 NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9
if bounding_rects: if bounding_rects:
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
for x, y, w, h in bounding_rects: for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
minx = min(minx, x) minx = min(minx, x)
miny = min(miny, y) miny = min(miny, y)
maxx = max(maxx, x + w) maxx = max(maxx, x + w)

@ -44,9 +44,10 @@ def crop_to_text(image):
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours] bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6 NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9
if bounding_rects: if bounding_rects:
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0 minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
for x, y, w, h in bounding_rects: for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
minx = min(minx, x) minx = min(minx, x)
miny = min(miny, y) miny = min(miny, y)
maxx = max(maxx, x + w) maxx = max(maxx, x + w)

Loading…
Cancel
Save