|
|
@ -53,6 +53,10 @@ for image in $(cat /tmp/extracted-tables.txt); do
|
|
|
|
done
|
|
|
|
done
|
|
|
|
#+END_SRC
|
|
|
|
#+END_SRC
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
** Possible improvements
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Detect text with the stroke-width-transform algorithm. https://zablo.net/blog/post/stroke-width-transform-swt-python/index.html
|
|
|
|
|
|
|
|
|
|
|
|
* Preparing data
|
|
|
|
* Preparing data
|
|
|
|
** Converting PDFs to images
|
|
|
|
** Converting PDFs to images
|
|
|
|
|
|
|
|
|
|
|
@ -538,9 +542,10 @@ def crop_to_text(image):
|
|
|
|
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
|
|
|
|
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
|
|
|
|
bounding_rects = [cv2.boundingRect(c) for c in contours]
|
|
|
|
bounding_rects = [cv2.boundingRect(c) for c in contours]
|
|
|
|
NUM_PX_COMMA = 6
|
|
|
|
NUM_PX_COMMA = 6
|
|
|
|
|
|
|
|
MIN_CHAR_AREA = 5 * 9
|
|
|
|
if bounding_rects:
|
|
|
|
if bounding_rects:
|
|
|
|
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
|
|
|
|
minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
|
|
|
|
for x, y, w, h in bounding_rects:
|
|
|
|
for x, y, w, h in [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]:
|
|
|
|
minx = min(minx, x)
|
|
|
|
minx = min(minx, x)
|
|
|
|
miny = min(miny, y)
|
|
|
|
miny = min(miny, y)
|
|
|
|
maxx = max(maxx, x + w)
|
|
|
|
maxx = max(maxx, x + w)
|
|
|
|