Add script to ocr individual cells

main
Eric Ihli 5 years ago
parent 396782051e
commit 32c62fd773

@ -500,9 +500,7 @@ thick noise or a wrongly detected table.
If everything looks reasonable but the OCR is doing something like turning a
period into a comma, then you might need to do some custom Tesseract training.
#+BEGIN_SRC python :noweb-ref ocr-image :eval no :noweb yes
<<crop-to-text>>
#+BEGIN_SRC python :noweb-ref ocr-image :eval no
def ocr_image(image, config):
cropped = crop_to_text(image)
return pytesseract.image_to_string(
@ -515,6 +513,7 @@ def ocr_image(image, config):
import pytesseract
import cv2
image = cv2.imread("resources/examples/example-table-cell-1-1.png", cv2.IMREAD_GRAYSCALE)
<<crop-to-text>>
<<ocr-image>>
ocr_image(image, "--psm 7")
#+END_SRC
@ -733,6 +732,35 @@ if __name__ == "__main__":
main(sys.argv[1])
#+END_SRC
*** table_ocr/ocr_cell.py
This script does a little bit of cleanup on a single cell image before sending it through Tesseract.
#+BEGIN_SRC shell :results output
. ~/.virtualenvs/lotto_odds/bin/activate
python -m table_ocr.ocr_cell resources/examples/cells/000-000.png
#+END_SRC
#+RESULTS:
: PRIZE
#+BEGIN_SRC python :tangle table_ocr/ocr_cell.py :mkdirp yes :results none
import sys
import cv2
import pytesseract
<<crop-to-text>>
<<ocr-image>>
def main(f):
    """Read the image at path ``f`` as grayscale and print its OCR'd text.

    Args:
        f: Path of the image file to OCR.

    Raises:
        OSError: If ``cv2.imread`` could not load the file.
    """
    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an obscure TypeError on ``~image``.
    if image is None:
        raise OSError("Could not read image: {}".format(f))
    print(ocr_image(image, "--psm 7"))


if __name__ == "__main__":
    main(sys.argv[1])
#+END_SRC
* Utils
The following code lets us specify a size for images when they are exported to

@ -13,7 +13,6 @@ def main(files):
directory, filename = os.path.split(f)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
print("Reading {}".format(f))
tables = find_tables(image)
files = []
for i, table in enumerate(tables):

@ -0,0 +1,30 @@
import sys
import cv2
import pytesseract
def crop_to_text(image):
    """Crop an image to its largest detected contour and add a white border.

    The inverted image is morphologically opened with a 4x4 cross-shaped
    kernel, contours are found in the result, and the original image is
    cropped to the bounding rectangle with the largest area. A 5-pixel
    constant border of value 255 is then added on every side.

    If no contours are found (e.g. a blank cell), the whole image is used as
    the crop instead of raising ``ValueError`` from ``max()`` on an empty
    sequence.

    Args:
        image: Image array supporting ``~`` and 2-D slicing; callers in this
            file load it with ``cv2.IMREAD_GRAYSCALE``.
            NOTE(review): the inversion and the 255 border suggest dark text
            on a light background is expected -- confirm.

    Returns:
        The cropped image (or the full image when nothing was detected)
        surrounded by a 5-pixel border of value 255.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
    opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list in
    # both, so this works regardless of the installed version.
    contours = cv2.findContours(
        opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
    )[-2]
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    if bounding_rects:
        # The largest bounding rectangle (by area) is taken to be the text.
        x, y, w, h = max(bounding_rects, key=lambda r: r[2] * r[3])
        cropped = image[y:y + h, x:x + w]
    else:
        # Nothing survived the opening: fall back to the uncropped image.
        cropped = image
    return cv2.copyMakeBorder(
        cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255
    )
def ocr_image(image, config):
    """Return the text Tesseract recognizes in ``image``.

    The image is first passed through ``crop_to_text``; the bitwise-inverted
    result is then handed to ``pytesseract.image_to_string``.

    Args:
        image: Image array to OCR.
        config: Tesseract configuration string, forwarded unchanged as the
            ``config`` keyword argument.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the inverted,
        cropped image.
    """
    text_region = crop_to_text(image)
    inverted = ~text_region
    return pytesseract.image_to_string(inverted, config=config)
def main(f):
    """Read the image at path ``f`` as grayscale and print its OCR'd text.

    Args:
        f: Path of the image file to OCR.

    Raises:
        OSError: If ``cv2.imread`` could not load the file.
    """
    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an obscure TypeError on ``~image``.
    if image is None:
        raise OSError("Could not read image: {}".format(f))
    print(ocr_image(image, "--psm 7"))


if __name__ == "__main__":
    main(sys.argv[1])
Loading…
Cancel
Save