Add module for outputting csv from parsed table
Make cell extraction a little more accurate.
main
parent
de398f73c2
commit
e49fffa5a7
Binary file not shown.
Before Width: | Height: | Size: 1.0 KiB After Width: | Height: | Size: 1.0 KiB |
@ -1,30 +1,69 @@
|
|||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import pytesseract
|
import pytesseract
|
||||||
|
|
||||||
def crop_to_text(image):
|
def crop_to_text(image):
|
||||||
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (4, 4))
|
MAX_COLOR_VAL = 255
|
||||||
opened = cv2.morphologyEx(~image, cv2.MORPH_OPEN, kernel)
|
BLOCK_SIZE = 15
|
||||||
|
SUBTRACT_FROM_MEAN = -2
|
||||||
|
|
||||||
|
img_bin = cv2.adaptiveThreshold(
|
||||||
|
~image,
|
||||||
|
MAX_COLOR_VAL,
|
||||||
|
cv2.ADAPTIVE_THRESH_MEAN_C,
|
||||||
|
cv2.THRESH_BINARY,
|
||||||
|
BLOCK_SIZE,
|
||||||
|
SUBTRACT_FROM_MEAN,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get rid of little noise.
|
||||||
|
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
|
||||||
|
opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel)
|
||||||
|
|
||||||
|
# Dilate so each digit is connected, so we can get a bounding rectangle
|
||||||
|
# around all of the digits as one contour. This will make the bounding
|
||||||
|
# rectangle 8 pixels wider on the left and right, so we'll need to crop that
|
||||||
|
# out at the end so that we don't pick up stray border pixels.
|
||||||
|
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (16, 1))
|
||||||
|
dilated = cv2.dilate(opened, kernel)
|
||||||
|
|
||||||
|
contours, hierarchy = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
|
||||||
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
|
|
||||||
bounding_rects = [cv2.boundingRect(c) for c in contours]
|
bounding_rects = [cv2.boundingRect(c) for c in contours]
|
||||||
# The largest contour is certainly the text that we're looking for.
|
|
||||||
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
|
if bounding_rects:
|
||||||
x, y, w, h = largest_rect
|
# The largest contour is certainly the text that we're looking for.
|
||||||
cropped = image[y:y+h, x:x+w]
|
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
|
||||||
|
x, y, w, h = largest_rect
|
||||||
|
# Commas sometimes go a little below the bounding box and we don't want
|
||||||
|
# to lose them or turn them into periods.
|
||||||
|
img_h, img_w = image.shape
|
||||||
|
cropped = image[y:min(img_h, y+h+6), x+8:x+w-8]
|
||||||
|
else:
|
||||||
|
cropped = image
|
||||||
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
|
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
|
||||||
return bordered
|
return bordered
|
||||||
def ocr_image(image, config):
|
def ocr_image(image, config):
|
||||||
cropped = crop_to_text(image)
|
|
||||||
return pytesseract.image_to_string(
|
return pytesseract.image_to_string(
|
||||||
~cropped,
|
image,
|
||||||
config=config
|
config=config
|
||||||
)
|
)
|
||||||
|
|
||||||
def main(f):
|
def main(f):
|
||||||
|
directory, filename = os.path.split(f)
|
||||||
|
filename_sans_ext, ext = os.path.splitext(filename)
|
||||||
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
|
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
|
||||||
print(ocr_image(image, "--psm 7"))
|
cropped = crop_to_text(image)
|
||||||
|
ocr_data_dir = os.path.join(directory, "ocr_data")
|
||||||
|
os.makedirs(ocr_data_dir, exist_ok=True)
|
||||||
|
out_imagepath = os.path.join(ocr_data_dir, filename)
|
||||||
|
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
|
||||||
|
cv2.imwrite(out_imagepath, cropped)
|
||||||
|
txt = ocr_image(cropped, "--psm 7")
|
||||||
|
with open(out_txtpath, "w") as txt_file:
|
||||||
|
txt_file.write(txt)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main(sys.argv[1])
|
main(sys.argv[1])
|
||||||
|
@ -0,0 +1,29 @@
|
|||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("files", nargs="+")
|
||||||
|
|
||||||
|
def main(files):
    """Print the contents of per-cell text files as one CSV table on stdout.

    Each path's basename must begin with ``<row>-<column>`` before its first
    ``.`` (for example ``3-12.txt``); both parts are parsed as integers.
    The text of each file becomes one CSV field, unmodified.

    Cells are ordered by their parsed ``(row, column)`` numbers, not by the
    order of ``files``. A shell glob sorts names as text, so ``0-10`` would
    otherwise land before ``0-2`` and ``10-0`` before ``2-0``, scrambling
    columns or indexing past the end of the row list.

    Args:
        files: iterable of paths to the cell text files.

    Raises:
        ValueError: if a basename does not parse as ``<int>-<int>``.
        OSError: if a file cannot be opened or read.
    """
    cells = []
    for f in files:
        filename = os.path.basename(f)
        with open(f) as of:
            txt = of.read()
        row, column = map(int, filename.split(".")[0].split("-"))
        cells.append((row, column, txt))

    # Numeric ordering on (row, column). list.sort is stable, so files that
    # share the same coordinates keep their relative argument order.
    cells.sort(key=lambda cell: cell[:2])

    # Start a new output row every time the row number changes.
    rows = []
    current_row = None
    for row, _column, txt in cells:
        if row != current_row:
            rows.append([])
            current_row = row
        rows[-1].append(txt)

    csv_file = io.StringIO()
    writer = csv.writer(csv_file)
    writer.writerows(rows)
    print(csv_file.getvalue())
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(args.files)
|
Loading…
Reference in New Issue