diff --git a/pdf_table_extraction_and_ocr.org b/pdf_table_extraction_and_ocr.org index 7ad322f..61dfd67 100644 --- a/pdf_table_extraction_and_ocr.org +++ b/pdf_table_extraction_and_ocr.org @@ -849,9 +849,32 @@ if __name__ == "__main__": #+NAME: extract_tables/__init__.py #+HEADER: :tangle table_ocr/extract_tables/__init__.py #+BEGIN_SRC python +import os import cv2 <> + +def main(files): + results = [] + for f in files: + directory, filename = os.path.split(f) + image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) + tables = find_tables(image) + files = [] + filename_sans_extension = os.path.splitext(filename)[0] + if tables: + os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True) + for i, table in enumerate(tables): + table_filename = "table-{:03d}.png".format(i) + table_filepath = os.path.join( + directory, filename_sans_extension, table_filename + ) + files.append(table_filepath) + cv2.imwrite(table_filepath, table) + if tables: + results.append((f, files)) + # Results is [[, []]] + return results #+END_SRC **** table_ocr/extract_tables/__main__.py @@ -876,44 +899,16 @@ For each image path given as an agument, outputs: #+NAME: extract_tables/__main__.py #+BEGIN_SRC python :tangle table_ocr/extract_tables/__main__.py :results none import argparse -import os -import cv2 - -from table_ocr.extract_tables import find_tables +from table_ocr.extract_tables import main parser = argparse.ArgumentParser() parser.add_argument("files", nargs="+") - - -def main(files): - results = [] - for f in files: - directory, filename = os.path.split(f) - image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) - tables = find_tables(image) - files = [] - filename_sans_extension = os.path.splitext(filename)[0] - if tables: - os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True) - for i, table in enumerate(tables): - table_filename = "table-{:03d}.png".format(i) - table_filepath = os.path.join( - directory, filename_sans_extension, table_filename - ) - files.append(table_filepath) - cv2.imwrite(table_filepath, table) - if tables: - results.append((f, files)) - # Results is [[, []]] - return results - -if __name__ == "__main__": - args = parser.parse_args() - files = args.files - results = main(files) - for image, tables in results: - print("\n".join(tables)) +args = parser.parse_args() +files = args.files +results = main(files) +for image, tables in results: + print("\n".join(tables)) #+END_SRC *** table_ocr/extract_cells/ diff --git a/table_ocr/extract_tables/__init__.py b/table_ocr/extract_tables/__init__.py index 7df2345..5cac689 100644 --- a/table_ocr/extract_tables/__init__.py +++ b/table_ocr/extract_tables/__init__.py @@ -1,3 +1,4 @@ +import os import cv2 def find_tables(image): @@ -47,3 +48,25 @@ def find_tables(image): # Leaving that step as a future TODO if it is ever necessary. images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects] return images + +def main(files): + results = [] + for f in files: + directory, filename = os.path.split(f) + image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) + tables = find_tables(image) + files = [] + filename_sans_extension = os.path.splitext(filename)[0] + if tables: + os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True) + for i, table in enumerate(tables): + table_filename = "table-{:03d}.png".format(i) + table_filepath = os.path.join( + directory, filename_sans_extension, table_filename + ) + files.append(table_filepath) + cv2.imwrite(table_filepath, table) + if tables: + results.append((f, files)) + # Results is [[, []]] + return results diff --git a/table_ocr/extract_tables/__main__.py b/table_ocr/extract_tables/__main__.py index 69aaa22..f272597 100644 --- a/table_ocr/extract_tables/__main__.py +++ b/table_ocr/extract_tables/__main__.py @@ -1,39 +1,11 @@ import argparse -import os -import cv2 - -from table_ocr.extract_tables import find_tables +from table_ocr.extract_tables import main parser = argparse.ArgumentParser() parser.add_argument("files", nargs="+") - - -def main(files): - results = [] - for f in files: - directory, filename = os.path.split(f) - image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) - tables = find_tables(image) - files = [] - filename_sans_extension = os.path.splitext(filename)[0] - if tables: - os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True) - for i, table in enumerate(tables): - table_filename = "table-{:03d}.png".format(i) - table_filepath = os.path.join( - directory, filename_sans_extension, table_filename - ) - files.append(table_filepath) - cv2.imwrite(table_filepath, table) - if tables: - results.append((f, files)) - # Results is [[, []]] - return results - -if __name__ == "__main__": - args = parser.parse_args() - files = args.files - results = main(files) - for image, tables in results: - print("\n".join(tables)) +args = parser.parse_args() +files = args.files +results = main(files) +for image, tables in results: + print("\n".join(tables))