Use cleaner filenames for intermediate files

main
Eric Ihli 5 years ago
parent e49fffa5a7
commit aa900de4e7

@ -24,14 +24,18 @@ output~ to a code block will minimize that noise.
** To get CSV data from a table in a scanned pdf document: ** To get CSV data from a table in a scanned pdf document:
* #+BEGIN_SRC shell :results none :session *Shell*
#+BEGIN_SRC shell TABLES=("/tmp/example-1/example-1.pdf" "/tmp/example-2/example-2.pdf")
python -m table_ocr.prepare_pdfs /tmp/example-1/example-1.pdf /tmp/example-2/example-2.pdf > /tmp/pdf-images.txt python -m table_ocr.prepare_pdfs $TABLES | grep .png > /tmp/pdf_images.txt
cat /tmp/pdf-images.txt | grep .png | xargs -I{} python -m table_ocr.extract_tables {} # All pngs that don't have "table" in their name. Assume "table" has already been found for files with table in name.
find /tmp -iregex ".*example.*table.*\.png" 2>/dev/null | xargs -I{} python -m table_ocr.extract_cells_from_table {} cat /tmp/pdf-images.txt | xargs -I{} python -m table_ocr.extract_tables {} # | grep tables > /tmp/extracted-tables.txt
find /tmp -iregex ".*example.*cells.*\.png" 2>/dev/null | xargs -I{} python -m table_ocr.ocr_image {} cat /tmp/extracted-tables.txt | xargs -I{} python -m table_ocr.extract_cells_from_table {} # | grep cells > /tmp/extracted-cells.txt
cat /tmp/extracted-cells.txt | xargs -I{} python -m table_ocr.ocr_image {}
# This next one needs to be run on each subdirectory one at a time.
python -m table_ocr.ocr_to_csv $(find . -iregex ".*cells.*ocr_data.*\.txt" 2>/dev/null)
#+END_SRC #+END_SRC
* Preparing data * Preparing data
** Converting PDFs to images ** Converting PDFs to images
@ -61,6 +65,9 @@ def pdfimages(pdf_filepath):
Uses the `pdfimages` utility from Poppler Uses the `pdfimages` utility from Poppler
(https://poppler.freedesktop.org/). Creates images out of each page. Images (https://poppler.freedesktop.org/). Creates images out of each page. Images
are prefixed by their name sans extension and suffixed by their page number. are prefixed by their name sans extension and suffixed by their page number.
This should work for PDFs with up to 999 pages, since
find_matching_files_in_dir uses 3 digits in its regex.
""" """
directory, filename = os.path.split(pdf_filepath) directory, filename = os.path.split(pdf_filepath)
filename_sans_ext = filename.split(".pdf")[0] filename_sans_ext = filename.split(".pdf")[0]
@ -74,7 +81,7 @@ def find_matching_files_in_dir(file_prefix, directory):
files = [ files = [
filename filename
for filename in os.listdir(directory) for filename in os.listdir(directory)
if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename) if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename)
] ]
return files return files
#+END_SRC #+END_SRC
@ -730,20 +737,22 @@ def main(files):
results = [] results = []
for f in files: for f in files:
directory, filename = os.path.split(f) directory, filename = os.path.split(f)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
tables = find_tables(image) tables = find_tables(image)
files = [] files = []
filename_sans_extension = os.path.splitext(filename)[0]
if tables:
os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
for i, table in enumerate(tables): for i, table in enumerate(tables):
filename_sans_extension = os.path.splitext(filename)[0] table_filename = "table-{:03d}.png".format(i)
table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i) table_filepath = os.path.join(directory, filename_sans_extension, table_filename)
table_filepath = os.path.join(directory, table_filename)
files.append(table_filepath) files.append(table_filepath)
cv2.imwrite(table_filepath, table) cv2.imwrite(table_filepath, table)
results.append((f, files)) if tables:
results.append((f, files))
for image_filename, table_filenames in results: for image_filename, table_filenames in results:
print("{}\n{}\n".format(image_filename, "\n".join(table_filenames))) print("\n".join(table_filenames))
<<detect-table>> <<detect-table>>

@ -11,20 +11,22 @@ def main(files):
results = [] results = []
for f in files: for f in files:
directory, filename = os.path.split(f) directory, filename = os.path.split(f)
image = cv2.imread(f, cv2.IMREAD_GRAYSCALE) image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
tables = find_tables(image) tables = find_tables(image)
files = [] files = []
filename_sans_extension = os.path.splitext(filename)[0]
if tables:
os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
for i, table in enumerate(tables): for i, table in enumerate(tables):
filename_sans_extension = os.path.splitext(filename)[0] table_filename = "table-{:03d}.png".format(i)
table_filename = "{}-table-{:03d}.png".format(filename_sans_extension, i) table_filepath = os.path.join(directory, filename_sans_extension, table_filename)
table_filepath = os.path.join(directory, table_filename)
files.append(table_filepath) files.append(table_filepath)
cv2.imwrite(table_filepath, table) cv2.imwrite(table_filepath, table)
results.append((f, files)) if tables:
results.append((f, files))
for image_filename, table_filenames in results: for image_filename, table_filenames in results:
print("{}\n{}\n".format(image_filename, "\n".join(table_filenames))) print("\n".join(table_filenames))
def find_tables(image): def find_tables(image):
BLUR_KERNEL_SIZE = (17, 17) BLUR_KERNEL_SIZE = (17, 17)

@ -23,7 +23,7 @@ def main(files):
writer = csv.writer(csv_file) writer = csv.writer(csv_file)
writer.writerows(rows) writer.writerows(rows)
print(csv_file.getvalue()) print(csv_file.getvalue())
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
main(args.files) main(args.files)

@ -55,6 +55,9 @@ def pdfimages(pdf_filepath):
Uses the `pdfimages` utility from Poppler Uses the `pdfimages` utility from Poppler
(https://poppler.freedesktop.org/). Creates images out of each page. Images (https://poppler.freedesktop.org/). Creates images out of each page. Images
are prefixed by their name sans extension and suffixed by their page number. are prefixed by their name sans extension and suffixed by their page number.
This should work for PDFs with up to 999 pages, since
find_matching_files_in_dir uses 3 digits in its regex.
""" """
directory, filename = os.path.split(pdf_filepath) directory, filename = os.path.split(pdf_filepath)
filename_sans_ext = filename.split(".pdf")[0] filename_sans_ext = filename.split(".pdf")[0]
@ -68,7 +71,7 @@ def find_matching_files_in_dir(file_prefix, directory):
files = [ files = [
filename filename
for filename in os.listdir(directory) for filename in os.listdir(directory)
if re.match(r"{}.*\.png".format(re.escape(file_prefix)), filename) if re.match(r"{}-\d{{3}}.*\.png".format(re.escape(file_prefix)), filename)
] ]
return files return files
def preprocess_img(filepath): def preprocess_img(filepath):

Loading…
Cancel
Save