You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
56 lines
2.0 KiB
Python
56 lines
2.0 KiB
Python
import os
|
|
import sys
|
|
|
|
import requests
|
|
import table_ocr.util
|
|
import table_ocr.extract_tables
|
|
import table_ocr.extract_cells
|
|
import table_ocr.ocr_image
|
|
import table_ocr.ocr_to_csv
|
|
def download_image_to_tempdir(url, filename=None):
    """Download the resource at *url* into a fresh demo temp directory.

    Args:
        url: Address to fetch with ``requests.get``.
        filename: Name to give the saved file. Defaults to the last path
            component of *url* (``os.path.basename(url)``).

    Returns:
        The path of the written file, located inside the directory returned
        by ``table_ocr.util.make_tempdir("demo")``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never saved as if it were the image.
    """
    if filename is None:
        filename = os.path.basename(url)
    # Using the response as a context manager releases the streamed
    # connection even when writing the file fails part-way through.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        tempdir = table_ocr.util.make_tempdir("demo")
        filepath = os.path.join(tempdir, filename)
        with open(filepath, 'wb') as f:
            # iter_content() defaults to 1-byte chunks; ask for a sensible
            # block size so the file is not written one byte at a time.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return filepath
|
|
|
|
def main(url):
    """Run the table-OCR demo pipeline on the image found at *url*.

    The image is downloaded, tables are extracted from it, each table is
    split into cells, and every cell is OCR'd. Progress and a short preview
    of the first three cells are printed along the way.

    Returns:
        The result of ``table_ocr.ocr_to_csv.text_files_to_csv`` for the
        OCR output files of the first table processed. Falls through
        (returning ``None``) when no table is found.
    """
    downloaded_path = download_image_to_tempdir(url)
    extracted = table_ocr.extract_tables.main([downloaded_path])
    invocation = f"extract_tables.main([{downloaded_path}])."
    print("Running `{}`".format(invocation))
    print("Extracted the following tables from the image:")
    print(extracted)
    for image, tables in extracted:
        print(f"Processing tables for {image}.")
        for table in tables:
            print(f"Processing table {table}.")
            cells = table_ocr.extract_cells.main(table)
            ocr_paths = []
            for cell in cells:
                ocr_paths.append(table_ocr.ocr_image.main(cell, None))
            print("Extracted {} cells from {}".format(len(ocr_paths), table))
            print("Cells:")
            # Only preview the first three cells to keep the demo short.
            preview = zip(cells[:3], ocr_paths[:3])
            for cell_path, ocr_path in preview:
                with open(ocr_path) as ocr_file:
                    raw_text = ocr_file.read()
                # Tesseract puts line feeds at the end of the text;
                # strip them out before printing.
                print("{}: {}".format(cell_path, raw_text.strip()))
            # With more than three cells (likely), print an ellipsis to
            # show that the output above was truncated for the demo.
            if len(cells) > 3:
                print("...")
            # NOTE(review): returning here ends the demo after the first
            # table; confirm this nesting against the original file, whose
            # indentation was lost.
            return table_ocr.ocr_to_csv.text_files_to_csv(ocr_paths)
|
|
|
|
if __name__ == "__main__":
    # The demo needs the image URL as its first command-line argument;
    # exit with a usage hint instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} IMAGE_URL")
    csv_output = main(sys.argv[1])
    print()
    print("Here is the entire CSV output:")
    print()
    print(csv_output)
|