Add README and demo
parent
7e5516eb5d
commit
248fc827cc
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 15 KiB |
Binary file not shown.
After Width: | Height: | Size: 35 KiB |
@ -0,0 +1,55 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
import requests
|
||||
import table_ocr.util
|
||||
import table_ocr.extract_tables
|
||||
import table_ocr.extract_cells
|
||||
import table_ocr.ocr_image
|
||||
import table_ocr.ocr_to_csv
|
||||
def download_image_to_tempdir(url, filename=None):
    """Download the file at ``url`` into a temporary directory.

    The directory is obtained from ``table_ocr.util.make_tempdir("demo")``.

    Args:
        url: Address of the image to fetch.
        filename: Name to give the saved file. When omitted, the last
            component of the URL's path is used.

    Returns:
        The path of the downloaded file inside the temporary directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if filename is None:
        # Imported here so the function does not rely on a file-level import.
        from urllib.parse import urlsplit

        # Use only the path component so a query string or fragment
        # (e.g. "table.png?raw=true") does not end up in the file name.
        filename = os.path.basename(urlsplit(url).path)
    # The context manager closes the streamed response when we are done.
    with requests.get(url, stream=True) as response:
        # Fail early instead of saving an HTML error page as the "image".
        response.raise_for_status()
        tempdir = table_ocr.util.make_tempdir("demo")
        filepath = os.path.join(tempdir, filename)
        with open(filepath, "wb") as f:
            # iter_content() defaults to 1-byte chunks; ask for larger ones
            # so the download is not written a byte at a time.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return filepath
|
||||
|
||||
def main(url):
    """Run the table-OCR demo on the image at ``url``.

    Downloads the image, extracts its tables, splits the first table into
    cells, runs OCR on every cell and prints progress along the way. Only
    the first three cells are echoed to keep the demo output short.

    Args:
        url: Address of an image that contains a table.

    Returns:
        The result of ``table_ocr.ocr_to_csv.text_files_to_csv`` for the OCR
        output files of the first table found, or ``None`` when no table
        was extracted.
    """
    image_filepath = download_image_to_tempdir(url)
    # Announce the step before running it so the message matches reality.
    print(f"Running `extract_tables.main([{image_filepath}]).`")
    image_tables = table_ocr.extract_tables.main([image_filepath])
    print("Extracted the following tables from the image:")
    print(image_tables)
    for image, tables in image_tables:
        print(f"Processing tables for {image}.")
        for table in tables:
            print(f"Processing table {table}.")
            cells = table_ocr.extract_cells.main(table)
            ocr = [
                table_ocr.ocr_image.main(cell, None)
                for cell in cells
            ]
            print("Extracted {} cells from {}".format(len(ocr), table))
            print("Cells:")
            for cell, ocr_path in zip(cells[:3], ocr[:3]):
                with open(ocr_path) as ocr_file:
                    # Tesseract puts line feeds at end of text.
                    # Strip them out.
                    text = ocr_file.read().strip()
                print("{}: {}".format(cell, text))
            # If we have more than 3 cells (likely), print an ellipsis
            # to show that we are truncating output for the demo.
            if len(cells) > 3:
                print("...")
            # The demo stops after the first table.
            return table_ocr.ocr_to_csv.text_files_to_csv(ocr)
    # No table was found in the image.
    return None
|
||||
|
||||
if __name__ == "__main__":
    # The demo takes the URL of an image as its first argument; exit with a
    # usage message (on stderr, status 1) rather than an IndexError traceback
    # when it is missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} IMAGE_URL")
    csv_output = main(sys.argv[1])
    print()
    print("Here is the entire CSV output:")
    print()
    print(csv_output)
|
Loading…
Reference in New Issue