You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
#!/bin/sh
# Run the table_ocr pipeline on one PDF.
#
# Usage: <this-script> PDF
#
# Stages (each is a `python -m table_ocr.<module>` invocation):
#   1. pdf_to_images  PDF          -> output lines containing ".png" are kept
#   2. extract_tables <image>      -> output lines containing "table" are kept
#   3. extract_cells  <table>      -> output lines containing "cells" are kept
#   4. ocr_image      <cell> --psm 7 -l table-ocr
#   5. ocr_to_csv     <*.txt files found under <table dir>/cells>
#
# The output of stages 4 and 5 is written to stdout. The intermediate path
# lists live in a private temporary directory that is removed on exit, so
# concurrent runs do not overwrite each other's lists.

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  printf 'Usage: %s PDF\n' "${0##*/}" >&2
  exit 2
fi
PDF=$1

# Private scratch directory instead of fixed, predictable names in /tmp.
workdir=$(mktemp -d "${TMPDIR:-/tmp}/table-ocr.XXXXXX") || {
  printf '%s: cannot create temporary directory\n' "${0##*/}" >&2
  exit 1
}
trap 'rm -rf -- "$workdir"' EXIT
# Turn signals into a normal exit so the EXIT trap also runs under plain sh.
trap 'exit 1' HUP INT TERM

images_list=$workdir/pdf-images.txt
tables_list=$workdir/extracted-tables.txt
cells_list=$workdir/extracted-cells.txt

# Stage 1: -F makes ".png" a literal string rather than a regex.
python -m table_ocr.pdf_to_images "$PDF" | grep -F '.png' > "$images_list"

# Stages 2-4: xargs -I{} passes each input line as a single argument.
xargs -I{} python -m table_ocr.extract_tables {} < "$images_list" \
  | grep table > "$tables_list"

xargs -I{} python -m table_ocr.extract_cells {} < "$tables_list" \
  | grep cells > "$cells_list"

xargs -I{} python -m table_ocr.ocr_image {} --psm 7 -l table-ocr < "$cells_list"

# Stage 5: one ocr_to_csv run per extracted table.
# The list is read on fd 3 so the commands in the loop keep the script's stdin.
while IFS= read -r image <&3; do
  dir=$(dirname -- "$image")
  if [ ! -d "$dir/cells" ]; then
    printf '%s: no cells directory for %s, skipping\n' "${0##*/}" "$image" >&2
    continue
  fi
  # -exec ... {} + hands the file names to ocr_to_csv as separate arguments
  # without word-splitting them in the shell. Nothing is run when no *.txt
  # file exists, and an extremely long list may be split over several runs.
  find "$dir/cells" -name '*.txt' -exec python -m table_ocr.ocr_to_csv {} +
done 3< "$tables_list"
|