You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
14 lines
586 B
Plaintext
14 lines
586 B
Plaintext
5 years ago
|
#!/bin/sh
#
# Run the table_ocr pipeline on a single PDF.
#
# Usage: <this-script> PDF
#
# Stages (each stage feeds the next through a newline-separated list file):
#   1. table_ocr.prepare_pdfs            PDF   -> lines containing ".png"
#   2. table_ocr.extract_tables          image -> lines containing "table"
#   3. table_ocr.extract_cells_from_table table -> lines containing "cells"
#   4. table_ocr.ocr_image               cell  (--psm 7 -l data-table)
#   5. table_ocr.ocr_to_csv on every *.txt under "<dirname of table>/cells"
#
# Output of stages 4 and 5 goes to this script's stdout.
#
# NOTE: `set -e` is deliberately not used: grep exits non-zero when a stage
# produces no matching lines, and in that case the remaining stages should
# simply have nothing to do rather than abort.
set -u

if [ "$#" -lt 1 ]; then
  printf 'Usage: %s PDF\n' "${0##*/}" >&2
  exit 2
fi

PDF=$1

# Intermediate lists live in a private directory instead of fixed names in
# /tmp, so concurrent runs do not clobber each other and the paths cannot be
# pre-created by another user.
work_dir=$(mktemp -d) || {
  printf '%s\n' 'error: mktemp failed' >&2
  exit 1
}
cleanup() { rm -rf -- "$work_dir"; }
trap cleanup EXIT
trap 'exit 1' HUP INT TERM

images_list=$work_dir/pdf-images.txt
tables_list=$work_dir/extracted-tables.txt
cells_list=$work_dir/extracted-cells.txt

# Stage 1: keep only the output lines that mention a .png file
# (-F: match the dot literally rather than as a regex wildcard).
python -m table_ocr.prepare_pdfs "$PDF" | grep -F -- '.png' > "$images_list"

# Stage 2: one extract_tables invocation per image path. Reading line by line
# keeps paths with spaces or quotes intact; python's stdin is /dev/null so it
# cannot consume the list being read by the loop.
while IFS= read -r image; do
  python -m table_ocr.extract_tables "$image" < /dev/null
done < "$images_list" | grep table > "$tables_list"

# Stage 3: one extract_cells_from_table invocation per table image.
while IFS= read -r table; do
  python -m table_ocr.extract_cells_from_table "$table" < /dev/null
done < "$tables_list" | grep cells > "$cells_list"

# Stage 4: OCR each cell image.
while IFS= read -r cell; do
  python -m table_ocr.ocr_image "$cell" --psm 7 -l data-table < /dev/null
done < "$cells_list"

# Stage 5: for every table, hand all text files under its cells directory to
# ocr_to_csv. `-exec … {} +` passes the paths as separate arguments without
# word-splitting; nothing is run when no .txt file is found.
while IFS= read -r image; do
  dir=$(dirname -- "$image")
  find "$dir/cells" -name '*.txt' \
    -exec python -m table_ocr.ocr_to_csv {} + < /dev/null
done < "$tables_list"
|