#!/bin/sh
#
# OCR every page of a PDF with tesseract and collect the text in one file.
#
# Usage: <script> SOURCE.pdf LANG [RESOLUTION]
#   SOURCE.pdf  PDF file to process.
#   LANG        Language code handed to `tesseract -l`.
#   RESOLUTION  Optional rendering density passed to `convert -density`
#               (default 600; set to the resolution the scanner used,
#               the higher, the better).
#
# Environment:
#   PDFINFO     Path of the pdfinfo binary
#               (default /usr/local/libexec/xpdf/pdfinfo).
#
# Output:
#   The recognised text of each page is appended, in page order, to
#   "SOURCE.pdf.txt". The elapsed wall-clock time in seconds is printed on
#   stdout when the run finishes.
#
# Exit status:
#   0 all pages processed, 1 at least one page (or the setup) failed,
#   2 usage error.

set -u

START=$(date +%s)

if [ "$#" -lt 2 ]; then
  printf 'Usage: %s SOURCE.pdf LANG [RESOLUTION]\n' "${0##*/}" >&2
  exit 2
fi

SOURCE=$1
# Deliberately not called LANG: LANG is the locale variable, and assigning to
# it while it is exported would change the locale of every child process.
OCR_LANG=$2
RESOLUTION=${3:-600}
OUTPUT=$SOURCE
PDFINFO=${PDFINFO:-/usr/local/libexec/xpdf/pdfinfo}

if [ ! -r "$SOURCE" ]; then
  printf '%s: cannot read %s\n' "${0##*/}" "$SOURCE" >&2
  exit 1
fi

# Number of pages in the PDF, taken from the "Pages:" line of pdfinfo.
# Anchoring on the field name avoids matching other metadata lines that
# merely contain the word "pages".
PAGES=$("$PDFINFO" "$SOURCE" | awk '/^Pages:/ { print $2; exit }')
case $PAGES in
  '' | *[!0-9]*)
    printf '%s: could not determine page count of %s\n' \
      "${0##*/}" "$SOURCE" >&2
    exit 1
    ;;
esac

# Intermediate images and per-page text live in a private directory so they
# cannot clobber files in the working directory and are removed on any exit.
WORKDIR=$(mktemp -d) || {
  printf '%s: mktemp failed\n' "${0##*/}" >&2
  exit 1
}
trap 'rm -rf -- "$WORKDIR"' EXIT
trap 'exit 1' HUP INT TERM

# Make sure the result file exists even for a zero-page document.
touch "$OUTPUT.txt"

STATUS=0
i=1
while [ "$i" -le "$PAGES" ]; do
  page="$WORKDIR/page$i"

  # "file[N]" selects a single page for convert; its index is zero-based.
  # tesseract appends ".txt" to the output base name it is given.
  if convert -density "$RESOLUTION" -depth 8 "${SOURCE}[$((i - 1))]" "$page.png" &&
    tesseract "$page.png" "$page" -l "$OCR_LANG" &&
    cat "$page.txt" >> "$OUTPUT.txt"; then
    :
  else
    # Keep going so one bad page does not discard the rest of the document.
    printf '%s: failed to process page %s of %s\n' \
      "${0##*/}" "$i" "$SOURCE" >&2
    STATUS=1
  fi

  rm -f -- "$page.png" "$page.txt"
  i=$((i + 1))
done

END=$(date +%s)
DIFF=$((END - START))
printf '%s\n' "$DIFF"

exit "$STATUS"