#!/bin/sh
#
# OCR a scanned PDF into a plain-text file, one page at a time.
#
# Usage: <script> source.pdf [tesseract-language]
#
#   source.pdf          PDF to read. Recognised text is appended to
#                       "source.pdf.txt" (created if missing; existing
#                       content is kept).
#   tesseract-language  Value handed to "tesseract -l" (default: eng).
#
# Every page is rasterised with ImageMagick's "convert", recognised with
# "tesseract", and the per-page text is appended to the output file in page
# order. When done, the elapsed wall-clock time in seconds is printed on
# stdout.
#
# Exit status: 0 on success, 1 on a fatal error or if any page failed,
#              2 on a usage error.
#
# Requires: pdfinfo (xpdf/poppler), convert (ImageMagick), tesseract.

set -u

start=$(date +%s)

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -lt 1 ]; then
  printf 'Usage: %s source.pdf [tesseract-language]\n' "${0##*/}" >&2
  exit 2
fi

src=$1
# Deliberately NOT stored in a variable called LANG: LANG is the locale
# variable, it is normally exported, and overwriting it would hand an
# invalid locale to every child process started below.
ocr_lang=${2:-eng}
out=$src.txt
resolution=600 # DPI used for rasterising; set to the scanner's resolution

[ -f "$src" ] || die "error: no such file: $src"

# Prefer the xpdf build at its historical location; otherwise use whatever
# pdfinfo is on PATH.
pdfinfo_bin=/usr/local/libexec/xpdf/pdfinfo
[ -x "$pdfinfo_bin" ] || pdfinfo_bin=pdfinfo

for tool in "$pdfinfo_bin" convert tesseract; do
  command -v "$tool" >/dev/null 2>&1 || die "error: required tool not found: $tool"
done

# Match only the "Pages:" field so that other metadata lines which merely
# contain the word "pages" (e.g. the title) cannot pollute the count.
pages=$("$pdfinfo_bin" "$src" | awk '$1 == "Pages:" { print $2; exit }')
case $pages in
  '' | *[!0-9]*) die "error: could not determine page count of $src" ;;
esac

# Keep all intermediate files in a private directory so nothing in the
# current directory (or next to the source) can be overwritten or removed,
# and so they are cleaned up on every exit path.
workdir=$(mktemp -d "${TMPDIR:-/tmp}/pdfocr.XXXXXX") || die "error: mktemp failed"
trap 'rm -rf -- "$workdir"' EXIT
trap 'exit 1' HUP INT TERM

# Create the output file if needed without truncating existing content.
: >>"$out" || die "error: cannot write to $out"

failed=0
i=1
while [ "$i" -le "$pages" ]; do
  png=$workdir/page$i.png
  base=$workdir/page$i # tesseract appends ".txt" to this output base

  # ImageMagick page indices are zero-based, hence i - 1.
  if convert -density "$resolution" -depth 8 "${src}[$((i - 1))]" "$png" &&
    tesseract "$png" "$base" -l "$ocr_lang" &&
    cat "$base.txt" >>"$out"; then
    :
  else
    printf 'warning: page %s of %s failed, skipping\n' "$i" "$pages" >&2
    failed=$((failed + 1))
  fi

  # Drop per-page scratch files right away; 600 DPI PNGs are large.
  rm -f "$png" "$base.txt"
  i=$((i + 1))
done

end=$(date +%s)
printf '%s\n' "$((end - start))"

[ "$failed" -eq 0 ] || die "error: $failed of $pages page(s) could not be processed"