ArchivedTools/ScanScripts/Windows/ocr.sh
2023-12-21 01:48:37 +01:00

30 lines
751 B
Bash

#!/bin/sh
# OCR every page of a PDF and concatenate the recognised text.
#
# Usage: ocr.sh SOURCE.pdf LANGUAGE
#   SOURCE.pdf  PDF file to process
#   LANGUAGE    language code passed to `tesseract -l`
#
# Each page is rendered to a PNG with `convert`, run through `tesseract`,
# and the resulting text is appended to SOURCE.pdf.txt (the file is created
# if missing; existing content is kept). When all pages have been handled,
# the elapsed wall-clock time in seconds is printed to stdout.
#
# Environment:
#   PDFINFO  path to the pdfinfo binary
#            (default: /usr/local/libexec/xpdf/pdfinfo)
#   TMPDIR   parent directory for the scratch directory (default: /tmp)

if [ "$#" -lt 2 ]; then
  printf 'Usage: %s SOURCE.pdf LANGUAGE\n' "${0##*/}" >&2
  exit 2
fi

START=$(date +%s)
SOURCE=$1
# Not stored in LANG: that name is the locale environment variable, and
# assigning it would change the locale seen by every child process.
OCR_LANG=$2
OUTPUT=$SOURCE
RESOLUTION=600 # set to the resolution the scanner used (the higher, the better)
PDFINFO=${PDFINFO:-/usr/local/libexec/xpdf/pdfinfo}

# Take the page count from the "Pages:" line only, so that other pdfinfo
# fields which merely contain the word "pages" cannot pollute the value.
PAGES=$("$PDFINFO" "$SOURCE" | awk '/^Pages:/ { print $2; exit }')
case $PAGES in
  '' | *[!0-9]*)
    printf '%s: cannot determine page count of %s\n' "${0##*/}" "$SOURCE" >&2
    exit 1
    ;;
esac

# Per-page intermediates live in a private scratch directory that is removed
# on any exit path, instead of predictable names in the working directory.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/ocr.XXXXXX") || exit 1
trap 'rm -rf -- "$WORKDIR"' EXIT
trap 'exit 1' HUP INT TERM

# Make sure the output file exists without truncating earlier content.
: >> "$OUTPUT.txt" || exit 1

i=1
while [ "$i" -le "$PAGES" ]; do
  page_png=$WORKDIR/page$i.png
  page_base=$WORKDIR/page$i
  # The [N] suffix selects a single page; its index is zero-based.
  if convert -density "$RESOLUTION" -depth 8 "${SOURCE}[$((i - 1))]" "$page_png" &&
    tesseract "$page_png" "$page_base" -l "$OCR_LANG"; then
    # tesseract appends ".txt" to the output base name it is given.
    cat "$page_base.txt" >> "$OUTPUT.txt"
  else
    # Best effort: report the failed page and carry on with the rest.
    printf '%s: page %s failed, skipping\n' "${0##*/}" "$i" >&2
  fi
  rm -f -- "$page_png" "$page_base.txt"
  i=$((i + 1))
done

END=$(date +%s)
DIFF=$((END - START))
echo "$DIFF"