Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- sudo apt-get install gscan2pdf
#!/bin/bash
#
# scan2PDF - scan a number of pages and turn them into one PDF via OCR.
#
# Usage: <script> filename number_of_pages
#   filename         output name WITHOUT the ".pdf" extension; relative names
#                    are resolved against the directory the script is run from
#   number_of_pages  how many pages to scan (integer >= 1)
#
# Requires: tesseract 3.03 for OCR to PDF
#           scanimage for scanning, I use 1.0.24
#           pdfunite to merge multiple PDF into one, I use 0.26.5
#
# Use `scanimage -L` to get a list of devices,
# e.g. device `genesys:libusb:006:003' is a Canon LiDE 210 flatbed scanner,
# then put genesys:libusb:006:003 into SCANNER below (or export SCANNER
# before running). Play with CONTRAST to get good images.
#
# Exit status: 0 on success, 1 on usage errors, 2 on runtime failures.

set -euo pipefail

# Settings; each one may be overridden from the environment.
readonly DPI=${DPI:-300}
readonly TESS_LANG=${TESS_LANG:-nor}                  # Language that Tesseract uses for OCR
readonly SCANNER=${SCANNER:-genesys:libusb:006:003}   # My USB scanner
readonly CONTRAST=${CONTRAST:-35}                     # Contrast to remove paper look

readonly FILENAME=${1:-}   # Argument 1, filename
readonly PAGES=${2:-}      # Argument 2, number of pages

# Print the usage error on stderr and exit with status 1.
usage() {
  echo "error: Usage: $0 filename number_of_pages" >&2
  exit 1
}

# Print an error message on stderr and exit with status 2.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 2
}

# Check that a file name was given and that the page count is a number.
re='^[0-9]+$'
[[ -n "$FILENAME" ]] || usage
[[ "$PAGES" =~ $re ]] || usage
pages=$((10#$PAGES))       # force base 10 so "08" is not parsed as octal
((pages >= 1)) || usage

for tool in scanimage tesseract pdfunite; do
  command -v "$tool" >/dev/null || die "required tool '$tool' not found"
done

# Resolve the output path now, because the working directory changes below.
case "$FILENAME" in
  /*) output="${FILENAME}.pdf" ;;
  *) output="${PWD}/${FILENAME}.pdf" ;;
esac

# Private temporary directory, created next to where the script is run
# (the scanned TIFF files land here) and removed on every exit path.
script_name=$(basename -- "$0" .sh)
tmp_dir=$(mktemp -d -- "${PWD}/${script_name}-tmp.XXXXXX") \
  || die "cannot create temporary directory"
cleanup() {
  cd / || true
  rm -rf -- "${tmp_dir:?}"
}
trap cleanup EXIT

# scanimage writes its batch files (out1.tif, out2.tif, ...) to the current
# directory, so work from inside the temporary directory.
cd -- "$tmp_dir" || die "cannot enter $tmp_dir"

echo Starts Scanimage...
# A non-zero status is only reported: pages that were already scanned are
# still usable, and the check below catches the "nothing scanned" case.
scanimage -d "$SCANNER" --format=tiff --mode Color --resolution "$DPI" -p \
  --contrast "$CONTRAST" --batch-start=1 --batch-count="$pages" --batch-prompt \
  || echo "Warning: scanimage exited with an error" >&2

# Collect the pages by number. A plain *.tif / *.pdf glob sorts out10 before
# out2 and would scramble documents with ten or more pages.
tifs=()
for ((i = 1; i <= pages; i++)); do
  if [[ -f "out${i}.tif" ]]; then
    tifs+=("out${i}.tif")
  fi
done
((${#tifs[@]} > 0)) || die "no pages were scanned"

echo Starts Tesseract OCR
pdfs=()
for tif in "${tifs[@]}"; do
  page=${tif%.tif}
  tesseract "$tif" "$page" -l "$TESS_LANG" pdf \
    || die "tesseract failed on $tif"
  pdfs+=("${page}.pdf")
done

if ((${#pdfs[@]} == 1)); then
  # Only one page, just copy the PDF back.
  cp -- "${pdfs[0]}" "$output" || die "cannot write $output"
else
  # More pages, merge them in page order into one PDF.
  pdfunite "${pdfs[@]}" "$output" || die "pdfunite failed"
fi

printf '%s.pdf done\n' "$FILENAME"
# The temporary directory is removed by the EXIT trap.
Add Comment
Please, Sign In to add comment