Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
#
# searchable-pdf-ocr.sh - create a searchable PDF (page image + OCR text).
#
# Usage:
#   searchable-pdf-ocr.sh originalfile.pdf [language] [dpi]
#   searchable-pdf-ocr.sh image1.png image2.png image3.png
#
# When the first argument ends in .pdf or .eps it is rasterised with
# Ghostscript (one TIFF per page) and the optional 2nd/3rd arguments select
# the tesseract language and the resolution in dpi. Otherwise every argument
# is taken as one page image and the default language/dpi are used.
# Each page is OCRed with tesseract and the per-page PDFs are merged, in
# page/argument order, into "<name>.searchable.pdf" in the current directory,
# where <name> is the basename of the first argument without its extension.
#
# All intermediate files live in a private temporary directory, so input
# files are never modified or deleted and unrelated files in the current
# directory are never merged or removed.
#
# Requires: gs, tesseract, and convert (ImageMagick) for non-TIFF images.
# Exit status: 0 on success (or when only the usage text is shown),
#              1 on any error.
# Written by Luca Tringali - TRINGALINVENT [at] libero.it

default_dpi="300"       # Predefined resolution is 300dpi
default_language="ita"  # Predefined language is Italian

# Private scratch directory for the current run; empty when none exists.
ocr_workdir=""

# Print the usage text to stdout.
usage() {
  echo "Usage:"
  echo "searchable-pdf-ocr.sh originalfile.pdf eng 300"
  echo "searchable-pdf-ocr.sh image1.png image2.png image3.png"
  echo "When you specify a pdf, you can also set the language and resolution (in dpi). When you specify a list of images, you cannot set language or dots-per-inch by command line."
  echo "This script has been written by Luca Tringali - TRINGALINVENT [at] libero.it"
}

# Print a diagnostic message to stderr.
fail() {
  printf '%s\n' "$*" >&2
}

# Remove the scratch directory, if one exists. Also installed as EXIT trap so
# an interrupted run does not leave temporary files behind.
cleanup_workdir() {
  if [[ -n "$ocr_workdir" && -d "$ocr_workdir" ]]; then
    rm -rf -- "$ocr_workdir"
  fi
  ocr_workdir=""
}
trap cleanup_workdir EXIT

# OCR a single TIFF into a one-page searchable PDF.
# Arguments: $1 - tesseract language code
#            $2 - input TIFF
#            $3 - output path without the ".pdf" suffix
# Returns:   0 if "$3.pdf" was produced, 1 otherwise.
ocr_page() {
  local language=$1 input=$2 outbase=$3 output

  echo "Tesseract OCR..."
  # No page-segmentation flag: mode 3 (fully automatic) is tesseract's
  # default, and the old "-psm" spelling is rejected by tesseract >= 4.
  # Tesseract is chatty even on success, so its output is only shown when
  # it fails.
  if ! output=$(tesseract -l "$language" "$input" "$outbase" pdf 2>&1); then
    fail "tesseract failed on $input:"
    fail "$output"
    return 1
  fi
  if [[ ! -e "$outbase.pdf" ]]; then
    fail "tesseract produced no PDF for $input"
    return 1
  fi
}

# Produce "<name>.searchable.pdf" from a document or a list of images.
# Globals:   ocr_workdir (read) - existing scratch directory
# Arguments: $1 - output base name
#            $2 - tesseract language code
#            $3 - resolution in dpi (document mode only)
#            $4 - 1 for document mode (pdf/eps), 0 for image-list mode
#            $5... - the document (document mode) or the page images
# Returns:   0 on success, 1 on the first failing step.
build_searchable_pdf() {
  local name=$1 language=$2 dpi=$3 document_mode=$4
  shift 4
  local -a pages=() page_pdfs=()
  local page tiff label index=0

  if (( document_mode )); then
    if [[ ! -f "$1" ]]; then
      fail "Input file not found: $1"
      return 1
    fi
    echo "Extracting pages"
    # The document is passed with the path the user gave, so it does not
    # have to live in the current directory.
    if ! gs -dNOPAUSE -q -r"${dpi}x${dpi}" -sDEVICE=tiff32nc -dBATCH \
      -sOutputFile="$ocr_workdir/page-%04d.tiff" "$1"; then
      fail "Ghostscript could not extract pages from $1"
      return 1
    fi
    pages=("$ocr_workdir"/page-*.tiff)
    # An unmatched glob stays literal, so check that the first entry exists.
    if [[ ! -e "${pages[0]}" ]]; then
      fail "No pages were extracted from $1"
      return 1
    fi
  else
    pages=("$@")
  fi

  for page in "${pages[@]}"; do
    index=$((index + 1))
    printf -v label '%04d' "$index"
    echo "$page"
    if [[ ! -f "$page" ]]; then
      fail "Input file not found: $page"
      return 1
    fi
    if [[ "$page" =~ \.tiff$ ]]; then
      # Already a TIFF: OCR it in place (read-only, it is never deleted).
      tiff=$page
    else
      if ! command -v convert >/dev/null 2>&1; then
        fail "Required tool not found: convert"
        return 1
      fi
      echo "Converting to TIFF..."
      tiff="$ocr_workdir/image-$label.tiff"
      if ! convert "$page" -background white -flatten +matte "$tiff"; then
        fail "convert failed on $page"
        return 1
      fi
    fi
    ocr_page "$language" "$tiff" "$ocr_workdir/ocr-$label" || return 1
    page_pdfs+=("$ocr_workdir/ocr-$label.pdf")
  done

  echo "Merging all pages into a single PDF..."
  if ! gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q \
    -sDEVICE=pdfwrite -sOutputFile="${name}.searchable.pdf" \
    "${page_pdfs[@]}"; then
    fail "Ghostscript could not merge the pages"
    return 1
  fi
  echo "Created $name.searchable.pdf"
}

# Entry point: parse the command line, then build the searchable PDF.
# Arguments: the script's command line (see Usage at the top of the file).
# Returns:   0 on success or when usage is shown, 1 on error.
main() {
  if [[ -z "${1:-}" ]]; then
    usage
    return 0
  fi

  local fullname name tool status=0
  local language=$default_language dpi=$default_dpi document_mode=0

  fullname=$(basename -- "$1")
  name=${fullname%.*}

  # Language and dpi are only positional options in document mode; in
  # image-list mode the 2nd and 3rd arguments are images, not settings.
  if [[ "$1" =~ \.(pdf|eps)$ ]]; then
    document_mode=1
    if [[ -n "${2:-}" ]]; then
      language=$2
    fi
    if [[ -n "${3:-}" ]]; then
      dpi=$3
    fi
  fi

  if [[ ! "$dpi" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    fail "Invalid resolution (dpi): $dpi"
    return 1
  fi

  for tool in gs tesseract; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      fail "Required tool not found: $tool"
      return 1
    fi
  done

  echo "Creating a searchable PDF for $1"
  echo "Language and dpi:"
  echo "$language"
  echo "$dpi"

  if ! ocr_workdir=$(mktemp -d); then
    ocr_workdir=""
    fail "Could not create a temporary directory"
    return 1
  fi

  build_searchable_pdf "$name" "$language" "$dpi" "$document_mode" "$@" \
    || status=$?
  cleanup_workdir
  return "$status"
}

main "$@" || exit $?
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement