Advertisement
TringaliLuca

Create a searchable PDF from images or a raster PDF with OCR

Mar 21st, 2017
342
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 1.64 KB | None | 0 0
  1. #!/bin/bash
  2. echo Creating a searchable PDF for $1
  3. fullname=$(basename "$1")
  4. name=${fullname%.*}
  5.  
  6. dpi="300" #Predefined definition is 300dpi
  7. language="ita" #Predefined language is italian
  8.  
  9. if [[ -z $1 ]]; then
  10. echo "Usage:"
  11. echo "searchable-pdf-ocr.sh originalfile.pdf eng 300"
  12. echo "searchable-pdf-ocr.sh image1.png image2.png image3.png"
  13. echo "When you specify a pdf, you can also set the language and resolution (in dpi). When you specify a list of images, you cannot set language or dots-per-inch by command line."
  14. echo "WARNING: This script does not work if your filenames contain spaces."
  15. echo "This script has been written by Luca Tringali - TRINGALINVENT [at] libero.it"
  16. exit 0
  17. fi
  18.  
  19. if [[ ! -z $2 ]]; then
  20. language="$2"
  21. fi
  22.  
  23. if [[ ! -z $3 ]]; then
  24. dpi="$3"
  25. fi
  26.  
  27. echo "Language and dpi:"
  28. echo $language
  29. echo $dpi
  30.  
  31. if [[ "$1" =~ .*\.(pdf|eps) ]]; then
  32. #or just [[ "$1" == *.pdf  ]]
  33. echo "Extracting pages"
  34. gs -dNOPAUSE -q -r$dpix$dpi -sDEVICE=tiff32nc -dBATCH -sOutputFile="$name-%04d.tmppage.tiff" "$fullname"
  35. images=( *.tmppage.tiff )
  36. else
  37. images=$@
  38. fi
  39.  
  40. for f in ${images[@]}; do
  41.   echo $f
  42.   if [[ ! "$f" =~ .*\.(tiff) ]]; then
  43.    echo "Converting to TIFF..."
  44.    convert "$f" -background white -flatten +matte "${f%.*}.tiff"
  45.   fi
  46.  
  47.   echo "Tesseract OCR..."
  48.   tesseract -l $language -psm 3 "${f%.*}.tiff" "${f%.*}ocr" pdf 1>/dev/null 2>&1
  49.  
  50.   rm "${f%.*}.tiff"
  51.   mv ${f%.*}ocr.pdf $f.tmp.pdf
  52. done
  53.  
  54. echo "Merging all pages into a single PDF..."
  55. gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=${name}.searchable.pdf *.tmp.pdf
  56.  
  57. rm *.tmp.pdf
  58. echo "Created $name.searchable.pdf"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement