baptx

tesseract-pdf2text.sh

Jan 14th, 2018
112
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #!/bin/sh
  # tesseract-pdf2text.sh - OCR a PDF into a plain-text file.
  #
  # Rasterizes the requested pages of <input> to 300 dpi PPM images with
  # pdftoppm, runs tesseract on every page image in page order, and APPENDS
  # the recognized text to <output>.
  #
  # Usage:
  #   tesseract-pdf2text.sh <input> <output> <lang> [startpage] [endpage]
  #
  #   <input>      PDF file to read
  #   <output>     text file the OCR result is appended to
  #   <lang>       tesseract language code (e.g. eng, deu, fra)
  #   [startpage]  first page to convert (passed to pdftoppm -f)
  #   [endpage]    last page to convert (passed to pdftoppm -l)
  #
  # Exit status: 0 on success; 1 on usage error, if rasterization fails or
  # yields no pages, or if tesseract fails on at least one page.
  set -u

  if [ "$#" -lt 3 ]; then
    echo "Parameters: <input> <output> <3 chars language code> [startpage] [endpage]" >&2
    exit 1
  fi

  input=$1
  output=$2
  lang=$3
  first_page=${4:-}
  last_page=${5:-}

  # A leading '-' would make pdftoppm treat the file name as an option.
  case $input in
    -*) input=./$input ;;
  esac

  # Page images go into a private scratch directory, so they are found no
  # matter where <input> lives and no unrelated *.ppm file in the current
  # directory is ever OCRed or deleted.
  workdir=$(mktemp -d "${TMPDIR:-/tmp}/pdf2text.XXXXXX") || {
    echo "error: cannot create temporary directory" >&2
    exit 1
  }
  trap 'rm -rf -- "$workdir"' EXIT
  # Turn signals into a normal exit so the EXIT trap also runs on Ctrl-C.
  trap 'exit 1' HUP INT TERM

  # Build the pdftoppm option list in the positional parameters: this keeps
  # every option a separate, correctly quoted word without needing arrays.
  set -- -r 300
  if [ -n "$first_page" ]; then
    set -- "$@" -f "$first_page"
  fi
  if [ -n "$last_page" ]; then
    set -- "$@" -l "$last_page"
  fi

  # Use the input's base name as image prefix so the progress lines read
  # "<name>.pdf-<page>.ppm".
  if ! pdftoppm "$@" "$input" "$workdir/${input##*/}"; then
    echo "error: pdftoppm failed on '$input'" >&2
    exit 1
  fi

  status=0
  pages=0
  # pdftoppm zero-pads page numbers, so glob order is page order.
  for image in "$workdir"/*.ppm; do
    # An unmatched glob stays literal; skip it.
    [ -e "$image" ] || continue
    pages=$((pages + 1))
    printf 'OCR: %s\n' "${image##*/}"
    if ! tesseract -l "$lang" "$image" stdout >>"$output"; then
      echo "warning: tesseract failed on ${image##*/}" >&2
      status=1
    fi
  done

  if [ "$pages" -eq 0 ]; then
    echo "error: pdftoppm produced no page images for '$input'" >&2
    exit 1
  fi

  exit "$status"
RAW Paste Data