Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/sh
# pdf2txt - simple wrapper that runs OCR on the page images of a PDF file.
#
# This script is based on ocropdf by Christian Mahnke; the original script
# can be downloaded from
# http://groups.google.com/group/ocropus/attach/e3cd3c9c36dfce87/ocropdf?part=2
# Modified by Alvin from orangunix.blogspot.com.
#
# Usage:
#   pdf2txt input.pdf > hocr-output.html
#
# Only the output of 'ocroscript recognize' is written to stdout; progress
# and error messages go to stderr so the redirected output file stays clean.
#
# The following environment variables are recognised:
#   PDFIMAGES     Path to 'pdfimages' if it's not in your PATH.
#   CONVERT       Path to 'convert' if it's not in your PATH.
#   OCROSCRIPT    Path to 'ocroscript' if it's not in your PATH and this
#                 script is not placed in the ocropus source tree (in the
#                 'ocrocmd' directory).
#   tesslanguage  The language tesseract should use. Only read by the
#                 disabled rec-tess variant at the end of this script.
#
# Exit status:
#   1  usage error, or a required tool could not be found
#   2  creating the temporary directory, extracting the images or
#      converting them failed
#   otherwise the exit status of 'ocroscript recognize'
#
# Known problems:
#   - Only works with a single PDF file.
#   - Only images that pdfimages wrote as .pbm are handed to the recogniser.
#   - The temporary directory is left behind (its path is printed on stderr).
#
# Possible improvements:
#   - Reimplement it as a Lua script.
#   - Use this approach (imagemagick) to be able to recognise TIFF and other
#     file formats.

# Print a message on stderr and exit.
#   $1   exit status
#   $2.. message
die() {
  status=$1
  shift
  printf '%s\n' "$*" >&2
  exit "$status"
}

if test -z "$1"; then
  die 1 "Usage: ./pdf2txt input.pdf > hocr-output.html"
fi
input=$1

# Locate the external tools, honouring overrides from the environment.
if test -z "$PDFIMAGES"; then
  PDFIMAGES=$(command -v pdfimages)
fi
if test -z "$CONVERT"; then
  CONVERT=$(command -v convert)
fi
if test -z "$PDFIMAGES"; then
  die 1 "'pdfimages' not found in PATH (it's part of the xpdf package)"
fi
if test -z "$CONVERT"; then
  die 1 "'convert' not found in PATH (it's part of the imagemagick package)"
fi

if test -z "$OCROSCRIPT"; then
  OCROSCRIPT=$(command -v ocroscript)
  if test -z "$OCROSCRIPT"; then
    # Fall back to the layout of the ocropus source tree, relative to the
    # location of this script.
    DIR=$(dirname "$0")/../ocroscript
    OCROSCRIPT="$DIR/ocroscript"
    if test -z "$OCROSCRIPTS"; then
      # NOTE(review): this is assigned but not exported, exactly as in the
      # original. If ocroscript is meant to read OCROSCRIPTS from its
      # environment, an 'export OCROSCRIPTS' is needed here - TODO confirm.
      OCROSCRIPTS=$DIR/scripts
    fi
  fi
fi
# The fallback path above is only a guess, so verify it before doing any work.
if ! command -v "$OCROSCRIPT" >/dev/null 2>&1; then
  die 1 "'ocroscript' not found (looked for '$OCROSCRIPT'); set OCROSCRIPT"
fi

# mktemp -d creates the directory atomically with an unpredictable name.
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf2txt.XXXXXX") ||
  die 2 "could not create a temporary directory"
printf 'temporary directory: %s\n' "$TMP_DIR" >&2

# Extract every embedded image as $TMP_DIR/pdf2txt-NNN.<ext>.
printf 'running: %s %s %s\n' "$PDFIMAGES" "$input" "$TMP_DIR/pdf2txt" >&2
"$PDFIMAGES" "$input" "$TMP_DIR/pdf2txt" ||
  die 2 "'pdfimages' failed"

# Convert each extracted image to JPEG (<name>.<ext>.jpg). The glob is
# expanded once, before the loop starts, so the freshly written .jpg files
# are not picked up again.
for image in "$TMP_DIR"/*; do
  # With no matches the pattern stays literal; skip it.
  test -e "$image" || continue
  "$CONVERT" "$image" "$image.jpg" ||
    die 2 "'convert' failed"
done

# Hand the converted .pbm images to the recogniser with their full paths,
# so this works regardless of the current directory.
set -- "$TMP_DIR"/*.pbm.jpg
if ! test -e "$1"; then
  die 2 "no .pbm images were extracted from '$input'"
fi
"$OCROSCRIPT" recognize "$@"

# Disabled alternative kept from the original: recognise every converted
# image with tesseract instead of 'recognize'.
#   set -- "$TMP_DIR"/*.jpg
#   if test -z "$tesslanguage"; then
#     "$OCROSCRIPT" rec-tess "$@"
#   else
#     "$OCROSCRIPT" rec-tess --tesslanguage="$tesslanguage" "$@"
#   fi
#
# Removal of the temporary directory was disabled in the original and
# stays disabled so the intermediate images can be inspected:
#   rm -r "$TMP_DIR"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement