#!/bin/sh
# image2text
#
# Simple wrapper to recognize PDF files
#
# This script is based on ocropdf by Christian Mahnke
# original script can be downloaded from http://groups.google.com/group/ocropus/attach/e3cd3c9c36dfce87/ocropdf?part=2
#
# I only modified it to fit my own requirements.
#
#Usage:
#pdf2txt input.pdf > hocr-output.html
#
#The following environment variables are recognised:
#- PDFIMAGES: Path to 'pdfimages' if it's not in your path
#- CONVERT: Path to 'convert' if it's not in your path
#- OCROSCRIPT: Path to 'ocroscript' if it's not in your path or this script is not
#  placed in the ocropus source tree (in the 'ocrocmd' directory)
#- tesslanguage: The language tesseract should use.
#
#
#Known problems:
# - Doesn't work with file names containing spaces.
# - Only works with a single PDF file.
#
#Possible improvements
# - reimplement it as Lua script.
# - Use this approach (imagemagick) to be able to recognise TIFF and other file formats.
#
# By Alvin from orangunix.blogspot.com
# Locate the external helpers 'pdfimages' and 'convert'.
# A value supplied through the PDFIMAGES / CONVERT environment variables
# takes precedence; otherwise PATH is searched.
if [ -z "$PDFIMAGES" ]; then
  PDFIMAGES=$(command -v pdfimages)
fi
if [ -z "$CONVERT" ]; then
  CONVERT=$(command -v convert)
fi

# Report every missing tool before giving up. Diagnostics go to stderr
# because stdout is meant to be redirected into the output file.
MISSING_TOOL=
if [ -z "$PDFIMAGES" ]; then
  echo "'pdfimages' not found in PATH (it's part of the xpdf package)" >&2
  MISSING_TOOL=yes
fi
if [ -z "$CONVERT" ]; then
  echo "'convert' not found in PATH (it's part of the imagemagick package)" >&2
  MISSING_TOOL=yes
fi
# Continuing without the tools would expand an empty command name further
# down and try to execute the input file itself, so stop here.
if [ -n "$MISSING_TOOL" ]; then
  exit 1
fi
# Locate 'ocroscript'.
# Order of precedence: the OCROSCRIPT environment variable, then PATH, then
# a copy in ../ocroscript relative to this script's own directory.
if [ -z "$OCROSCRIPT" ]; then
  OCROSCRIPT=$(command -v ocroscript)
  if [ -z "$OCROSCRIPT" ]; then
    # "$0" is quoted so an install path containing spaces is handled.
    DIR=$(dirname "$0")/../ocroscript
    OCROSCRIPT="$DIR/ocroscript"
    if [ -z "$OCROSCRIPTS" ]; then
      OCROSCRIPTS=$DIR/scripts
    fi
    # Nothing in this script reads OCROSCRIPTS, so the assignment only has an
    # effect if child processes can see it.
    # NOTE(review): presumably ocroscript reads OCROSCRIPTS to find its
    # scripts directory - confirm against the ocroscript documentation.
    export OCROSCRIPTS
  fi
fi
# Require the PDF to process as the first argument.
# Messages go to stderr because stdout is meant to be redirected into the
# output file; the script name is taken from $0 so the usage line always
# matches the way the script was invoked.
if [ -z "$1" ]; then
  echo "Usage: $0 input.pdf > hocr-output.html" >&2
  exit 1
fi
# Fail early with a clear message instead of letting a later tool complain.
if [ ! -r "$1" ]; then
  echo "Cannot read input file: $1" >&2
  exit 1
fi
# Create a private scratch directory and extract the images embedded in the
# PDF into it, using "$TMP_DIR/pdf2txt" as the image root passed to pdfimages.
#
# mktemp -d creates the directory in a single step; the previous
# tempfile + rm + mkdir sequence left a window in which another process could
# claim the name, and 'tempfile' is not available on every system.
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf2txt.XXXXXX")
if [ -z "$TMP_DIR" ] || [ ! -d "$TMP_DIR" ]; then
  echo "could not create a temporary directory" >&2
  exit 2
fi

# Progress trace on stderr so it does not end up in the redirected output.
echo "$TMP_DIR" >&2
echo "$PDFIMAGES $1 $TMP_DIR/pdf2txt" >&2

# Each argument is quoted separately so paths containing spaces survive.
if ! "$PDFIMAGES" "$1" "$TMP_DIR/pdf2txt"; then
  echo "'pdfimages' failed" >&2
  exit 2
fi
# Convert every extracted image to JPEG, writing <name>.jpg next to the
# source file, and collect the resulting paths (space separated) in FILES.
#
# The glob is expanded once, before the first iteration, so the .jpg files
# created inside the loop are not picked up again. ${TMP_DIR:?} aborts the
# script instead of globbing "/*" should TMP_DIR ever be empty.
for SRC in "${TMP_DIR:?TMP_DIR is not set}"/*
do
  # An unmatched glob stays literal; skip it when the directory is empty.
  [ -e "$SRC" ] || continue
  # FILE keeps holding the bare file name, as before.
  FILE=${SRC##*/}
  #echo $FILE
  if ! "$CONVERT" "$SRC" "$SRC.jpg"; then
    echo "'convert' failed" >&2
    exit 2
  fi
  FILES="$FILES $SRC.jpg"
  #rm -f $TMP_DIR/$FILE
done
# Run the recognizer on every converted image whose name ends in .pbm.jpg.
# Full paths are passed because the current working directory is not
# $TMP_DIR; bare file names would not resolve.
FOUND_PAGE=
for PAGE in "${TMP_DIR:?TMP_DIR is not set}"/*.pbm.jpg
do
  # Only the first expansion is inspected: it is either a real file or the
  # literal, unmatched pattern.
  if [ -e "$PAGE" ]; then
    FOUND_PAGE=yes
  fi
  break
done
if [ -z "$FOUND_PAGE" ]; then
  echo "no .pbm.jpg images found in $TMP_DIR" >&2
  exit 2
fi
"$OCROSCRIPT" recognize "$TMP_DIR"/*.pbm.jpg
#if test -z "$tesslanguage" ; then
# OCROSCIPT_CMD="$OCROSCRIPT rec-tess $FILES"
#else
# OCROSCIPT_CMD="$OCROSCRIPT rec-tess --tesslanguage=$tesslanguage $FILES"
#fi
#
#$OCROSCIPT_CMD
#rm -r $TMP_DIR