Advertisement
Guest User

image2text

a guest
May 7th, 2012
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 2.33 KB | None | 0 0
  1. #! /bin/sh
  2. # Simple wrapper to recognize PDF files
  3. #
  4. # This script is based on ocropdf by Christian Mahnke
  5. # original script can be downloaded from http://groups.google.com/group/ocropus/attach/e3cd3c9c36dfce87/ocropdf?part=2
  6. #
  7. # I just modify it, to achieve my requirement
  8. #
  9. #Usage:
  10. #pdf2txt input.pdf > hocr-output.html
  11. #
  12. #The following environment variables are recognised:
  13. #- PDFIMAGES: Path to 'pdfimages' if it's not in your path
  14. #- CONVERT: Path to 'convert' if it's not in your path
  15. #- OCROSCRIPT: Path to 'ocroscript' if it's not in your path or this script is not placed in the ocropus source tree (in the 'ocrocmd' directory)
  16. #- tesslanguage: The language tesseract should use.
  17. #
  18. #
  19. #Known problems:
  20. # - Doesn't work with file names containing spaces.
  21. # - Only works with a single PDF file.
  22. #
  23. #Possible improvements
  24. # - reimplement it as Lua script.
  25. # - Use this approach (imagemagick) to be able to recognise TIFF and other file formats.
  26. #
  27. # By Alvin from orangunix.blogspot.com
  # ---------------------------------------------------------------------------
  # Main script body.
  #
  # Extracts the images embedded in the PDF given as $1 with 'pdfimages',
  # converts every extracted image to JPEG with 'convert', and then runs
  # 'ocroscript recognize' on the converted bitmap (.pbm) images.
  #
  # Only the recogniser writes to stdout; every diagnostic goes to stderr so
  # that redirecting stdout to a file yields a clean result.
  #
  # Environment: PDFIMAGES, CONVERT, OCROSCRIPT, OCROSCRIPTS (all optional).
  #
  # Exit status:
  #   0  success (more precisely: the exit status of 'ocroscript')
  #   1  usage error (no input file given)
  #   2  'convert' failed
  #   3  a required tool could not be found
  #   4  extraction failed (no temp dir, 'pdfimages' error, or no .pbm images)
  # ---------------------------------------------------------------------------

  # Print a message on stderr and terminate with the given exit status.
  #   $1    exit status
  #   $2..  message words
  die() {
    code=$1
    shift
    printf '%s\n' "$*" >&2
    exit "$code"
  }

  # Validate the arguments first so that a bare invocation always shows the
  # usage text, whether or not the external tools are installed.
  if test -z "$1" ; then
    die 1 "Usage: ./pdf2txt input.pdf > hocr-output.html"
  fi
  INPUT=$1

  # Locate the external tools unless the caller supplied explicit paths.
  if test -z "$PDFIMAGES" ; then
    PDFIMAGES=$(command -v pdfimages)
  fi
  if test -z "$CONVERT" ; then
    CONVERT=$(command -v convert)
  fi

  # Stop right away when a tool is missing: continuing with an empty command
  # name would make the shell try to execute the input file itself.
  if test -z "$PDFIMAGES" ; then
    die 3 "'pdfimages' not found in PATH (it's part of the xpdf package)"
  fi
  if test -z "$CONVERT" ; then
    die 3 "'convert' not found in PATH (it's part of the imagemagick package)"
  fi

  # Fall back to the copy of ocroscript that sits next to this script in the
  # source tree when none is installed in PATH.
  if test -z "$OCROSCRIPT" ; then
    OCROSCRIPT=$(command -v ocroscript)
    if test -z "$OCROSCRIPT" ; then
      DIR=$(dirname "$0")/../ocroscript
      OCROSCRIPT="$DIR/ocroscript"
      if test -z "$OCROSCRIPTS" ; then
        OCROSCRIPTS=$DIR/scripts
        # Exported so that child processes can see it. NOTE(review): this
        # assumes ocroscript is what reads OCROSCRIPTS - confirm.
        export OCROSCRIPTS
      fi
    fi
  fi

  # Private scratch directory; mktemp creates it atomically, so there is no
  # window between picking the name and creating the directory.
  TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf2txt.XXXXXX") \
    || die 4 "could not create a temporary directory"
  printf '%s\n' "$TMP_DIR" >&2

  # Extract the embedded images as $TMP_DIR/pdf2txt-NNN.<ext>.
  printf '%s\n' "$PDFIMAGES $INPUT $TMP_DIR/pdf2txt" >&2
  "$PDFIMAGES" "$INPUT" "$TMP_DIR/pdf2txt" || die 4 "'pdfimages' failed"

  # Convert every extracted image to JPEG. The positional parameters are
  # reused as a space-safe list of the files handed to the recogniser.
  # The glob is expanded once, before the loop starts, so the .jpg files
  # created inside the loop are not picked up again.
  set --
  for FILE in "$TMP_DIR"/* ; do
    # An unmatched glob stays literal; skip it when nothing was extracted.
    test -e "$FILE" || continue
    "$CONVERT" "$FILE" "$FILE.jpg" || die 2 "'convert' failed"
    # Only converted bitmaps (*.pbm.jpg) are recognised, as before.
    case $FILE in
      *.pbm) set -- "$@" "$FILE.jpg" ;;
    esac
  done

  test "$#" -gt 0 || die 4 "no bitmap (.pbm) images were extracted from $INPUT"

  # Full paths are passed, so this works from any working directory.
  "$OCROSCRIPT" recognize "$@"

  # Disabled alternative kept from the original script: run
  # "$OCROSCRIPT rec-tess" on all converted images, adding
  # --tesslanguage=$tesslanguage when that variable is set.
  #
  # NOTE: $TMP_DIR is deliberately left in place (the original has its final
  # 'rm -r' commented out as well); its path is printed on stderr above so it
  # can be inspected or removed by hand.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement