Guest User

Untitled

a guest
Jun 20th, 2011
284
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #!/bin/bash

  # Turn a PDF containing a scanned book into a searchable PDF.
  #
  # Depends on convert/identify (ImageMagick), gs (Ghostscript), pdftk,
  # pdfjoin (pdfjam), cuneiform and hocr2pdf (ExactImage):
  #   $ sudo apt-get install imagemagick ghostscript pdftk pdfjam exactimage
  # Cuneiform has to be fetched from its homepage.
  #
  # usage: process document.pdf orientation split left top right bottom lang author title [middle]
  #
  #   orientation   one of 0,1,2,3: the number of 90° rotations.
  #                 NOTE(review): accepted for compatibility, but nothing in
  #                 this script applies it.
  #   split         1 = two book pages per PDF page; any other value = the
  #                 PDF already has one book page per PDF page
  #   left top right bottom
  #                 margins to trim from each edge, in pixels at 300 dpi
  #   lang          language as in "cuneiform -l"
  #   author title  used for the PDF metadata
  #   middle        width in pixels (at 300 dpi) of the strip dropped between
  #                 the two halves when split is 1; defaults to 0
  #
  # Environment overrides:
  #   CUNEIFORM     OCR program to run        (default: ./cuneiform)
  #   HOCR2PDF      hOCR-to-PDF converter     (default: hocr2pdf1)
  #                 NOTE(review): the dependency list above names the tool
  #                 hocr2pdf; the default keeps the command name this script
  #                 has always invoked - confirm which one is installed.
  #
  # usage example:
  #   ./process.sh SomeFile.pdf 0 0 100 150 100 150 ger SomeAuthor SomeTitle 0
  # processes a PDF with one book page per PDF page, trimming 100 px from the
  # left and right edges and 150 px from the top and bottom edges.
  #
  # Results are written to the current directory: "<document.pdf>-ocr.pdf"
  # (the joined pages) and "output.pdf" (the Ghostscript-recompressed copy
  # carrying the metadata). Working files named pg_* are created in, and
  # removed from, the current directory.

  set -u

  readonly DPI=300
  readonly CUNEIFORM=${CUNEIFORM:-./cuneiform}
  readonly HOCR2PDF=${HOCR2PDF:-hocr2pdf1}

  # Use the shared libraries in /usr/local/lib64 without discarding a search
  # path the caller may already have set.
  export LD_LIBRARY_PATH="/usr/local/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

  # Print the command-line synopsis to stderr.
  usage() {
    printf '%s\n' "usage: process document.pdf orientation split left top right bottom lang author title [middle]" >&2
  }

  # Print an error message to stderr and abort.
  die() {
    printf 'process: %s\n' "$*" >&2
    exit 1
  }

  # Convert a pixel count at DPI to whole millimetres (integer arithmetic,
  # so the result is truncated).
  #   $1 - pixel count
  px_to_mm() {
    printf '%d' "$(( $1 * 254 / 10 / DPI ))"
  }

  # Escape backslashes and parentheses so that arbitrary text can be placed
  # inside a PostScript "( ... )" string.
  #   $1 - text to escape
  ps_escape() {
    local text=$1
    text=${text//\\/\\\\}
    text=${text//(/\\(}
    text=${text//)/\\)}
    printf '%s' "$text"
  }

  # Remove every working file this script creates; doc_data.txt is left
  # behind by "pdftk burst".
  cleanup() {
    rm -f -- pg_*.bmp pg_*.hocr pg_*.pdf doc_data.txt pdfmarks
    rm -rf -- pg_*_files
  }

  # Produce one output page: OCR the bitmap and build a searchable PDF from
  # the hOCR result; when that yields no page (no hOCR file, or the converter
  # wrote nothing), fall back to trimming the source PDF page instead.
  # Globals:   lang (read), CUNEIFORM (read), HOCR2PDF (read)
  # Arguments: $1 - bitmap to OCR (name ends in .bmp)
  #            $2 - PDF page to write
  #            $3 - source PDF page used for the fallback
  #            $4 - pdfjoin --trim value: "left bottom right top", with units
  make_page() {
    local bmp=$1 out=$2 src=$3 trim=$4
    local hocr=${bmp%.bmp}.hocr

    # Stale files from an aborted run must not be mistaken for fresh results.
    rm -f -- "$hocr" "$out"

    # OCR failure is expected for unreadable pages and is detected through
    # the missing hOCR file, hence the discarded output.
    "$CUNEIFORM" -l "$lang" -f hocr -o "$hocr" "$bmp" &>/dev/null
    if [[ -e "$hocr" ]]; then #readable
      "$HOCR2PDF" -i "$bmp" -s -o "$out" <"$hocr" &>/dev/null
    fi

    if [[ ! -s "$out" ]]; then #unreadable
      pdfjoin -q --rotateoversize 'false' --trim "$trim" --outfile "$out" "$src"
    fi
  }

  if (( $# < 10 )); then
    usage
    exit 2
  fi

  doc=$1
  split=$3
  trim_left=$4
  trim_top=$5
  trim_right=$6
  trim_bottom=$7
  lang=$8
  author=$9
  title=${10}
  gutter=${11:-0}

  # These values are fed to shell arithmetic, so accept plain integers only.
  for value in "$trim_left" "$trim_top" "$trim_right" "$trim_bottom" "$gutter"; do
    if [[ ! "$value" =~ ^-?[0-9]+$ ]]; then
      usage
      die "left, top, right, bottom and middle must be integers (got '$value')"
    fi
  done

  [[ -f "$doc" ]] || die "no such file: $doc"

  for tool in pdftk convert identify pdfjoin gs; do
    command -v "$tool" >/dev/null || die "required tool not found: $tool"
  done

  # From here on working files exist, so remove them on every exit path.
  trap cleanup EXIT

  pdftk "$doc" burst dont_ask output pg_%04da.pdf \
    || die "pdftk could not split $doc into pages"

  left_mm=$(px_to_mm "$trim_left")
  top_mm=$(px_to_mm "$trim_top")
  right_mm=$(px_to_mm "$trim_right")
  bottom_mm=$(px_to_mm "$trim_bottom")

  for page in pg_*a.pdf; do
    [[ -e "$page" ]] || continue   # glob matched nothing

    echo "pre-processing $page ..."
    bmp=$page.bmp
    stem=${page%a.pdf}

    convert -quiet -monochrome -normalize -density "$DPI" "$page" "$bmp" \
      || die "could not rasterise $page"
    full_w=$(identify -format '%w' "$bmp")
    full_h=$(identify -format '%h' "$bmp")

    # Cut the requested margins off the bitmap, in place.
    crop_w=$(( full_w - trim_left - trim_right ))
    crop_h=$(( full_h - trim_top - trim_bottom ))
    convert -quiet -crop "${crop_w}x${crop_h}+${trim_left}+${trim_top}" "$bmp" "$bmp"

    if [[ "$split" == "1" ]]; then #two-side
      # Work from the real size of the cropped bitmap, in pixels.
      crop_w=$(identify -format '%w' "$bmp")
      crop_h=$(identify -format '%h' "$bmp")

      half_w=$(( (crop_w - gutter) / 2 ))    # width of each book page
      right_x=$(( (crop_w + gutter) / 2 ))   # x offset of the right book page

      # Fallback trims are measured on the untrimmed PDF page: the left book
      # page loses everything from the split to the right edge, and the right
      # book page everything from the left edge to the split.
      left_page_right_mm=$(px_to_mm "$(( right_x + trim_right ))")
      right_page_left_mm=$(px_to_mm "$(( right_x + trim_left ))")

      convert -quiet -crop "${half_w}x${crop_h}+0+0" "$bmp" "$page.1.bmp"
      make_page "$page.1.bmp" "$stem.1.pdf" "$page" \
        "${left_mm}mm ${bottom_mm}mm ${left_page_right_mm}mm ${top_mm}mm"

      convert -quiet -crop "${half_w}x${crop_h}+${right_x}+0" "$bmp" "$page.2.bmp"
      make_page "$page.2.bmp" "$stem.2.pdf" "$page" \
        "${right_page_left_mm}mm ${bottom_mm}mm ${right_mm}mm ${top_mm}mm"
    else #one-side
      make_page "$bmp" "$stem.pdf" "$page" \
        "${left_mm}mm ${bottom_mm}mm ${right_mm}mm ${top_mm}mm"
    fi

    # The burst page and its bitmaps/hOCR are no longer needed; removing the
    # burst page also keeps it out of the final pg_*.pdf join.
    rm -f -- pg_*.bmp "$page" pg_*.hocr
  done

  pages=(pg_*.pdf)
  [[ -e "${pages[0]}" ]] || die "no pages were produced from $doc"

  printf '[ /Title (%s)\n /Author (%s)\n /DOCINFO pdfmark\n' \
    "$(ps_escape "$title")" "$(ps_escape "$author")" >pdfmarks

  pdfjoin -q --rotateoversize 'false' --outfile "$doc-ocr.pdf" "${pages[@]}" \
    || die "could not join the processed pages"

  gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf "$doc-ocr.pdf" pdfmarks
  exit $?
RAW Paste Data