Guest User

Untitled

a guest
Jun 22nd, 2011
342
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2.  
  3. # this is a script to transform a PDF containing a scanned book
  4. #  into a beautiful searchable PDF :-)
  5. # depends on convert (ImageMagick), ghostscript, pdftk, pdfjam, cuneiform and hocr2pdf (ExactImage)
  6. # $ sudo apt-get install imagemagick ghostscript pdftk pdfjam exactimage
  7. # get Cuneiform from their homepage
  8.  
  9. #use shared library
  10. #export LD_LIBRARY_PATH=/usr/local/lib64
  11.  
  12. echo "usage: process document.pdf orientation split left top right bottom middle lang author title "
  13. # where orientation is one of 0,1,2,3, meaning the amount of rotation by 90°
  14. # and split is either 0 (already single-paged) or 1 (2 book-pages per pdf-page)
  15. # and (left top right bottom) are the ranges in pixels to trim
  16. # and lang is a language as in "cuneiform -l".
  17. # and author,title are used for the PDF metadata
  18. # all values relative to a resolution of 300dpi
  19. #
  20. # usage examples:
  21. # ./process.sh SomeFile.pdf 0 0 1 2 3 4 0 ger SomeAuthor SomeTitle
  22. # will process a PDF with one page per pdf-page, trimming left=1px top=2px right=3px bottom=4px
  23.  
  24. input=$1
  25. rotate=$2
  26. dpi=`identify -format '%y' $input`
  27. dpi=${dpi:0:3}
  28.  
  29. left=$[$4*254/10/$dpi] #pixels to mm
  30. top=$[$5*254/10/$dpi]
  31. right=$[$6*254/10/$dpi]
  32. bottom=$[$7*254/10/$dpi]
  33. tmpdir="tmp"
  34. if ! [ -d $tmpdir ]; then
  35.     mkdir $tmpdir
  36. fi
  37. pdfjoin -q --rotateoversize 'false' --trim $left'mm '$bottom'mm '$right'mm '$top'mm' --outfile ${input%.pdf}a.pdf $input
  38. input=${input%.pdf}a.pdf
  39. pdftk "$input" burst dont_ask output "$tmpdir/page-%04d.pdf"
  40.  
  41. width2=`identify -format '%w' "$tmpdir/page-0001.pdf"`
  42. width2=$[${width2:0:3}*254/10/$dpi] #pixels in mm
  43.  
  44. for page in "$tmpdir"/page-*.pdf
  45. do
  46.     echo "processing $page ..."
  47.     base="${page%.pdf}"
  48.     if [ $3 = "1" ]; then   #two-side
  49.  
  50.         middle=$[$8*254/10/300]
  51.         splitwidth=$[$width2/2+$middle]
  52.         pdfjoin -q --rotateoversize 'false' --trim '0mm 0mm '$splitwidth'mm 0mm' --outfile $base.1.pdf $page
  53.         gs -dQUIET -SDEVICE=tiffg4 -r300x300 -sOutputFile="$base".1.tiff -dNOPAUSE -dBATCH -- "$base.1.pdf"
  54.         cuneiform -l $9 -f hocr -o "$base.1.html" "$base.1.tiff" &>/dev/null
  55.         if [ -e "$base.1.html" ]; then #readable
  56.             hocr2pdf -i "$base.1.tiff" -s -o "$base.1.pdf" < "$base.1.html" &>/dev/null
  57.         fi
  58.  
  59.         pdfjoin -q --rotateoversize 'false' --trim $splitwidth'mm 0mm 0mm 0mm' --outfile $base.2.pdf $page
  60.         gs -dQUIET -SDEVICE=tiffg4 -r300x300 -sOutputFile="$base".2.tiff -dNOPAUSE -dBATCH -- "$base.2.pdf"
  61.         cuneiform -l $9 -f hocr -o "$base.2.html" "$base.2.tiff" &>/dev/null
  62.         if [ -e "$base.2.html" ]; then #readable
  63.             hocr2pdf -i "$base.2.tiff" -s -o "$base.2.pdf" < "$base.2.html" &>/dev/null
  64.         fi
  65.     rm $page
  66.     else    #one-side
  67.         gs -dQUIET -SDEVICE=tiffg4 -r300x300 -sOutputFile="$base".tiff -dNOPAUSE -dBATCH -- "$base.pdf"
  68.         cuneiform -l $9 -f hocr -o "$base.html" "$base.tiff" &>/dev/null
  69.         if [ -e "$base.html" ]; then #readable
  70.             hocr2pdf -i "$base.tiff" -s -o "$base.0.pdf" < "$base.html" &>/dev/null
  71.         fi
  72.     fi
  73. done
  74.  
  75. echo "[ /Title (${11})
  76.  /Author (${10})
  77.  /DOCINFO pdfmark" > $tmpdir/pdfmarks
  78.  
  79. pdfjoin -q --rotateoversize 'false' --outfile $input $tmpdir/page-*.pdf
  80. rm -f doc_data.txt
  81. gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf $input $tmpdir/pdfmarks
  82. rm $input
  83. rm -rf "$tmpdir"
  84. echo "Resulting pdf is saved as output.pdf"
RAW Paste Data