Guest User

Untitled

a guest
Jul 12th, 2011
544
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2.  
  3. # this is a script to transform a PDF containing a scanned book
  4. #  into a beautiful searchable PDF :-)
  5. # depends on ghostscript, pdftk, pdfjoin, cuneiform and hocr2pdf (ExactImage)
  6. # $ sudo apt-get install ghostscript pdftk pdfjoin exactimage
  7. # get Cuneiform from their homepage
  8.  
  9. echo "usage: process document.pdf orientation split left top right bottom middle lang author title "
  10. # where orientation is one of 0,1,2,3, meaning the amount of rotation by 90°
  11. # and split is either 0 (already single-paged) or 1 (2 book-pages per pdf-page)
  12. # and (left top right bottom) are the ranges in pixels to trim
  13. # and lang is a language as in "cuneiform -l".
  14. # and author,title are used for the PDF metadata
  15. # all values relative to a resolution of 72dpi
  16. # middle is the range from center (after cropping left/right) to the contents
  17. #
  18. # usage examples:
  19. # ./process.sh SomeFile.pdf 0 0 1 2 3 4 0 ger SomeAuthor SomeTitle
  20. # will process a PDF with one page per pdf-page, trimming left=1px top=2px right=3px bottom=4px
  21.  
  22. input=$1
  23. rotate=$2
  24. dpi=`identify -format '%y' $input`
  25. dpi=${dpi:0:3}
  26.  
  27. left=$[$4*254/10/$dpi] #pixels to mm
  28. top=$[$5*254/10/$dpi]
  29. right=$[$6*254/10/$dpi]
  30. bottom=$[$7*254/10/$dpi]
  31. tmpdir="tmp"
  32. if ! [ -d $tmpdir ]; then
  33.     mkdir $tmpdir
  34. fi
  35. pdfjoin -q --rotateoversize 'false' --trim $left'mm '$bottom'mm '$right'mm '$top'mm' --outfile ${input%.pdf}a.pdf $input
  36. input=${input%.pdf}a.pdf
  37. pdftk "$input" burst dont_ask output "$tmpdir/page-%04d.pdf"
  38.  
  39. width2=`identify -format '%w' "$tmpdir/page-0001.pdf"`
  40. width2=$[${width2:0:3}*254/10/$dpi] #pixels in mm
  41. middle=$[$8*254/10/300]
  42. splitwidth=$[$width2/2+$middle]
  43. inputa=""
  44. for page in "$tmpdir"/page-*.pdf
  45. do
  46.     echo "processing $page ..."
  47.     base="${page%.pdf}"
  48.     if [ $3 = "1" ]; then   #two-side
  49.  
  50.         pdfjoin -q --rotateoversize 'false' --trim '0mm 0mm '$splitwidth'mm 0mm' --outfile $base.1.pdf $page
  51.         gs -dQUIET -SDEVICE=tiffg4 -r300 -sOutputFile="$base".1.tiff -dNOPAUSE -dBATCH -- "$base.1.pdf"
  52.         cuneiform -l $9 -f hocr -o "$base.1.html" "$base.1.tiff" &>/dev/null
  53.         if [ -e "$base.1.html" ]; then #readable
  54.             hocr2pdf -i "$base.1.tiff" -s -o "$base.1.pdf" < "$base.1.html" &>/dev/null
  55.             rm "$base.1.html"
  56.         fi
  57.         rm "$base".1.tiff
  58.         inputa=$inputa" "$base".1.pdf"
  59.  
  60.         pdfjoin -q --rotateoversize 'false' --trim $splitwidth'mm 0mm 0mm 0mm' --outfile $base.2.pdf $page
  61.         gs -dQUIET -SDEVICE=tiffg4 -r300 -sOutputFile="$base".2.tiff -dNOPAUSE -dBATCH -- "$base.2.pdf"
  62.         cuneiform -l $9 -f hocr -o "$base.2.html" "$base.2.tiff" &>/dev/null
  63.         if [ -e "$base.2.html" ]; then #readable
  64.             hocr2pdf -i "$base.2.tiff" -s -o "$base.2.pdf" < "$base.2.html" &>/dev/null
  65.             rm "$base.2.html"
  66.         fi
  67.         rm "$base".2.tiff
  68.         inputa=$inputa" "$base".2.pdf"
  69.     rm $page
  70.     else    #one-side
  71.         gs -dQUIET -SDEVICE=tiffg4 -r300 -sOutputFile="$base".tiff -dNOPAUSE -dBATCH -- "$base.pdf"
  72.         cuneiform -l $9 -f hocr -o "$base.html" "$base.tiff" &>/dev/null
  73.         if [ -e "$base.html" ]; then #readable
  74.             hocr2pdf -i "$base.tiff" -s -o "$base.0.pdf" < "$base.html" &>/dev/null
  75.             rm "$base.html"
  76.             inputa=$inputa" "$base".0.pdf"
  77.         else
  78.             inputa=$inputa" "$page
  79.         fi
  80.         rm "$base.tiff"
  81.     fi
  82. done
  83.  
  84. echo "[ /Title (${11})
  85.  /Author (${10})
  86.  /DOCINFO pdfmark" > $tmpdir/pdfmarks
  87.  
  88. rm -f doc_data.txt
  89. gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf $inputa $tmpdir/pdfmarks
  90. rm $input
  91. rm -rf "$tmpdir"
  92. echo "Resulting pdf is saved as output.pdf"
RAW Paste Data