Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# Transform a PDF containing a scanned book into a searchable PDF.
#
# Depends on identify (ImageMagick), ghostscript, pdftk, pdfjoin,
# cuneiform and hocr2pdf (ExactImage):
#   $ sudo apt-get install ghostscript pdftk pdfjoin exactimage
#   get Cuneiform from their homepage
#
# usage: process document.pdf orientation split left top right bottom middle lang author title
#   orientation  one of 0,1,2,3, meaning the amount of rotation by 90 degrees.
#                Accepted for interface compatibility only: no rotation is
#                applied by this script.
#   split        0 (already single-paged) or 1 (2 book-pages per pdf-page)
#   left top right bottom
#                the ranges in pixels to trim
#   middle       the range from center (after cropping left/right) to the
#                contents
#   lang         a language as in "cuneiform -l"
#   author title used for the PDF metadata (optional)
# All values are relative to a resolution of 72dpi.
#
# usage example:
#   ./process.sh SomeFile.pdf 0 0 1 2 3 4 0 ger SomeAuthor SomeTitle
#   processes a PDF with one page per pdf-page, trimming
#   left=1px top=2px right=3px bottom=4px
#
# The result is written to output.pdf in the current directory.

# No "set -e": the OCR steps are best-effort (a page that cannot be OCRed is
# kept as an image-only page), so failures are checked explicitly instead.
set -uo pipefail

# Resolution (dpi) used when rasterizing pages for OCR.
readonly OCR_DPI=300
# NOTE(review): "middle" is converted with a fixed 300 dpi while the trim
# margins use the resolution detected from the document. Kept as found;
# confirm which resolution the value is really measured at.
readonly MIDDLE_REF_DPI=300

# Scratch directory; set by main, removed by cleanup on any exit path.
tmpdir=''

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory, if one was created.
cleanup() {
  if [[ -n "$tmpdir" ]]; then
    rm -rf -- "$tmpdir"
  fi
}

# Print the usage line to stderr.
usage() {
  echo "usage: process document.pdf orientation split left top right bottom middle lang author title " >&2
}

# Succeed if $1 is a decimal integer without leading zeros (a leading zero
# would make bash arithmetic read the value as octal).
is_int() {
  [[ "$1" =~ ^-?(0|[1-9][0-9]*)$ ]]
}

# Print the leading run of digits of $1 (after optional leading whitespace),
# e.g. "72 Undefined" -> "72", "28.35" -> "28". Prints an empty line if $1
# does not start with a digit.
leading_int() {
  local s=$1
  s=${s#"${s%%[![:space:]]*}"}
  printf '%s\n' "${s%%[!0-9]*}"
}

# Convert $1 pixels at $2 dpi to whole millimetres, using the same
# left-to-right integer arithmetic as px*254/10/dpi.
px_to_mm() {
  local px=$1 dpi=$2
  printf '%d\n' "$(( px * 254 / 10 / dpi ))"
}

# Escape backslashes and parentheses so $1 can be embedded in a PostScript
# "( ... )" string literal.
ps_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\(/\\(}
  s=${s//\)/\\)}
  printf '%s' "$s"
}

# Rasterize the PDF $1, OCR it with language $3 and, on success, write the
# searchable PDF to $2 ($2 may be the same file as $1).
# Returns 0 if $2 was written, 1 otherwise (in which case $2 is not touched).
ocr_pdf() {
  local src=$1 dest=$2 lang=$3
  local stem=${src%.pdf}
  local tiff=$stem.tiff hocr=$stem.html ocr_out=$stem.ocr.pdf
  local status=1

  gs -dQUIET -sDEVICE=tiffg4 -r"$OCR_DPI" -sOutputFile="$tiff" \
    -dNOPAUSE -dBATCH -- "$src"
  # OCR tool chatter is discarded on purpose; success is judged by the files
  # that were produced.
  cuneiform -l "$lang" -f hocr -o "$hocr" "$tiff" &>/dev/null
  if [[ -e "$hocr" ]]; then #readable
    # Write to a side file first so a failed conversion cannot clobber $dest.
    if hocr2pdf -i "$tiff" -s -o "$ocr_out" <"$hocr" &>/dev/null &&
      [[ -s "$ocr_out" ]]; then
      mv -f -- "$ocr_out" "$dest"
      status=0
    else
      rm -f -- "$ocr_out"
    fi
    rm -f -- "$hocr"
  fi
  rm -f -- "$tiff"
  return "$status"
}

# Entry point; takes the script's positional arguments.
# Returns 2 on a usage error; aborts with status 1 on processing errors.
main() {
  if (( $# < 9 || $# > 11 )); then
    usage
    return 2
  fi

  # $2 (orientation) is accepted but not used: no rotation is applied.
  local input=$1 split=$3
  local left_px=$4 top_px=$5 right_px=$6 bottom_px=$7 middle_px=$8
  local lang=$9 author=${10-} title=${11-}

  [[ -f "$input" ]] || die "error: input file not found: $input"
  local name
  for name in left_px top_px right_px bottom_px middle_px; do
    is_int "${!name}" || die "error: $name must be an integer, got '${!name}'"
  done

  local tool
  for tool in identify pdfjoin pdftk gs; do
    command -v "$tool" >/dev/null || die "error: required tool not found: $tool"
  done
  for tool in cuneiform hocr2pdf; do
    command -v "$tool" >/dev/null ||
      echo "warning: $tool not found, pages will not be searchable" >&2
  done

  # Resolution of the first page only: asking for every page would
  # concatenate one value per page into a single string.
  local dpi_raw dpi
  dpi_raw=$(identify -format '%y' "${input}[0]") ||
    die "error: cannot read resolution of $input"
  dpi=$(leading_int "$dpi_raw")
  [[ -n "$dpi" ]] && (( dpi > 0 )) ||
    die "error: unusable resolution '$dpi_raw' for $input"

  local left top right bottom #pixels to mm
  left=$(px_to_mm "$left_px" "$dpi")
  top=$(px_to_mm "$top_px" "$dpi")
  right=$(px_to_mm "$right_px" "$dpi")
  bottom=$(px_to_mm "$bottom_px" "$dpi")

  # Private scratch directory, so nothing that already exists in the working
  # directory is reused or deleted.
  tmpdir=$(mktemp -d) || die "error: mktemp failed"
  trap cleanup EXIT

  # Trim the margins of the whole document, then split it into single pages.
  local trimmed=$tmpdir/trimmed.pdf
  pdfjoin -q --rotateoversize 'false' \
    --trim "${left}mm ${bottom}mm ${right}mm ${top}mm" \
    --outfile "$trimmed" "$input" ||
    die "error: pdfjoin failed to trim $input"
  pdftk "$trimmed" burst dont_ask output "$tmpdir/page-%04d.pdf" ||
    die "error: pdftk failed to burst $trimmed"
  [[ -e "$tmpdir/page-0001.pdf" ]] || die "error: no pages were produced"

  # Width of the trimmed page, needed to find the split position.
  local width_raw width_px width_mm middle splitwidth
  width_raw=$(identify -format '%w' "$tmpdir/page-0001.pdf") ||
    die "error: cannot read page width"
  width_px=$(leading_int "$width_raw")
  [[ -n "$width_px" ]] || die "error: unusable page width '$width_raw'"
  width_mm=$(px_to_mm "$width_px" "$dpi") #pixels in mm
  middle=$(px_to_mm "$middle_px" "$MIDDLE_REF_DPI")
  splitwidth=$(( width_mm / 2 + middle ))

  local -a parts=()
  local page base half
  for page in "$tmpdir"/page-*.pdf; do
    [[ -e "$page" ]] || continue
    echo "processing $page ..."
    base=${page%.pdf}
    if [[ "$split" == "1" ]]; then #two-side
      # Half 1 trims splitwidth from the right, half 2 from the left.
      pdfjoin -q --rotateoversize 'false' \
        --trim "0mm 0mm ${splitwidth}mm 0mm" \
        --outfile "$base.1.pdf" "$page" ||
        die "error: pdfjoin failed to split $page"
      pdfjoin -q --rotateoversize 'false' \
        --trim "${splitwidth}mm 0mm 0mm 0mm" \
        --outfile "$base.2.pdf" "$page" ||
        die "error: pdfjoin failed to split $page"
      for half in "$base.1.pdf" "$base.2.pdf"; do
        # On OCR failure the trimmed, image-only half is used as it is.
        ocr_pdf "$half" "$half" "$lang" ||
          echo "warning: no OCR result for $half" >&2
        parts+=("$half")
      done
      rm -f -- "$page"
    else #one-side
      if ocr_pdf "$page" "$base.0.pdf" "$lang"; then
        parts+=("$base.0.pdf")
      else
        echo "warning: no OCR result for $page" >&2
        parts+=("$page")
      fi
    fi
  done
  (( ${#parts[@]} > 0 )) || die "error: no pages to assemble"

  # Document metadata, handed to ghostscript as a pdfmark file.
  printf '[ /Title (%s)\n/Author (%s)\n/DOCINFO pdfmark\n' \
    "$(ps_escape "$title")" "$(ps_escape "$author")" >"$tmpdir/pdfmarks"

  # pdftk's burst leaves a doc_data.txt report in the working directory.
  rm -f doc_data.txt

  gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dCompatibilityLevel=1.4 \
    -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf \
    "${parts[@]}" "$tmpdir/pdfmarks" ||
    die "error: ghostscript failed to assemble output.pdf"

  echo "Resulting pdf is saved as output.pdf"
}

main "$@"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement