#!/bin/bash

# this is a script to transform a PDF containing a scanned book
#  into a beautiful searchable PDF :-)
# depends on ghostscript, pdftk, pdfjoin, ImageMagick (identify), cuneiform and hocr2pdf (ExactImage)
# $ sudo apt-get install ghostscript pdftk pdfjoin imagemagick exactimage
# get Cuneiform from their homepage

echo "usage: process document.pdf orientation split left top right bottom middle lang author title "
# where orientation is one of 0,1,2,3, meaning the amount of rotation by 90°
# and split is either 0 (already single-paged) or 1 (2 book-pages per pdf-page)
# and (left top right bottom) are the ranges in pixels to trim
# and lang is a language as in "cuneiform -l".
# and author,title are used for the PDF metadata
# all values relative to a resolution of 72dpi
# middle is the range from center (after cropping left/right) to the contents
#
# usage examples:
# ./process.sh SomeFile.pdf 0 0 1 2 3 4 0 ger SomeAuthor SomeTitle
# will process a PDF with one page per pdf-page, trimming left=1px top=2px right=3px bottom=4px

input=$1
rotate=$2
dpi=`identify -format '%y' $input`
dpi=${dpi:0:3}

left=$[$4*254/10/$dpi] #pixels to mm
top=$[$5*254/10/$dpi]
right=$[$6*254/10/$dpi]
bottom=$[$7*254/10/$dpi]
tmpdir="tmp"
if ! [ -d $tmpdir ]; then
    mkdir $tmpdir
fi
pdfjoin -q --rotateoversize 'false' --trim $left'mm '$bottom'mm '$right'mm '$top'mm' --outfile ${input%.pdf}a.pdf $input
input=${input%.pdf}a.pdf
pdftk "$input" burst dont_ask output "$tmpdir/page-%04d.pdf"

width2=`identify -format '%w' "$tmpdir/page-0001.pdf"`
width2=$[${width2:0:3}*254/10/$dpi] #pixels in mm
middle=$[$8*254/10/300]
splitwidth=$[$width2/2+$middle]
inputa=""
for page in "$tmpdir"/page-*.pdf
do
    echo "processing $page ..."
    base="${page%.pdf}"
    if [ $3 = "1" ]; then   #two-side

        pdfjoin -q --rotateoversize 'false' --trim '0mm 0mm '$splitwidth'mm 0mm' --outfile $base.1.pdf $page
        gs -dQUIET -SDEVICE=tiffg4 -r300 -sOutputFile="$base".1.tiff -dNOPAUSE -dBATCH -- "$base.1.pdf"
        cuneiform -l $9 -f hocr -o "$base.1.html" "$base.1.tiff" &>/dev/null
        if [ -e "$base.1.html" ]; then #readable
            hocr2pdf -i "$base.1.tiff" -s -o "$base.1.pdf" < "$base.1.html" &>/dev/null
            rm "$base.1.html"
        fi
        rm "$base".1.tiff
        inputa=$inputa" "$base".1.pdf"

        pdfjoin -q --rotateoversize 'false' --trim $splitwidth'mm 0mm 0mm 0mm' --outfile $base.2.pdf $page
        gs -dQUIET -SDEVICE=tiffg4 -r300 -sOutputFile="$base".2.tiff -dNOPAUSE -dBATCH -- "$base.2.pdf"
        cuneiform -l $9 -f hocr -o "$base.2.html" "$base.2.tiff" &>/dev/null
        if [ -e "$base.2.html" ]; then #readable
            hocr2pdf -i "$base.2.tiff" -s -o "$base.2.pdf" < "$base.2.html" &>/dev/null
            rm "$base.2.html"
        fi
        rm "$base".2.tiff
        inputa=$inputa" "$base".2.pdf"
    rm $page
    else    #one-side
        gs -dQUIET -SDEVICE=tiffg4 -r300 -sOutputFile="$base".tiff -dNOPAUSE -dBATCH -- "$base.pdf"
        cuneiform -l $9 -f hocr -o "$base.html" "$base.tiff" &>/dev/null
        if [ -e "$base.html" ]; then #readable
            hocr2pdf -i "$base.tiff" -s -o "$base.0.pdf" < "$base.html" &>/dev/null
            rm "$base.html"
            inputa=$inputa" "$base".0.pdf"
        else
            inputa=$inputa" "$page
        fi
        rm "$base.tiff"
    fi
done

echo "[ /Title (${11})
  /Author (${10})
  /DOCINFO pdfmark" > $tmpdir/pdfmarks

rm -f doc_data.txt
gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf $inputa $tmpdir/pdfmarks
rm $input
rm -rf "$tmpdir"
echo "Resulting pdf is saved as output.pdf"