Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
#
# process.sh - turn a PDF containing a scanned book into a searchable PDF.
#
# Depends on convert/identify (ImageMagick), ghostscript, pdftk, pdfjam
# (pdfjoin), cuneiform and hocr2pdf (ExactImage):
#   $ sudo apt-get install imagemagick ghostscript pdftk pdfjam exactimage
# Get Cuneiform from its homepage.
#
# usage: process document.pdf orientation split left top right bottom \
#                lang author title middle
#
#   document.pdf  input PDF; it is burst into one file per PDF page
#   orientation   one of 0,1,2,3 (number of 90-degree rotations).
#                 NOTE: accepted for compatibility but not used by this script.
#   split         1 = two book pages per PDF page (split down the middle);
#                 any other value = already single-paged
#   left top right bottom
#                 margins to trim, in pixels at 300 dpi (they are used
#                 directly as crop geometry on the 300 dpi raster and are
#                 converted to mm only for the pdfjoin fallback)
#   lang          OCR language, as accepted by "cuneiform -l"
#   author title  written to the PDF metadata (optional)
#   middle        width in pixels of the gutter between the two book pages
#                 when split is 1 (optional, defaults to 0)
#
# Intermediate files (pg_*.pdf, pg_*.bmp, pg_*.hocr, pdfmarks, doc_data.txt)
# are created in, and removed from, the current directory. The joined OCR
# result is written to "<document.pdf>-ocr.pdf" and the final, compressed
# file to "output.pdf".
#
# usage example:
#   ./process.sh SomeFile.pdf 0 1 50 50 50 50 ger SomeAuthor SomeTitle 20
#   processes a PDF with two book pages per PDF page, trimming 50 px from
#   every edge and dropping a 20 px gutter between the two pages.

# "set -e" is deliberately not used: a failing OCR run is an expected
# condition that is handled by falling back to the un-OCRed page.
set -u

# OCR tools. The defaults match the historical hard-coded commands; both can
# be overridden through the environment.
# NOTE(review): "hocr2pdf1" looks like a locally renamed hocr2pdf - confirm.
CUNEIFORM=${CUNEIFORM:-./cuneiform}
HOCR2PDF=${HOCR2PDF:-hocr2pdf1}

# Print the usage line to stderr.
usage() {
  printf '%s\n' "usage: process document.pdf orientation split left top right bottom lang author title middle" >&2
}

# Succeed if $1 is a non-empty string of decimal digits.
is_uint() {
  [[ $1 =~ ^[0-9]+$ ]]
}

# Convert a length in pixels at 300 dpi ($1) to whole millimetres
# (25.4 mm per inch, integer arithmetic, truncating).
px_to_mm() {
  printf '%s\n' "$(( $1 * 254 / 10 / 300 ))"
}

# Escape $1 for use inside a PostScript "( ... )" string literal, so that
# backslashes and parentheses in titles/authors do not corrupt the pdfmark.
pdfmark_escape() {
  local text=$1
  local bs='\' lp='(' rp=')'
  text=${text//"$bs"/"$bs$bs"}
  text=${text//"$lp"/"$bs$lp"}
  text=${text//"$rp"/"$bs$rp"}
  printf '%s' "$text"
}

# Print one dimension of image $2 using identify format $1 ('%w' or '%h').
# Prints 0 when identify yields nothing usable, so that later arithmetic
# stays well-formed and the page falls through to the pdfjoin fallback.
bmp_dim() {
  local value
  value=$(identify -format "$1" "$2")
  [[ $value =~ ^[0-9]+$ ]] || value=0
  printf '%s\n' "$value"
}

# OCR one bitmap and write a searchable PDF; if OCR produced no hOCR file,
# write the corresponding trimmed region of the source page instead.
#   $1 bitmap  $2 hOCR file  $3 output PDF  $4 source page PDF
#   $5 pdfjoin --trim value ("left bottom right top", in mm)  $6 language
ocr_or_trim() {
  local bmp=$1 hocr=$2 out_pdf=$3 src_pdf=$4 trim=$5 lang=$6

  # Success is judged by the existence of the hOCR file, not by the exit
  # status, so cuneiform's output and status are intentionally discarded.
  "$CUNEIFORM" -l "$lang" -f hocr -o "$hocr" "$bmp" &>/dev/null
  if [[ -e "$hocr" ]]; then # readable
    "$HOCR2PDF" -i "$bmp" -s -o "$out_pdf" <"$hocr" &>/dev/null
  else # unreadable
    pdfjoin -q --rotateoversize 'false' --trim "$trim" \
      --outfile "$out_pdf" "$src_pdf"
  fi
}

# Rasterise, crop, optionally split and OCR a single burst page.
#   $1 page PDF (pg_NNNNa.pdf)  $2 split flag
#   $3..$6 left/top/right/bottom margins in px  $7 gutter in px  $8 language
process_page() {
  local page=$1 split=$2
  local left_px=$3 top_px=$4 right_px=$5 bottom_px=$6 middle_px=$7
  local lang=$8
  local bmp="${page}.bmp"
  local stem=${page%a.pdf}
  local width height half right_x
  local left_mm top_mm right_mm bottom_mm left_page_trim_mm right_page_trim_mm

  printf 'pre-processing %s ...\n' "$page"

  left_mm=$(px_to_mm "$left_px")
  top_mm=$(px_to_mm "$top_px")
  right_mm=$(px_to_mm "$right_px")
  bottom_mm=$(px_to_mm "$bottom_px")

  convert -quiet -monochrome -normalize -density 300 "$page" "$bmp"
  height=$(bmp_dim '%h' "$bmp")
  width=$(bmp_dim '%w' "$bmp")
  convert -quiet \
    -crop "$((width - left_px - right_px))x$((height - top_px - bottom_px))+${left_px}+${top_px}" \
    "$bmp" "$bmp"

  if [[ "$split" == "1" ]]; then # two-side
    # Re-read the size: the bitmap has just been cropped.
    height=$(bmp_dim '%h' "$bmp")
    width=$(bmp_dim '%w' "$bmp")
    half=$(( (width - middle_px) / 2 ))    # width of one book page
    right_x=$(( (width + middle_px) / 2 )) # x offset of the right book page
    # Amount (mm) to cut off the source page to isolate each half in the
    # fallback path: the other half plus the outer margin on that side.
    left_page_trim_mm=$(px_to_mm "$(( right_x + right_px ))")
    right_page_trim_mm=$(px_to_mm "$(( right_x + left_px ))")

    convert -quiet -crop "${half}x${height}+0+0" "$bmp" "${page}.1.bmp"
    ocr_or_trim "${page}.1.bmp" "${page}.1.hocr" "${stem}.1.pdf" "$page" \
      "${left_mm}mm ${bottom_mm}mm ${left_page_trim_mm}mm ${top_mm}mm" "$lang"

    convert -quiet -crop "${half}x${height}+${right_x}+0" "$bmp" "${page}.2.bmp"
    ocr_or_trim "${page}.2.bmp" "${page}.2.hocr" "${stem}.2.pdf" "$page" \
      "${right_page_trim_mm}mm ${bottom_mm}mm ${right_mm}mm ${top_mm}mm" "$lang"
  else # one-side
    ocr_or_trim "$bmp" "${page}.hocr" "${stem}.pdf" "$page" \
      "${left_mm}mm ${bottom_mm}mm ${right_mm}mm ${top_mm}mm" "$lang"
  fi

  rm -f pg_*.bmp "$page" pg_*.hocr
}

# Entry point: validate arguments, burst the input, process every page, then
# join the pages and attach the metadata.
# Returns 2 on a usage error, 1 when the input cannot be read or burst,
# otherwise the exit status of the final ghostscript run.
main() {
  if (( $# < 8 )); then
    usage
    return 2
  fi

  local input=$1 split=$3 lang=$8
  local author=${9:-} title=${10:-}
  local -a px=()
  local value page status

  # left, top, right, bottom, middle - force base 10 so that values with
  # leading zeros are not interpreted as octal.
  for value in "$4" "$5" "$6" "$7" "${11:-0}"; do
    if ! is_uint "$value"; then
      printf 'error: "%s" is not a non-negative integer\n' "$value" >&2
      usage
      return 2
    fi
    px+=("$((10#$value))")
  done

  if [[ ! -f "$input" ]]; then
    printf 'error: input file not found: %s\n' "$input" >&2
    return 1
  fi

  # Use shared libraries from /usr/local/lib64 first, without discarding
  # whatever search path the caller already had.
  export LD_LIBRARY_PATH="/usr/local/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

  if ! pdftk "$input" burst dont_ask output pg_%04da.pdf; then
    printf 'error: pdftk could not burst %s\n' "$input" >&2
    return 1
  fi

  for page in pg_*a.pdf; do
    [[ -e "$page" ]] || continue # glob matched nothing
    process_page "$page" "$split" \
      "${px[0]}" "${px[1]}" "${px[2]}" "${px[3]}" "${px[4]}" "$lang"
  done

  printf '[ /Title (%s)\n/Author (%s)\n/DOCINFO pdfmark\n' \
    "$(pdfmark_escape "$title")" "$(pdfmark_escape "$author")" >pdfmarks

  pdfjoin -q --rotateoversize 'false' --outfile "${input}-ocr.pdf" pg_*.pdf
  rm -f pg_*.pdf
  rm -f doc_data.txt
  rm -rf pg_*_files

  gs -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -dCompatibilityLevel=1.4 \
    -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf \
    "${input}-ocr.pdf" pdfmarks
  status=$?

  rm -f pdfmarks
  return "$status"
}

main "$@"
RAW Paste Data