Advertisement
konradvoelkel

Shell Script to Scan directly to OCRed PDF/A

Mar 24th, 2013
2,772
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 1.19 KB | None | 0 0
  #!/usr/bin/env bash
  # 2013 by Konrad Voelkel, public domain
  #
  # Scan a single page and write it as a searchable (OCRed) PDF with metadata.
  #
  # Usage: ./scan-archive.sh filename.pdf [title] [subject] [keywords]
  #
  # Pipeline: scanimage -> scantailor-cli -> tiff2pdf -> pdftk (flatten) ->
  # convert -> tesseract (hOCR) -> hocr2pdf -> pdftk (restore metadata) -> JHOVE.
  #
  # Optional environment overrides (defaults match the historical behavior):
  #   SCAN_AUTHOR  author written into the PDF metadata (default "Author Name")
  #   SCAN_LANG    tesseract language code              (default "deu")
  #   JHOVE_JAR    path to JhoveApp.jar                 (default "jhove/bin/JhoveApp.jar")
  #
  # All intermediate files live in a private temporary directory that is removed
  # on every exit path, so a failed run never leaves debris behind, never deletes
  # an unrelated ./cache directory, and never clobbers an existing output file
  # before the new one is complete.
  set -euo pipefail

  work_dir=''

  # Print an error message to stderr and abort.
  die() {
    printf 'error: %s\n' "$*" >&2
    exit 1
  }

  # Print the usage line to stderr.
  usage() {
    printf 'usage: ./scan-archive.sh filename.pdf title subject keywords\n' >&2
  }

  # Remove the temporary working directory, if one was created.
  cleanup() {
    if [[ -n "$work_dir" ]]; then
      rm -rf -- "$work_dir"
    fi
  }

  #######################################
  # Run the whole scan -> OCR -> PDF pipeline.
  # Globals:   work_dir (written), SCAN_AUTHOR, SCAN_LANG, JHOVE_JAR (read)
  # Arguments: $1 - output PDF path (required)
  #            $2 - title, $3 - subject, $4 - keywords (optional, default empty)
  # Outputs:   progress messages and the JHOVE status lines on stdout
  # Returns:   exit status of the final validation pipeline; aborts earlier
  #            with a non-zero status if any processing step fails
  #######################################
  main() {
    if [[ "${1-}" == "-h" || "${1-}" == "--help" ]]; then
      usage
      exit 0
    fi
    if [[ -z "${1-}" ]]; then
      usage
      exit 2
    fi

    local out_pdf=$1
    local title=${2-}
    local subject=${3-}
    local keywords=${4-}
    local author=${SCAN_AUTHOR:-Author Name}
    local ocr_lang=${SCAN_LANG:-deu}
    local jhove_jar=${JHOVE_JAR:-jhove/bin/JhoveApp.jar}

    # Fail before touching the scanner if a processing tool is missing.
    local tool
    for tool in scanimage scantailor-cli tiff2pdf pdftk convert tesseract hocr2pdf; do
      command -v "$tool" >/dev/null 2>&1 || die "required tool not found: $tool"
    done

    work_dir=$(mktemp -d) || die "could not create temporary directory"
    trap cleanup EXIT

    printf 'scanning "%s" on "%s" about "%s"... (%s)\n' \
      "$title" "$subject" "$keywords" "$out_pdf"
    scanimage --mode Color --depth 8 --resolution 600 --format pnm \
      > "$work_dir/out.pnm"

    printf 'processing... (%s)\n' "$out_pdf"
    # scantailor-cli writes out.tif (plus its cache directory) into the output
    # directory given as the last argument.
    scantailor-cli --color-mode=black_and_white --despeckle=normal \
      "$work_dir/out.pnm" "$work_dir"
    [[ -f "$work_dir/out.tif" ]] || die "scantailor-cli produced no out.tif"

    tiff2pdf -o "$work_dir/raw.pdf" -z -u m -p "A4" -F \
      -c "scanimage+unpaper+tiff2pdf+pdftk+imagemagick+tesseract+exactimage" \
      -a "$author" -t "$title" -s "$subject" -k "$keywords" \
      "$work_dir/out.tif"

    printf 'converting to PDF 1.4 (%s)...\n' "$out_pdf"
    # Save the metadata now: hocr2pdf builds a fresh PDF that does not carry it.
    pdftk "$work_dir/raw.pdf" dump_data > "$work_dir/data_dump.info"
    pdftk "$work_dir/raw.pdf" cat output "$work_dir/flat.pdf" flatten

    printf 'OCR in lang %s... (%s)\n' "$ocr_lang" "$out_pdf"
    convert -normalize -density 300 -depth 8 \
      "$work_dir/flat.pdf" "$work_dir/page.png"
    # NOTE(review): newer Tesseract releases appear to spell this option
    # `--psm`; confirm against the installed version before changing it.
    tesseract -l "$ocr_lang" -psm 1 "$work_dir/page.png" "$work_dir/page" hocr

    # Depending on the Tesseract version the hOCR file ends in .hocr or .html.
    local hocr_file="$work_dir/page.hocr"
    [[ -f "$hocr_file" ]] || hocr_file="$work_dir/page.html"
    [[ -f "$hocr_file" ]] || die "tesseract produced no hOCR output"

    convert "$work_dir/page.png" "$work_dir/page.jpg"
    hocr2pdf -i "$work_dir/page.jpg" -s -o "$work_dir/ocr.pdf" < "$hocr_file"

    printf 'Inserting metadata... (%s)\n' "$out_pdf"
    # The destination is written only here, once every earlier step succeeded.
    pdftk "$work_dir/ocr.pdf" update_info "$work_dir/data_dump.info" \
      output "$out_pdf"

    printf 'done. wrote file. (%s)\n' "$out_pdf"
    printf 'validating... (%s)\n' "$out_pdf"
    # Last command: its status (with pipefail) becomes the script's exit status.
    java -jar "$jhove_jar" -m PDF-hul "$out_pdf" | grep -E "Status|Message"
  }

  main "$@"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement