Advertisement
Guest User

pdf2epub

a guest
Jul 30th, 2014
225
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 8.46 KB | None | 0 0
  1. SOURCE=${1%/}
  2. DESTINATION=${2%/}
  3. FILETYPE='*.pdf'
  4.  
  5. #pdf to html(smallest file first :)
  6.  
  7. for i in $(ls -Sr $SOURCE/$FILETYPE)
  8.   do
  9.       pdf_file=$(basename $i)
  10.       echo "Converting $pdf_file to html"
  11.  
  12.      #Create supporting directory structure
  13.      dirname=$(echo $pdf_file | awk -F'.pdf' '{print $1}')
  14.      $(mkdir -p $DESTINATION/$dirname)
  15.      $(mkdir -p $DESTINATION/$dirname/META-INF)
  16.      $(mkdir -p $DESTINATION/$dirname/OEBPS)
  17.      $(mkdir -p $DESTINATION/$dirname/Images)
  18.      $(mkdir -p $DESTINATION/$dirname/Style)
  19.      $(mkdir -p $DESTINATION/$dirname/Audio)
  20.      $(mkdir -p $DESTINATION/$dirname/Video)
  21.      $(mkdir -p $DESTINATION/$dirname/Misc)
  22.      $(mkdir -p $DESTINATION/$dirname/Fonts)
  23.      $(mkdir -p $DESTINATION/$dirname/html)
  24.  
  25.      pdftohtml -q -c -enc UTF-8 -nodrm -fmt png $i $DESTINATION/$dirname/book_
  26.      
  27.      sleep 2
  28.        
  29.     #seperate png & html(i.e. pre xhtml)
  30.  
  31.     echo "Moving images to appropriate directories "
  32.     $(mv $DESTINATION/$dirname/*.png  $DESTINATION/$dirname/Images/)
  33.     $(mv $DESTINATION/$dirname/*.html  $DESTINATION/$dirname/html/)
  34.  
  35.     #update html pages with new image location
  36.     echo "Updating html with new image location"
  37.     $(sed -i "s/<IMG width=\"\(.*\)\" height=\"\(.*\)\" src=\"\(.*png\)\" alt=\"background image\"\/>/<IMG width=\"\1\" height=\"\2\" src=\"\.\.\/Images\/\3\" alt=\"backgroundimage\"\/>/Ig" $DESTINATION/$dirname/html/*.html)
  38.  
  39.   #convert html to xhtml
  40.     echo "Converting html to xhtml"
  41.    
  42.     for file in $(ls $DESTINATION/$dirname/html/*.html)
  43.       do
  44.         dst_file=$(basename $file)
  45.         $(html2xhtml $file -o $DESTINATION/$dirname/OEBPS/$dst_file)  
  46.     done
  47.    
  48.  
  49.     # Fixing the html header to make it valid xhtml
  50.     $(sed -i "/<!DOCTYPE html/{n;N;d}" $DESTINATION/$dirname/OEBPS/*.html)
  51.     $(sed -i "s/<!DOCTYPE html/<!DOCTYPE html>/" $DESTINATION/$dirname/OEBPS/*.html)
  52.    
  53.     #fix css in <body> to make it valid xhtml
  54.     for file in $(ls $DESTINATION/$dirname/OEBPS/*.html)  
  55.       do  
  56.         j=$(grep '<body' $file)
  57.         bgcolour=$(echo $j | cut -d'=' -f2 | cut -d'"' -f2)
  58.         vlink=$(echo $j | cut -d'=' -f3 | cut -d'"' -f2)
  59.         $(sed -i "s/<\/style>/\ta:link {color: $vlink}\n\ta:visited {color: $vlink}\n\ta:active {color: $vlink}\n\tbody{color: $bgcolour}\n\t<\/style>/" $file)
  60.         $(sed -i "s/<body.*\"blue\">/<body>/" $file)
  61.       done
  62.  
  63.    
  64.     sleep 2
  65.    
  66.     echo "Creating mime types & container"
  67.    
  68.     #Creating mime types
  69.     echo -n 'application/epub+zip' > $DESTINATION/$dirname/mimetype
  70.    
  71.     ##container.xml
  72.     echo -e "<?xml version=\"1.0\"?>\n<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\n\t<rootfiles>\n\t\t<rootfile full-path=\"package.opf\" media-type=\"application/oebps-package+xml\"/>\n\t</rootfiles>\n</container>" > $DESTINATION/$dirname/META-INF/container.xml
  73.  
  74.     echo "Generating uuid for book"
  75.     book_uuid=$(uuidgen)
  76.    
  77.     echo "Populate OPF file"
  78.    
  79.     echo -e "<?xml version='1.0' encoding='utf-8'?>\n<package xmlns=\"http://www.idpf.org/2007/opf\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:dcterms=\"http://purl.org/dc/terms/\" version=\"3.0\" xml:lang=\"en\" unique-identifier=\"book_id\">\n<metadata>\n\t<dc:identifier id=\"book_id\">urn::uuid::$book_uuid</dc:identifier>\n\t<dc:language>en</dc:language>\n\t<dc:title>$dirname</dc:title>\n\t<meta property=\"dcterms:modified\">$(date +%Y-%m-%d)T$(date +%H:%M:%S)Z</meta>\n</metadata>\n" > $DESTINATION/$dirname/package.opf
  80.    
  81.  
  82.     echo "<manifest>" >> $DESTINATION/$dirname/package.opf
  83.  
  84.     #Populating OPF file with Html files
  85.     last_file=$(ls  $DESTINATION/$dirname/OEBPS/*.html | awk -F'-' '{print $2}' | awk -F'.html' '{print $1}' | sort -n | tail -n 1)
  86.     pagename=$(basename $(ls $DESTINATION/$dirname/OEBPS/*.html| head -1 | awk -F'-' '{print $1}'))    
  87.  
  88.    
  89.     for j in $(seq 1 $last_file)
  90.       do
  91.         echo -e "<item href=\"OEBPS/$pagename-$j.html\"\tid=\"html_$j\"\tmedia-type=\"application/xhtml+xml\"/>" >> $DESTINATION/$dirname/package.opf
  92.       done    
  93.    
  94.    #Populating OPF file with Image files
  95.    last_png=$(ls $DESTINATION/$dirname/Images/*.png | awk -F'book_' '{print $2}' | awk -F'.png' '{print $1}' | sort -n | tail -n 1)    
  96.    
  97.    for j in $(seq -w 1 $last_png)
  98.      do
  99.        echo -e "<item href=\"Images/$pagename$j.png\"\tid=\"png_$j\"\tmedia-type=\"image/png\"/>" >> $DESTINATION/$dirname/package.opf
  100.      done
  101.  
  102.    #Populating OPF File with toc and, nav file  
  103.    echo  -e "<item href=\"OEBPS/toc.ncx\"\tid=\"ncx\"\tmedia-type=\"application/x-dtbncx+xml\"/>\n<item properties=\"nav\" id=\"nav\" href=\"OEBPS/nav.html\" media-type=\"application/xhtml+xml\"/>\n</manifest>\n<spine toc=\"ncx\">" >> $DESTINATION/$dirname/package.opf
  104.  
  105.   #Populating OPF file with spine tree
  106.  
  107.   for j in $(cut -f2 $DESTINATION/$dirname/package.opf | grep  'id="html_' | cut -d'=' -f2 | cut -d'"' -f2)
  108.     do
  109.       echo -e "\t<itemref\tidref=\"$j\"/>" >> $DESTINATION/$dirname/package.opf
  110.   done  
  111.  
  112.  
  113.   echo -e "\t<itemref idref=\"nav\" linear=\"no\"/>\n</spine>\n</package>" >> $DESTINATION/$dirname/package.opf
  114.  
  115.   sleep 2
  116.  
  117.   #Fix outline Before using it to generate TOC
  118.   $(awk '/<a href=".*html"\>.*$/ { printf("%s ",$0); next } 1' $DESTINATION/$dirname/OEBPS/$pagename-outline.html | sed -e "s/ \+/ /g" | sed -e "s/<\/a> <\/li>/<\/a>\n<\/li>/g" > $DESTINATION/$dirname/OEBPS/outline.html)
  119.  
  120.   #Creating Index Data for TOC
  121.   $(sed -e "s/<a href=\"\(.*html\)\">\(.*\)<\/a>/#\"\1\"#\"\2\"/g" $DESTINATION/$dirname/OEBPS/outline.html | grep "#" | sed -e "s/\(.*\)\W<.*>/\1/g" | sed "s/ /_/g" | sed "s/\"//g" > $DESTINATION/$dirname/OEBPS/index_data.txt)
  122.  
  123.  
  124.   #Generating TOC(ncx)
  125.   echo "Generating TOC(ncx)"
  126.  
  127.   echo -e "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<ncx xmlns=\"http://www.daisy.org/z3986/2005/ncx/\" version=\"2005-1\" xml:lang=\"eng\">\n\t<head>\n\t\t<meta name=\"dtb:uid\"\tcontent=\"urn::uuid::$book_uuid\"/>\n\t\t<meta name=\"dbt:depth\"\tcontent=\"0\"/>\n\t\t<meta name=\"dbt:totalPageCount\"\tcontent=\"0\"/>\n\t\t<meta name=\"dbt:maxPageNumber\"\tcontent=\"0\"/>\n\t</head>\n<docTitle>\n\t<text>$dirname</text>\n</docTitle>\n<navMap>" > $DESTINATION/$dirname/OEBPS/toc.ncx
  128.  
  129.   #create Nav Data from index_data.txt
  130.   count=1
  131.   for j in $(cat $DESTINATION/$dirname/OEBPS/index_data.txt)
  132.     do
  133.       link=$(echo $j | cut -d'#' -f2)
  134.       title=$(echo $j | cut -d'#' -f3)
  135.       title=$(echo $title | sed -e 's/"//g' | sed -e 's/_/ /g')
  136.       echo -e "\t<navPoint class=\"title\" id=\"title_$count\" playOrder=\"$count\">\n\t\t<navLabel>\n\t\t\t<text>$title</text>\n\t\t</navLabel>\n\t\t<content src=\"$link#ref$count\"/>\n\t</navPoint>" >> $DESTINATION/$dirname/OEBPS/toc.ncx
  137.       count=$(expr $count + 1)
  138.     done  
  139.  
  140.   echo -e "</navMap>\n</ncx>" >> $DESTINATION/$dirname/OEBPS/toc.ncx
  141.  
  142.   #fixing the toc.ncx ref error
  143.   for j in $(grep '<text>\|src=' $DESTINATION/$dirname/OEBPS/toc.ncx | grep '#'  | sed 's/^[ \t]*//;s/[ \t]*$//' | awk -F'"' '{print $2}')
  144.     do  
  145.       page=$(echo $j| cut -d'#' -f1)
  146.       ref=$(echo $j | cut -d'#' -f2)  
  147.       $(sed -i "s/<body>/<body>\n\t<a id=\"$ref\"\><\/a>/" $DESTINATION\/$dirname\/OEBPS\/$page)
  148.     done
  149.  
  150.   #Generating  Navigation html(nav.html)
  151.   echo "Generating  Navigation html(nav.html)"
  152.  
  153.   $(awk '/<a href=".*html"\>.*$/ { printf("%s ",$0); next } 1' $DESTINATION/$dirname/OEBPS/$pagename-outline.html | sed -e 's/ \+/ /g' | sed -e 's/<\/a> <\(.*\)>/<\/a>\n<\1>/g' > $DESTINATION/$dirname/OEBPS/nav.html)
  154.  
  155.  
  156.   $(sed -i 's/<li>/\t<li>/g' $DESTINATION/$dirname/OEBPS/nav.html)
  157.   $(sed -i 's/<\/li>/\t<\/li>/g' $DESTINATION/$dirname/OEBPS/nav.html)
  158.   $(sed -i 's/<a href=/\t\t<a href=/g' $DESTINATION/$dirname/OEBPS/nav.html)
  159.   $(sed -i "s/<body>/<body>\n<nav id=\"toc\" epub:type=\"toc\">/" $DESTINATION/$dirname/OEBPS/nav.html)
  160.   $(sed -i "s/<\/body>/<\/nav>\n<\/body>/" $DESTINATION/$dirname/OEBPS/nav.html)
  161.   $(sed -i "s/^[ \t]*//;s/[ \t]*$//" $DESTINATION/$dirname/OEBPS/nav.html)
  162.  
  163.  
  164.   #Delete the unwanted files and, directory
  165.  
  166.   echo "Deleting Unwanted files and directories"
  167.  
  168.   delete_base=$DESTINATION/$dirname
  169.   $(rm -f  $delete_base/OEBPS/$pagename\_ind.html)
  170.   $(rm -f  $delete_base/OEBPS/$pagename.html)
  171.   $(rm -f  $delete_base/OEBPS/$pagename-outline.html)
  172.   $(rm -f  $delete_base/OEBPS/outline.html)
  173.   $(rm -f  $delete_base/OEBPS/index_data.txt)
  174.   $(rm -f $DESTINATION/$dirname/*.html)
  175.   $(rm -rf $delete_base/html)
  176.  
  177.   echo "Done with $i"
  178.  
  179. done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement