Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# pdf2epub: convert every PDF found directly inside SOURCE_DIR into an
# unpacked EPUB 3 directory tree (mimetype, META-INF/, OEBPS/, Images/,
# package.opf, ...) under DESTINATION_DIR/<pdf name without extension>/.
#
# Usage:    bash <this script> SOURCE_DIR DESTINATION_DIR
# Requires: bash, pdftohtml, html2xhtml, uuidgen, and GNU sed
#           (sed -i, "\n" in replacements, the I flag and \W are GNU features).
#
# `set -e` is deliberately not used: a failure on one book is reported and the
# batch continues with the next one.

SOURCE=${1%/}
DESTINATION=${2%/}
FILETYPE='*.pdf'
# Output prefix handed to pdftohtml; every generated page and image name
# starts with it (book_-1.html, book_001.png, book_-outline.html, ...).
PAGE_PREFIX='book_'

# Unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Derive the book (directory) name from a PDF path.
# Arguments: $1 - path to a PDF file
# Outputs:   the file name without directory and without a trailing ".pdf"
#######################################
book_name_from_pdf() {
  local name=${1##*/}
  # Strip only a literal ".pdf" suffix; splitting on the regex ".pdf" (as awk
  # -F'.pdf' does) would truncate names such as "my_pdf_book.pdf" to "my".
  printf '%s\n' "${name%.pdf}"
}

#######################################
# Escape the characters that are special in XML text and attribute values.
# Arguments: $1 - raw text
# Outputs:   escaped text
#######################################
xml_escape() {
  printf '%s\n' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

#######################################
# List the PDFs in a directory, smallest file first, NUL-delimited.
# Globals:   FILETYPE (read)
# Arguments: $1 - directory to scan (non-recursive)
# Outputs:   one NUL-terminated path per matching regular file
#######################################
list_pdfs_by_size() {
  local dir=$1 pdf size entry
  # $FILETYPE is intentionally unquoted so that it acts as a glob pattern.
  for pdf in "$dir"/$FILETYPE; do
    [[ -f "$pdf" ]] || continue
    size=$(wc -c <"$pdf") || continue
    printf '%d\t%s\0' "$size" "$pdf"
  done \
    | sort -z -s -t $'\t' -k1,1n \
    | while IFS= read -r -d '' entry; do
        printf '%s\0' "${entry#*$'\t'}"
      done
}

#######################################
# Create the directory skeleton of one book.
# Arguments: $1 - book directory
# Returns:   non-zero if a directory cannot be created
#######################################
create_book_tree() {
  local book_dir=$1 sub
  for sub in META-INF OEBPS Images Style Audio Video Misc Fonts html; do
    mkdir -p -- "$book_dir/$sub" || return
  done
}

#######################################
# Find the highest page number among <prefix>-<N>.html files.
# Globals:   PAGE_PREFIX (read)
# Arguments: $1 - directory holding the converted pages
# Outputs:   the highest N, or 0 when there is no numbered page
#######################################
max_page_number() {
  local dir=$1 page num max=0
  for page in "$dir/$PAGE_PREFIX"-*.html; do
    num=${page##*/}
    num=${num#"$PAGE_PREFIX"-}
    num=${num%.html}
    # Skips non-page files such as <prefix>-outline.html.
    [[ "$num" =~ ^[0-9]+$ ]] || continue
    if ((10#$num > max)); then
      max=$((10#$num))
    fi
  done
  printf '%d\n' "$max"
}

#######################################
# Move the CSS-relevant <body> attributes into the page's <style> block so
# that the <body> tag itself becomes valid XHTML.
# Arguments: $1 - XHTML page to rewrite in place
#######################################
fix_body_css() {
  local file=$1 body_tag bgcolour vlink
  body_tag=$(grep -m 1 '<body' -- "$file")
  # Second and third attribute values of the <body ...> tag.
  bgcolour=$(printf '%s\n' "$body_tag" | cut -d'=' -f2 | cut -d'"' -f2)
  vlink=$(printf '%s\n' "$body_tag" | cut -d'=' -f3 | cut -d'"' -f2)
  # The bgcolor attribute maps to background-color, not to the text colour.
  sed -i "s/<\/style>/\ta:link {color: $vlink}\n\ta:visited {color: $vlink}\n\ta:active {color: $vlink}\n\tbody{background-color: $bgcolour}\n\t<\/style>/" -- "$file"
  sed -i 's/<body.*"blue">/<body>/' -- "$file"
}

# Print the fixed META-INF/container.xml document.
container_document() {
  printf '<?xml version="1.0"?>\n'
  printf '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
  printf '\t<rootfiles>\n'
  printf '\t\t<rootfile full-path="package.opf" media-type="application/oebps-package+xml"/>\n'
  printf '\t</rootfiles>\n'
  printf '</container>\n'
}

#######################################
# Print the complete package.opf document.
# Globals:   PAGE_PREFIX (read)
# Arguments: $1 - book directory (its Images/ folder is scanned)
#            $2 - book title (raw; escaped here)
#            $3 - book UUID
#            $4 - number of the last page
#######################################
opf_document() {
  local book_dir=$1 title uuid=$3 last_page=$4
  local page img num
  title=$(xml_escape "$2")

  printf '%s\n' "<?xml version='1.0' encoding='utf-8'?>"
  printf '%s\n' '<package xmlns="http://www.idpf.org/2007/opf" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" version="3.0" xml:lang="en" unique-identifier="book_id">'
  printf '<metadata>\n'
  printf '\t<dc:identifier id="book_id">urn:uuid:%s</dc:identifier>\n' "$uuid"
  printf '\t<dc:language>en</dc:language>\n'
  printf '\t<dc:title>%s</dc:title>\n' "$title"
  # The trailing Z means UTC, so the timestamp must be taken in UTC.
  printf '\t<meta property="dcterms:modified">%s</meta>\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  printf '</metadata>\n\n'

  printf '<manifest>\n'
  for ((page = 1; page <= last_page; page++)); do
    printf '<item href="OEBPS/%s-%d.html"\tid="html_%d"\tmedia-type="application/xhtml+xml"/>\n' \
      "$PAGE_PREFIX" "$page" "$page"
  done
  # List the images that actually exist (names are zero padded, so glob order
  # is page order).
  for img in "$book_dir/Images/$PAGE_PREFIX"*.png; do
    [[ -e "$img" ]] || continue
    num=${img##*/}
    num=${num#"$PAGE_PREFIX"}
    num=${num%.png}
    printf '<item href="Images/%s%s.png"\tid="png_%s"\tmedia-type="image/png"/>\n' \
      "$PAGE_PREFIX" "$num" "$num"
  done
  printf '<item href="OEBPS/toc.ncx"\tid="ncx"\tmedia-type="application/x-dtbncx+xml"/>\n'
  printf '<item properties="nav" id="nav" href="OEBPS/nav.html" media-type="application/xhtml+xml"/>\n'
  printf '</manifest>\n'

  printf '<spine toc="ncx">\n'
  for ((page = 1; page <= last_page; page++)); do
    printf '\t<itemref\tidref="html_%d"/>\n' "$page"
  done
  printf '\t<itemref idref="nav" linear="no"/>\n'
  printf '</spine>\n</package>\n'
}

#######################################
# Print the outline page with every "<a href=...html>..." line joined to the
# line that follows it and runs of spaces squeezed.
# Arguments: $1 - outline page
#######################################
joined_outline() {
  # A plain ">" is used: "\>" is an end-of-word anchor in gawk and would
  # prevent the pattern from ever matching there.
  awk '/<a href=".*html">.*$/ { printf("%s ",$0); next } 1' "$1" \
    | sed -e 's/ \+/ /g'
}

#######################################
# Extract the table-of-contents entries from the outline page.
# Arguments: $1 - outline page
# Outputs:   one line per entry: <leading markup>#<link>#<title>
#######################################
outline_entries() {
  joined_outline "$1" \
    | sed -e 's/<\/a> <\/li>/<\/a>\n<\/li>/g' \
    | sed -e 's/<a href="\(.*html\)">\(.*\)<\/a>/#"\1"#"\2"/g' \
    | grep '#' \
    | sed -e 's/\(.*\)\W<.*>/\1/g' \
    | sed -e 's/"//g'
}

#######################################
# Print the opening part of toc.ncx (up to and including <navMap>).
# Arguments: $1 - book title (raw; escaped here)
#            $2 - book UUID
#######################################
ncx_header() {
  local title uuid=$2
  title=$(xml_escape "$1")
  printf '<?xml version="1.0" encoding="utf-8"?>\n'
  printf '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng">\n'
  printf '\t<head>\n'
  # Must be the same identifier as dc:identifier in package.opf.
  printf '\t\t<meta name="dtb:uid"\tcontent="urn:uuid:%s"/>\n' "$uuid"
  printf '\t\t<meta name="dtb:depth"\tcontent="0"/>\n'
  printf '\t\t<meta name="dtb:totalPageCount"\tcontent="0"/>\n'
  printf '\t\t<meta name="dtb:maxPageNumber"\tcontent="0"/>\n'
  printf '\t</head>\n'
  printf '<docTitle>\n\t<text>%s</text>\n</docTitle>\n' "$title"
  printf '<navMap>\n'
}

#######################################
# Print one <navPoint> element of toc.ncx.
# Arguments: $1 - 1-based position (id, playOrder and anchor number)
#            $2 - target page (href taken from the outline)
#            $3 - entry title (already HTML-escaped markup text)
#######################################
ncx_nav_point() {
  local index=$1 link=$2 title=$3
  printf '\t<navPoint class="title" id="title_%s" playOrder="%s">\n' "$index" "$index"
  printf '\t\t<navLabel>\n\t\t\t<text>%s</text>\n\t\t</navLabel>\n' "$title"
  printf '\t\t<content src="%s#ref%s"/>\n' "$link" "$index"
  printf '\t</navPoint>\n'
}

#######################################
# Print nav.html, built from the outline page.
# Arguments: $1 - outline page
#######################################
nav_document() {
  # Separate sed processes are used wherever a stage inserts newlines, so that
  # the following stage sees the new lines individually.
  joined_outline "$1" \
    | sed -e 's/<\/a> <\(.*\)>/<\/a>\n<\1>/g' \
    | sed -e 's/<li>/\t<li>/g' -e 's/<\/li>/\t<\/li>/g' -e 's/<a href=/\t\t<a href=/g' \
    | sed -e 's/<body>/<body>\n<nav id="toc" epub:type="toc">/' \
          -e 's/<\/body>/<\/nav>\n<\/body>/' \
    | sed -e 's/^[ \t]*//;s/[ \t]*$//'
}

#######################################
# Convert one PDF into an unpacked EPUB tree.
# Globals:   DESTINATION, PAGE_PREFIX (read)
# Arguments: $1 - path to the PDF
# Returns:   non-zero when the tree cannot be created, pdftohtml fails or no
#            UUID can be generated
#######################################
convert_book() {
  local pdf=$1
  local pdf_file=${pdf##*/}
  local book_name book_dir oebps book_uuid last_page outline
  local page page_file prefix link title rest index
  local -a images raw_pages xhtml_pages leftovers

  book_name=$(book_name_from_pdf "$pdf")
  book_dir=$DESTINATION/$book_name
  oebps=$book_dir/OEBPS

  echo "Converting $pdf_file to html"
  create_book_tree "$book_dir" || return
  pdftohtml -q -c -enc UTF-8 -nodrm -fmt png "$pdf" "$book_dir/$PAGE_PREFIX" || return

  # Separate png and html (i.e. pre-xhtml) output.
  echo "Moving images to appropriate directories "
  images=("$book_dir"/*.png)
  if ((${#images[@]})); then
    mv -- "${images[@]}" "$book_dir/Images/"
  fi
  raw_pages=("$book_dir"/*.html)
  if ((${#raw_pages[@]})); then
    mv -- "${raw_pages[@]}" "$book_dir/html/"
  fi

  # Point the background images at their new location.
  echo "Updating html with new image location"
  raw_pages=("$book_dir"/html/*.html)
  if ((${#raw_pages[@]})); then
    sed -i 's/<IMG width="\(.*\)" height="\(.*\)" src="\(.*png\)" alt="background image"\/>/<IMG width="\1" height="\2" src="\.\.\/Images\/\3" alt="backgroundimage"\/>/Ig' -- "${raw_pages[@]}"
  fi

  echo "Converting html to xhtml"
  for page in "${raw_pages[@]}"; do
    html2xhtml "$page" -o "$oebps/${page##*/}"
  done

  xhtml_pages=("$oebps"/*.html)
  if ((${#xhtml_pages[@]})); then
    # Fix the html header to make it valid xhtml: drop the two lines after
    # the DOCTYPE line, then close the DOCTYPE declaration.
    sed -i '/<!DOCTYPE html/{n;N;d}' -- "${xhtml_pages[@]}"
    sed -i 's/<!DOCTYPE html/<!DOCTYPE html>/' -- "${xhtml_pages[@]}"
  fi
  for page in "${xhtml_pages[@]}"; do
    fix_body_css "$page"
  done

  echo "Creating mime types & container"
  # The mimetype file must not end with a newline.
  printf '%s' 'application/epub+zip' >"$book_dir/mimetype"
  container_document >"$book_dir/META-INF/container.xml"

  echo "Generating uuid for book"
  book_uuid=$(uuidgen) || return

  echo "Populate OPF file"
  last_page=$(max_page_number "$oebps")
  opf_document "$book_dir" "$book_name" "$book_uuid" "$last_page" >"$book_dir/package.opf"

  outline=$oebps/$PAGE_PREFIX-outline.html
  if [[ ! -f "$outline" ]]; then
    warn "No outline page for $pdf_file; the table of contents will be empty"
    outline=/dev/null
  fi

  echo "Generating TOC(ncx)"
  {
    ncx_header "$book_name" "$book_uuid"
    index=1
    while IFS='#' read -r prefix link title rest; do
      ncx_nav_point "$index" "$link" "$title"
      # Give the target page the anchor that the navPoint refers to.
      page_file=$oebps/$link
      if [[ -f "$page_file" ]]; then
        sed -i "s/<body>/<body>\n\t<a id=\"ref$index\"><\/a>/" -- "$page_file"
      else
        warn "TOC target not found: $page_file"
      fi
      index=$((index + 1))
    done < <(outline_entries "$outline")
    printf '</navMap>\n</ncx>\n'
  } >"$oebps/toc.ncx"

  echo "Generating Navigation html(nav.html)"
  nav_document "$outline" >"$oebps/nav.html"

  echo "Deleting Unwanted files and directories"
  rm -f -- "$oebps/${PAGE_PREFIX}_ind.html" \
    "$oebps/$PAGE_PREFIX.html" \
    "$oebps/$PAGE_PREFIX-outline.html"
  leftovers=("$book_dir"/*.html)
  if ((${#leftovers[@]})); then
    rm -f -- "${leftovers[@]}"
  fi
  # ":?" aborts the expansion rather than ever running "rm -rf /html".
  rm -rf -- "${book_dir:?}/html"

  echo "Done with $pdf"
}

#######################################
# Entry point: validate the invocation, then convert every PDF, smallest
# file first.
# Globals:   SOURCE (read)
# Arguments: SOURCE_DIR DESTINATION_DIR
# Returns:   2 on usage error, 1 if a tool is missing or a book failed
#######################################
main() {
  local tool pdf status=0

  if (($# < 2)); then
    warn "Usage: ${0##*/} SOURCE_DIR DESTINATION_DIR"
    return 2
  fi
  for tool in pdftohtml html2xhtml uuidgen; do
    if ! command -v "$tool" >/dev/null; then
      warn "Missing required tool: $tool"
      return 1
    fi
  done
  if [[ ! -d "$SOURCE/" ]]; then
    warn "Source directory not found: $SOURCE"
    return 1
  fi

  # File descriptor 3 keeps the list away from commands that read stdin.
  while IFS= read -r -d '' -u 3 pdf; do
    if ! convert_book "$pdf"; then
      warn "Failed to convert $pdf"
      status=1
    fi
  done 3< <(list_pdfs_by_size "$SOURCE")

  return "$status"
}

main "$@"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement