GNU autotools

#!/usr/bin/env python
'''
This file is not free software. You are allowed to freely copy,
distribute and modify it, as long as this copyright notice
and license conditions remains.
You are NOT given permission to execute this program.

This file is not even intended to be a running program at all.
It's intended to be a basis for educational discussions about
using latex as a generator for beautiful documents.

'''

# Deliberate guard: the license text above withholds permission to execute
# this program, so the module aborts here, before any of the imports or
# definitions below are evaluated.
raise NotImplementedError("You are not allowed to execute this file!")

import os
import re
import shutil
import urllib2

# Download manifest: every entry is (local target directory, URL).
# fetch_sources() stores each URL under its directory, using the last URL
# path component as file name; copy_images_to_tex() scans the directories
# for .png files.
_sources = [
    # title page and cover image
    ('work/t', 'http://nostarch.com/autotools.htm'),
    ('work/t', 'http://nostarch.com/sites/default/files/imagecache/product_full/autotools_big.png'),

    # foreword
    ('work/c0', 'http://www.freesoftwaremagazine.com/books/autotools_a_guide_to_autoconf_automake_libtool'),

    # chapter 1 and its figures
    ('work/c1', 'http://www.freesoftwaremagazine.com/books/agaal/brief_introduction_to_gnu_autotools'),
    ('work/c1', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2754/autoconf_ahdr_dataflow.png'),
    ('work/c1', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2754/aclocal_dataflow.png'),
    ('work/c1', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2754/automake_libtool_dataflow.png'),
    ('work/c1', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2754/configure_dataflow.png'),
    ('work/c1', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2754/make_dataflow.png'),

    # chapters 2 to 9 (with figures where present)
    ('work/c2', 'http://www.freesoftwaremagazine.com/books/agaal/gnu_coding_standards_applied_to_autotools'),
    ('work/c2', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2763/compile_link_process.png'),
    ('work/c3', 'http://www.freesoftwaremagazine.com/books/agaal/configuring_a_project_with_autoconf'),
    ('work/c4', 'http://www.freesoftwaremagazine.com/books/agaal/automatically_writing_makefiles_with_autotools'),
    ('work/c5', 'http://www.freesoftwaremagazine.com/books/agaal/building_shared_libraries_once_using_autotools'),
    ('work/c5', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2764/exe_load.png'),
    ('work/c5', 'http://www.freesoftwaremagazine.com/files/www.freesoftwaremagazine.com/nodes/2764/lib_load.png'),
    ('work/c6', 'http://www.freesoftwaremagazine.com/books/agaal/autotools_example'),
    ('work/c7', 'http://www.freesoftwaremagazine.com/books/agaal/catalog_of_reusable_solutions'),
    ('work/c8', 'http://www.freesoftwaremagazine.com/books/agaal/overview_of_m4_overview'),
    ('work/c9', 'http://www.freesoftwaremagazine.com/books/agaal/reusing_autotools_solutions'),
]

# Chapter titles in output order. Seeded with the fixed title of the
# foreword; read_chapter_titles() appends one title per remaining chapter
# file and insert_chapters() looks titles up by position.
_chapter_names = ["Foreword"]


# Ordered list of (regex pattern, replacement template) pairs used to turn
# the downloaded HTML into LaTeX. regex_transformations() applies them one
# after another to the whole text, so the order is significant: e.g. the
# combined "heading followed by <pre>" rule has to run before the plain
# <h2>/<h3> and <pre> rules further down.
#
# The replacements are `re` templates: "\\1", "\\2", ... insert captured
# groups, and four backslashes in the source ("\\\\") end up as a single
# literal backslash in the generated LaTeX.
_regex_transforms = [
    ("<div\s*class=\"textbox\">\s*<h3>(.*?)</h3>([\s\S]*?)</div>",\
'''
\\\\begin{quotation}
{\\\\color{gray}\\\\noindent\\\\rule{0.89\\\\textwidth}{2pt}}
\\\\subsection*{\\1}

\\2

{\\\\color{gray}\\\\noindent\\\\rule{0.89\\\\textwidth}{2pt}}
\\\\end{quotation}

'''),\
    ("<!--.*?-->", ""),\
    ("&lt;", "<"),\
    ("&gt;", ">"),\
    ("&amp;", "&"),\
    ("&#8216;", "`"),\
    ("&#8217;", "'"),\
    ("&#8220;", "``"),\
    ("&#8221;", "''"),\
    ("&#8212;", "--"),\
    ("&#8230;", "\\\\ldots "),\
    ("<h[23]>(.+)</h[23]>\s*<pre>", "\\\\begin{lstlisting}[caption=\\1]"),\
    ("</pre>", "\n\\\\end{lstlisting}"),\
    ("<dl class=\"image\"><dt><a href=\"/files/www.freesoftwaremagazine.com/nodes/\d*/([^\"]+)\"><img src=\"[^\"]+\" alt=\"Figure \d*: ([^\"]+)\" title=\"[^\"]+\"></a></dt><dd><caption>[^\"]+</caption></dd></dl>",\
'''
(see fig. \\\\ref{fig-\\1})

\\\\begin{figure}[ht!]
    \\\\centering
    \\\\includegraphics[width=\\\\textwidth]{img/\\1}
    \\\\caption{\\2}
    \\\\label{fig-\\1}
\\\\end{figure}

'''),\
    ("<a\s+href=\"([^\"]+)\">(.*?)</a>", "\\2\\\\footnote{\\\\url{\\1}}"),\
    ("<p>" , ""),\
    ("</p>", "\n"),\
    ("<h1[^>]*>", "\\\\section{"),\
    ("</h1>", "}"),\
    ("<h2>", "\\\\subsection{"),\
    ("</h2>", "}"),\
    ("<h3>", "\\\\subsubsection{"),\
    ("</h3>", "}"),\
    ("<code>", "\\\\texttt{"),\
    ("</code>", "}"),\
    ("<em>", "\\\\textit{"),\
    ("</em>", "}"),\
    ("<i>", "\\\\textit{"),\
    ("</i>", "}"),\
    ("<b>", "\\\\textbf{"),\
    ("</b>", "}"),\
    ("<strong>", "\\\\textbf{"),\
    ("</strong>", "}"),\
    ("<pre>", "\\\\begin{lstlisting}"),\
    ("</pre>", "\n\\\\end{lstlisting}"),\
    ("<blockquote>", "\\\\begin{quotation}"),\
    ("</blockquote>", "\n\\\\end{quotation}"),\
    ("<ul>", "\\\\begin{itemize}"),\
    ("</ul>", "\\\\end{itemize}"),\
    ("<ol>", "\\\\begin{enumerate}"),\
    ("</ol>", "\\\\end{enumerate}"),\
    ("<li>", "\\\\item "),\
    ("</li>", "\n"),\
    ("Chapter (\d)", "Chapter \\\\ref{chap-\\1}"),\
    ("Chapters (\d), (\d) and (\d)", "Chapters \\\\ref{chap-\\1}, \\\\ref{chap-\\2} and \\\\ref{chap-\\3}"),\
    ("Appendix A", "Chapter \\\\ref{chap-8}"),\
    ("Finally, the References section", "Finally Chapter \\\\ref{chap-9}"),\
    ("\\\\section{About the Author}", "\\\\section*{About the Author}\n\n"),\
#last resort rules: remove missed html tags:
    ("</?div[^>]*>", ""),\
    ("<hr[^>]*>", "")\
]

# Conversion plan: every entry is (tex destination, html source).
# Entry 0 is the title text, entry 1 the foreword, the rest are the
# numbered chapters in book order.
_tex_files = [
    ('work/tex/titletext.tex', 'work/t/autotools.htm'),
    ('work/tex/00_foreword.tex', 'work/c0/autotools_a_guide_to_autoconf_automake_libtool'),
    ('work/tex/01_introduction.tex', 'work/c1/brief_introduction_to_gnu_autotools'),
    ('work/tex/02_gnu_coding_standards.tex', 'work/c2/gnu_coding_standards_applied_to_autotools'),
    ('work/tex/03_configure.tex', 'work/c3/configuring_a_project_with_autoconf'),
    ('work/tex/04_automake.tex', 'work/c4/automatically_writing_makefiles_with_autotools'),
    ('work/tex/05_shared_libs.tex', 'work/c5/building_shared_libraries_once_using_autotools'),
    ('work/tex/06_example.tex', 'work/c6/autotools_example'),
    ('work/tex/07_catalog.tex', 'work/c7/catalog_of_reusable_solutions'),
    ('work/tex/08_m4.tex', 'work/c8/overview_of_m4_overview'),
    ('work/tex/09_reusing.tex', 'work/c9/reusing_autotools_solutions'),
]

# Main LaTeX document written to work/tex/autotools-guide.tex by
# make_tex_template(). It pulls in the title text and the ten chapter files
# via \input.
#
# Kept as a raw string so every LaTeX backslash is written exactly once.
# The former non-raw literal depended on Python 2 leaving unknown escapes
# such as "\d" or "\p" untouched and does not even compile on Python 3
# ("\usepackage" starts a \uXXXX escape there). The string value is unchanged.
_tex_template = r'''
\documentclass[10pt,titlepage=true,BCOR=10mm,DIV=10]{scrbook}
%BCOR 10mm for ring binding
%increased DIV to enlage page area (and reduce pages numbers)
%be aware that reading long lines is not that easy for the eye
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage{hyperref}
\usepackage[sc]{mathpazo} %palatino
\linespread{1.05} %palatino wider lines spacing
\usepackage{listings}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage[absolute]{textpos}
\pretolerance=150
\tolerance=150
\setlength{\emergencystretch}{3em}
\hypersetup{
    colorlinks=true,
    linkcolor=black,
    urlcolor=black
}
\lstset{
    breaklines=true,
    breakatwhitespace=true,
    basicstyle=\ttfamily\small
}
\setkomafont{disposition}{\rmfamily\itshape}

\author{John Calcote}
\title{Autotools: A Practical Guide To GNU Autoconf, Automake, And Libtool}
\date{July 2010}

\begin{document}
\pagenumbering{roman}
\thispagestyle{empty}

\begin{titlepage}
    \begin{textblock}{297}[0,0](0,0)
        \includegraphics[width=\paperwidth,height=\paperheight]{img/autotools_big_scaled.png}
    \end{textblock}

    \vspace*{1em}

    \clearpage

    \input{titletext.tex}

    \vfill

    \noindent Warning: This book was generated from various unreviewed work-in-progress online sources
    and may contain minor incorrectnesses.
    The real printed book, available from No Starch Press (\url{http://www.nostarch.com/autotools.htm}) is
    technically reviewed and completed. Consider buying.

    \clearpage
    \thispagestyle{empty}
\end{titlepage}


\maketitle
\setcounter{page}{1}
\tableofcontents
\listoffigures
\pagestyle{empty}
\cleardoublepage
\pagenumbering{arabic}
\pagestyle{plain}

\input{00_foreword.tex}

\input{01_introduction.tex}

\input{02_gnu_coding_standards.tex}

\input{03_configure.tex}

\input{04_automake.tex}

\input{05_shared_libs.tex}

\input{06_example.tex}

\input{07_catalog.tex}

\input{08_m4.tex}

\input{09_reusing.tex}

\end{document}

'''

def mkpath(path):
    '''Create the directory `path`, including missing parents.

    Does nothing when something already exists at `path`.
    '''
    already_present = os.path.exists(path)
    if already_present:
        return
    os.makedirs(path)

def fetch_sources():
    '''Download every entry of `_sources` unless a `work` directory exists.

    Each URL is stored in its target directory under the last path
    component of the URL. An existing `work` directory is taken as "already
    fetched" and nothing is downloaded.
    '''
    if os.path.exists('work'):
        print('work directory exists; delete it to make me re-fetch book data')
        return

    print('fetching book data...')
    for path, url in _sources:
        mkpath(path)
        print('downloading %s' % url)
        response = urllib2.urlopen(url)
        try:
            content = response.read()
        finally:
            # release the connection even if reading fails
            response.close()
        outfilename = '%s/%s' % (path, url.split('/')[-1])
        print('  storing at %s' % outfilename)
        # Binary mode: the manifest contains PNG images, which text mode
        # would corrupt on platforms that translate line endings.
        with open(outfilename, 'wb') as outfile:
            outfile.write(content)
    return

def make_tex_template():
    '''Write `_tex_template` to work/tex/autotools-guide.tex.

    The work/tex directory is created first if it is missing.
    '''
    print('writing tex template')
    mkpath('work/tex')
    # `with` closes the file even when the write raises
    with open('work/tex/autotools-guide.tex', 'w') as template:
        template.write(_tex_template)
    return

def scale_title_image():
    '''Create the enlarged, grayscale cover image with ImageMagick.

    Runs `convert` inside the work directory to turn t/autotools_big.png
    into t/autotools_big_scaled.png. A failing command is reported but does
    not abort the run.
    '''
    print('scaling title image')
    # "\\%" hands the same backslash-percent to the shell as before, without
    # relying on Python keeping the unknown escape "\%" intact.
    status = os.system('cd work && convert -resize 400\\% -colorspace Gray t/autotools_big.png t/autotools_big_scaled.png')
    if status != 0:
        print('warning: scaling the title image failed (exit status %s)' % status)
    return

def read_chapter_titles():
    '''Append the title of every numbered chapter to `_chapter_names`.

    Reads the html source of each `_tex_files` entry from index 2 on and
    takes the title from its <title> element, stripping a leading
    "Chapter N: " or "Appendix X: " prefix when present.

    Exactly one name is appended per file. A file without a usable <title>
    gets its file name as title, so the names stay aligned with the files
    (insert_chapters() looks them up by position).
    '''
    print('extracting chapter names')
    # Tried in order; the first pattern that matches supplies the title.
    title_patterns = (
        r"<title>Chapter \d+: ([^<]+)</title>",
        r"<title>Appendix .+: ([^<]+)</title>",
        r"<title>([^<]+)</title>",
    )
    for _, src in _tex_files[2:]:
        with open(src, 'r') as infile:
            content = infile.read()

        for pattern in title_patterns:
            found = re.search(pattern, content)
            if found:
                _chapter_names.append(found.group(1).strip())
                break
        else:
            # No <title> at all: keep the list aligned instead of shifting
            # every following chapter name by one.
            fallback = os.path.basename(src)
            print('  no <title> found in %s; using %s as chapter name' % (src, fallback))
            _chapter_names.append(fallback)

    return


def cutncopy_html_bodies(tex_files=None):
    '''Copy the article body of every html source to its tex destination.

    tex_files -- optional list of (destination, source) path pairs;
                 defaults to the module-level `_tex_files`.

    Everything up to and including a known "body starts here" marker and
    everything from a known "body ends here" marker on is dropped. Markers
    that are not found are ignored, so unknown pages are copied unchanged.
    '''
    if tex_files is None:
        tex_files = _tex_files

    fsm_body_open = '<div class="content">'
    fsm_body_close1 = '<div class="book-navigation">'
    fsm_body_close2 = '<h3>Source archive</h3>'

    nost_body_open = 'Download the source code from the book</a></li>\n</ul>'
    nost_body_close = '<div><hr class="separator"><a name="toc">'

    for dest, src in tex_files:
        with open(src, 'r') as infile:
            content = infile.read()

        pos = content.find(fsm_body_open)
        if pos != -1:  # we have a freesoftwaremagazine file
            content = content[pos + len(fsm_body_open):]  # cut header garbage
            for close_marker in (fsm_body_close1, fsm_body_close2):
                pos = content.find(close_marker)
                if pos != -1:
                    content = content[:pos]  # cut footer garbage

        pos = content.find(nost_body_open)
        if pos != -1:  # we have the nostarch titlepage
            content = content[pos + len(nost_body_open):]  # cut header garbage
            pos = content.find(nost_body_close)
            # Only cut when the marker exists: slicing with -1 would
            # silently drop the last character of the page.
            if pos != -1:
                content = content[:pos]  # cut footer garbage

        with open(dest, 'w') as outfile:
            outfile.write(content)
    return

def regex_transformations(content, transforms=None):
    '''Apply a sequence of regex substitutions to `content` and return it.

    content    -- text to convert
    transforms -- optional list of (pattern, replacement) pairs, applied in
                  order; defaults to the module-level `_regex_transforms`.

    Every pair is applied to the result of the previous one, so later rules
    see the output of earlier rules.
    '''
    print('applying regex patterns')
    if transforms is None:
        transforms = _regex_transforms
    for pattern, replacement in transforms:
        content = re.sub(pattern, replacement, content)
    return content

def context_sensitive_latex_escaping(content):
    '''Escape LaTeX special characters outside of lstlisting bodies.

    In all text except the body of a lstlisting environment, "_", "$" and
    "#" get a backslash prefix and "..." becomes "\\ldots ". A
    "[caption=...]" option directly after \\begin{lstlisting} is escaped
    too. A listing without a closing \\end{lstlisting} is treated as normal
    text up to the end of `content`.

    Returns the escaped text.
    '''
    print('escaping _ $ and # in non-lstlisting environments')

    lst_start_token = "\\begin{lstlisting}"
    lst_end_token = "\\end{lstlisting}"
    caption_start_token = "[caption="
    caption_end_token = "]"

    #second pass lstlisting/ _ $ and # escaping
    sections = []  # (text, escape?) pairs in document order
    start = 0
    while True:
        found_start = content.find(lst_start_token, start)
        if found_start == -1:  # no more listing environs
            sections.append((content[start:], True))
            break
        non_escape_start = found_start + len(lst_start_token)

        #we have an additional caption that has to be escaped
        # (startswith with an offset avoids copying the remaining text)
        if content.startswith(caption_start_token, non_escape_start):
            found_caption_end = content.find(caption_end_token, non_escape_start)
            if found_caption_end != -1:
                non_escape_start = found_caption_end + len(caption_end_token)

        found_end = content.find(lst_end_token, non_escape_start)
        if found_end == -1:  # unterminated listing: escape the rest
            sections.append((content[start:], True))
            break

        sections.append((content[start:non_escape_start], True))
        sections.append((content[non_escape_start:found_end], False))

        # The end token becomes the head of the next escaped section; it
        # contains none of the characters that get escaped.
        start = found_end

    escaped = []
    for section, do_escape in sections:
        if do_escape:
            section = section.replace("_", "\\_")
            section = section.replace("$", "\\$")
            section = section.replace("#", "\\#")
            section = section.replace("...", "\\ldots ")
        escaped.append(section)

    return "".join(escaped)

def ref_label_cleanup(content):
    '''Normalise the keys inside every \\ref{...} and \\label{...}.

    Within each key, backslashes are removed and "_" and "." are replaced
    by "-". A \\ref{ or \\label{ without a closing brace ends the scan for
    that command. Returns the cleaned text.
    '''
    print('cleaning labels and refs')

    for token in ["\\ref{", "\\label{"]:
        start = 0
        while True:
            found_start = content.find(token, start)
            if found_start == -1:
                break

            key_start = found_start + len(token)
            found_end = content.find("}", key_start)
            if found_end == -1:
                break

            cleaned = content[key_start:found_end]
            cleaned = cleaned.replace("\\", "")
            cleaned = cleaned.replace("_", "-")
            cleaned = cleaned.replace(".", "-")
            content = '%s%s%s' % (content[:key_start], cleaned, content[found_end:])

            # Resume at the closing brace of the *rewritten* text. The key
            # may have become shorter, so the old brace index could already
            # lie behind the start of the next command.
            start = key_start + len(cleaned)
    return content

def includegraphics_cleanup(content):
    '''Remove backslashes from the file names of included graphics.

    Only commands of the exact form
    \\includegraphics[width=\\textwidth]{...} are touched; the earlier
    escaping pass may have put a backslash before "_" in their file names.
    A command without a closing brace ends the scan. Returns the cleaned
    text.
    '''
    print('cleaning includegraphics')

    token = "\\includegraphics[width=\\textwidth]{"
    start = 0
    while True:
        found_start = content.find(token, start)
        if found_start == -1:
            break

        name_start = found_start + len(token)
        found_end = content.find("}", name_start)
        if found_end == -1:
            break

        cleaned = content[name_start:found_end].replace("\\", "")
        content = '%s%s%s' % (content[:name_start], cleaned, content[found_end:])

        # Resume at the closing brace of the *rewritten* text. The name may
        # have become shorter, so the old brace index could already lie
        # behind the start of the next command.
        start = name_start + len(cleaned)

    return content

# Sectioning commands ordered from the outermost (index 0) to the innermost
# level; the index is the "level" used by fix_sectioning().
_level_tokens = ["\\chapter", "\\section", "\\subsection", "\\subsubsection"]
def fix_sectioning(content):
    '''Repair heading levels that skip a level.

    Scans `content` for the commands in `_level_tokens` in document order.
    A heading may stay on the expected level, go up to any outer level or
    go exactly one level deeper. A heading that goes deeper by more than
    one level is rewritten to the next deeper level; following headings are
    rewritten to that level as well until a heading on an outer level
    brings the document back in sync.

    The scan runs until no sectioning command is left. Returns the
    corrected text.
    '''
    print("fixing guessable sectioning errors")
    current_level = 0           # level last seen in the source
    current_expected_level = 0  # level the output is currently on

    start = 0
    while True:
        # locate the nearest sectioning command at or after `start`
        next_pos = -1
        next_level = 0
        for level, token in enumerate(_level_tokens):
            pos = content.find(token, start)
            if pos != -1 and (next_pos == -1 or pos < next_pos):
                next_pos = pos
                next_level = level
        # Stop only when *no* command is left. Testing just the result of
        # the last search (\subsubsection) would end the scan too early.
        if next_pos == -1:
            break

        #no level change
        if next_level == current_expected_level:
            start = next_pos + len(_level_tokens[next_level])
            continue

        #moving to an outer level
        if next_level < current_expected_level:
            current_expected_level = next_level
            current_level = current_expected_level
            start = next_pos + len(_level_tokens[next_level])
            continue

        in_sync = current_level == current_expected_level

        #moving exactly one level deeper
        if in_sync and next_level == current_expected_level + 1:
            current_expected_level = next_level
            current_level = current_expected_level
            start = next_pos + len(_level_tokens[next_level])
            continue

        #there is a jump in the levels
        if in_sync:
            current_expected_level += 1
            current_level = next_level

        #now fix broken sectioning
        # (the two levels always differ here: either they just diverged in
        # the jump above or they were already out of sync)
        content = "".join([
            content[:next_pos],
            content[next_pos:].replace(_level_tokens[next_level], _level_tokens[current_expected_level], 1)])

        start = next_pos + len(_level_tokens[current_expected_level])

    return content


def html_to_latex():
    '''Convert every destination file of `_tex_files` from html to LaTeX.

    Each file is read, run through the conversion passes in a fixed order
    and written back in place.
    '''
    print('begin of html->latex conversion')
    # Order matters: the cleanup passes work on the LaTeX commands created
    # by the regex pass and undo escaping done by the escaping pass.
    passes = (
        regex_transformations,
        context_sensitive_latex_escaping,
        ref_label_cleanup,
        includegraphics_cleanup,
        fix_sectioning,
    )
    for dest, _ in _tex_files:
        with open(dest, 'r') as infile:
            content = infile.read()

        for convert in passes:
            content = convert(content)

        with open(dest, 'w') as outfile:
            outfile.write(content)
    return

def insert_chapters():
    '''Prepend a \\chapter heading and label to every chapter file.

    Handles all `_tex_files` destinations except the title text. The n-th
    file (counting from 0) gets the n-th entry of `_chapter_names` as title
    and the label "chap-n".

    Raises IndexError when `_chapter_names` has fewer entries than there
    are chapter files.
    '''
    print('inserting chapter titles')

    for count, (dest, _) in enumerate(_tex_files[1:]):
        with open(dest, 'r') as infile:
            content = infile.read()

        content = '\\chapter{%s}\n\\label{chap-%d}\n\n%s' % (_chapter_names[count], count, content)

        with open(dest, 'w') as outfile:
            outfile.write(content)
    return

def compile_pdflatex():
    '''Run pdflatex twice on autotools-guide.tex inside work/tex.

    The second run only starts when the first one succeeds. A failing
    command is reported but does not abort the run.
    '''
    print('compiling twice with pdflatex')
    status = os.system('cd work/tex && pdflatex autotools-guide.tex && pdflatex autotools-guide.tex')
    if status != 0:
        print('warning: pdflatex failed (exit status %s)' % status)
    return

def copy_images_to_tex():
    '''Copy all .png files from the download folders to work/tex/img.

    The folders are taken from `_sources`. Several entries share a folder,
    so each folder is processed only once.
    '''
    mkpath('work/tex/img')
    visited = set()
    for folder, _ in _sources:
        if folder in visited:
            continue
        visited.add(folder)
        for entry in os.listdir(folder):
            if entry.endswith('.png'):
                shutil.copy('%s/%s' % (folder, entry), 'work/tex/img/%s' % (entry))
    return

def main():
    '''Run the whole pipeline: download, convert to LaTeX, build the pdf.

    The steps depend on each other and run in this fixed order. Chapter
    headings are inserted before the html->latex conversion, so the
    conversion passes also process the inserted headings.
    '''
    fetch_sources()
    make_tex_template()
    scale_title_image() #needs convert/imagemagick
    copy_images_to_tex()
    read_chapter_titles()
    cutncopy_html_bodies()
    insert_chapters()
    html_to_latex()
    compile_pdflatex() #needs pdflatex with koma-script and texlive-fontsrecommended (mathpazo/palatino)

    print("\nI'm done.\nIf everything went well, there should now be a file called autotools-guide.pdf in work/tex/")
    return

# Script entry point. The call to main() is intentionally left disabled:
# running the file as a script only raises, in line with the license note
# at the top of the file that withholds permission to execute it.
if __name__=="__main__":
    #main()
    raise NotImplementedError("Do you really wan't to chose the dark side?")