Process Index Light Novels

#!/usr/bin/python
import codecs, glob, os, re, subprocess, sys

# Command-line IrfanView executable, pre-quoted because the path contains
# spaces and is later spliced into a single command string.
irfanview_bin_path = r'"C:\Program Files (x86)\IrfanView\i_view32.exe"'

# Pieces shared by both generated XHTML pages: everything before the
# <title> element, and everything from the stylesheet link to <body>.
_xhtml_prologue = (
    u'<?xml version="1.0" encoding="utf-8" standalone="no"?>\n'
    u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n'
    u'  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
    u'\n'
    u'<html xmlns="http://www.w3.org/1999/xhtml">\n'
    u'<head>\n'
)
_xhtml_body_open = (
    u'  <link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />\n'
    u'</head>\n'
    u'\n'
    u'<body>\n'
)

# Cover page: the image file name is inserted between top and bottom.
cover_file_top = (
    _xhtml_prologue
    + u'  <title>Cover</title>\n'
    + _xhtml_body_open
    + u'  <p class="cover"><img alt="" class="imagefit" src="../Images/'
)
cover_file_bottom = (
    u'" /></p>\n'
    u'</body>\n'
    u'</html>\n'
)

# Illustrations page: one insert_front + name + insert_back paragraph per
# image goes between top and bottom.
illustrations_file_top = (
    _xhtml_prologue
    + u'  <title>Illustrations</title>\n'
    + _xhtml_body_open
)
illustrations_file_bottom = (
    u'</body>\n'
    u'</html>\n'
)
illustrations_insert_front = u'<p class="illustration"><img alt="" class="imagefit" src="../Images/'
illustrations_insert_back = u'" /></p>\n'

# Stylesheet written next to the converted pages (no trailing newline).
stylesheet_file = (
    u'.cover {\n'
    u'    display: block\n'
    u'    }\n'
    u'.illustration {\n'
    u'    display: block\n'
    u'    }\n'
    u'.imagefit {\n'
    u'    max-height: 100%;\n'
    u'    max-width: 100%\n'
    u'    }'
)

# Ask once, up front, whether output folders left by an earlier run may be
# emptied.  The answer is trimmed and upper-cased so the later
# `delete == "Y"` / `delete == "N"` checks always see exactly one of the
# two accepted values.
delete = raw_input("Overwrite existing converted files? (Y/N) ").strip().upper()
# An exact membership test replaces the old pattern r'Y|N$', which parsed
# as "starts with Y" OR "is N" and therefore accepted answers like "Yes".
if delete not in ("Y", "N"):
    sys.stderr.write("ERROR: Input not valid! Press Enter to exit.")
    # Pause so the message stays visible before the console closes.
    raw_input()
    sys.exit(1)

# Process all files by directory
for i in range(1, len(sys.argv)):
    # Process passed argument
    first_pass = True
    if os.path.isdir(sys.argv[i]):
        this_dir = sys.argv[i]
        # Process all files by directory, if any, and recurse subdir
        for dirpath, dirnames, filenames in os.walk(this_dir):
            if first_pass:
                # First pass
                sys.stderr.write("Processing folder " + dirpath + "\n")
                first_pass = False
                # Create converted HTML dir
                try:
                    os.mkdir(os.path.normpath(dirpath + os.sep + "Converted HTML"))
                except:
                    if delete == "N":
                        sys.exit(1)
                    elif delete == "Y":
                        old_files = glob.glob(os.path.normpath(dirpath + os.sep + "Converted HTML" + os.sep + '*'))
                        if old_files:
                            for f in old_files:
                                os.unlink(f)
                # Create stylesheet
                sys.stderr.write("Generating CSS stylesheet file\n")
                output = open(os.path.normpath(dirpath + os.sep + "Converted HTML" + os.sep + "stylesheet.css"), 'w')
                output.write(stylesheet_file)
                output.close()
                for file in filenames:
                    if os.path.splitext(file)[1] == ".htm":
                        # Process HTML file(s)
                        sys.stderr.write("Formatting HTML file " + file + "\n")
                        if re.search('(?<=Toaru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)', file):
                            name_match = re.search('(?<=Toaru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)', file)
                            file_name = name_match.group(1)
                        elif re.search('(?<=To Aru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)', file):
                            name_match = re.search('(?<=To Aru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)', file)
                            file_name = name_match.group(1)
                        file_name = re.sub(r'NT Volume\d+ |SS Norse Mythology |RailgunSS\d+ |Volume\d+ |Volume SP |SSVolume\d+ ', '', file_name)
                        if re.search('(Chapter|Main|Sub|Period)[\s\.]?(\d+)', file_name):
                            chapter_match = re.search('(Chapter|Main|Sub|Period)[\s\.]?(\d+)', file_name)
                            name = chapter_match.group(1)
                            number = chapter_match.group(2)
                            file_name = name + "_" + number.zfill(2)
                        file_name = os.path.splitext(file_name)[0] + ".xhtml"
                        input = codecs.open(os.path.normpath(dirpath + os.sep + file), 'r', 'utf-8')
                        output = codecs.open(os.path.normpath(dirpath + os.sep + "Converted HTML" + os.sep + file_name), 'w', 'utf-8')
                        # Read in file and apply regexs
                        entire_file = input.read()
                        input.close()
                        # Strip out <head>
                        pattern = re.compile(u'(?<=<head>\n).+(?=</head>)', re.DOTALL)
                        entire_file = re.sub(pattern, u'  <title>' + unicode(os.path.splitext(file_name)[0]) + u'</title>\n  <link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />\n', entire_file)
                        # Fix stray References header level
                        entire_file = re.sub(ur'<h2>[^\n]*(<span class="mw-headline" id="References">\s*References\s*</span>)</h2>', u'<h3>\g<1></h3>', entire_file)
                        # Fix malformatted References header
                        entire_file = re.sub(ur'<h2>[^\n]*<span class="mw-headline" id="refs">\s*refs\s*</span></h2>', u'<h3><span class="mw-headline" id="References">References</span></h3>', entire_file)
                        # Fix stray Notes header level
                        entire_file = re.sub(ur'<h2>[^\n]*(<span class="mw-headline" id="Notes">\s*Notes\s*</span>)</h2>', u'<h3>\g<1></h3>', entire_file)
                        # Strip out body before chapter header
                        pattern = re.compile(u'\s+<div id="mw-page-base" class="noprint"></div>.+(<!-- bodytext -->\s+|</table>\s+|[^\n]+Please read[^\n]+Template:PREVIEW.+)(?=<h2>\s?<span class="mw-headline")', re.DOTALL)
                        entire_file = re.sub(pattern, u'\n', entire_file)
                        pattern = re.compile(u'\s+<div id="mw-page-base" class="noprint"></div>.+(<!-- bodytext -->\s+|</table>\s+|[^\n]+Please read[^\n]+Template:PREVIEW.+)([^\n]*<h2>)<span class="editsection">[^\n]+(?=<span class="mw-headline")', re.DOTALL)
                        entire_file = re.sub(pattern, u'\n\g<2>', entire_file)
                        pattern = re.compile(u'\s+<div id="mw-page-base" class="noprint"></div>.+<!-- bodytext -->', re.DOTALL)
                        entire_file = re.sub(pattern, u'\n<h2 class="mw-headline" id="' + unicode(os.path.splitext(file_name)[0]) + u'">' + unicode(os.path.splitext(file_name)[0]) + u'</h2>', entire_file)
                        # Strip out edit sections
                        entire_file = re.sub(ur'<span class="editsection">\[<a href="[^"]+" title="[^"]+">edit</a>]</span>', u'', entire_file)
                        # Strip out bottom of file
                        pattern = re.compile(u'(<table class="wikitable"|<table border="1").+(?=</body>)', re.DOTALL)
                        entire_file = re.sub(pattern, u'', entire_file)
                        pattern = re.compile(u'<!--\s*NewPP limit report.+(?=</body>)', re.DOTALL)
                        entire_file = re.sub(pattern, u'', entire_file)
                        entire_file = re.sub(ur'<!-- Cached \d+ -->', u'', entire_file)
                        # Strip colons out of id for NCNAME spec
                        id_matches = re.findall(ur'(id="[^"]+")', entire_file)
                        if len(id_matches) > 0:
                            for id in id_matches:
                                strip_invalid = re.sub(ur':', u'', id)
                                entire_file = re.sub(re.escape(unicode(id)), unicode(strip_invalid), entire_file)
                        # Purge illustration blocks (already have illustrations)
                        if re.search(u'<h2>\s*<span class="mw-headline" id="Illustrations">\s*Illustrations\s*</span></h2>\n', entire_file):
                            # Remove gallery blocks (don't convert)
                            pattern = re.compile(u'<h2>\s*<span class="mw-headline" id="Illustrations">\s*Illustrations\s*</span></h2>\s+<ul class="gallery">(\s*<li[^\n]+\n|\s*<div[^\n]+\n|\s*<p[^\n]+\n|\s*</li[^\n]+\n|\s*</div[^\n]+\n|\s*</p[^\n]+\n)+</ul>', re.DOTALL)
                            entire_file = re.sub(pattern, u'', entire_file)
                        # Convert <img>
                        entire_file = re.sub(ur'<div class="thumb[^\n]+File:([^"]+)[^\n]+</div></div></div></div>', u'<p class="illustration"><img alt="" class="imagefit" src="../Images/\g<1>" /></p>', entire_file)
                        # Convert <img> in <ul class="gallery">
                        entire_file = re.sub(ur'\s+<ul class=gallery>\n|</ul>', u'', entire_file)
                        entire_file = re.sub(ur'\s+<li class="gallerybox"[^\n]+\n[^\n]+File:([^"]+)[^\n]+\s+<div class="gallerytext">\s+</div>\s+</div></li>', u'<p class="illustration"><img alt="" class="imagefit" src="../Images/\g<1>" /></p>', entire_file)
                        # Write out conversion to file
                        output.write(entire_file)
                        output.close()
                    elif os.path.splitext(file)[1] == ".png":
                        # Generate cover
                        sys.stderr.write("Generating cover page for " + file + "\n")
                        cover_file = cover_file_top + unicode(file) + cover_file_bottom
                        output = codecs.open(os.path.normpath(dirpath + os.sep + "Converted HTML" + os.sep + "Cover.xhtml"), 'w', 'utf-8')
                        output.write(cover_file)
                        output.close()
            elif os.path.basename(dirpath) == "Images":
                # Generate illustrations
                sys.stderr.write("Generating illustrations page for " + os.path.dirname(dirpath) + "\n")
                sorted_images = sorted(filenames)
                image_check = [x for x in sorted_images if re.match(ur'\w+[_\-](000|008)\w*\.\w+', x)]
                sys.stderr.write(', '.join(image_check) + "\n")
                illustrations_file = illustrations_file_top
                if len(image_check) == 2:
                    # Grab images _000.jpg through _008.jpg
                    for image in sorted_images:
                        illustrations_file = illustrations_file + illustrations_insert_front + unicode(image) + illustrations_insert_back
                        if re.match(ur'\w+[_\-]008\w*\.\w+', image):
                            break
                else:
                    # Grab all images
                    for image in sorted_images:
                        illustrations_file = illustrations_file + illustrations_insert_front + unicode(image) + illustrations_insert_back
                illustrations_file = illustrations_file + illustrations_file_bottom
                output = open(os.path.normpath(os.path.dirname(dirpath) + os.sep + "Converted HTML" + os.sep + "Illustrations.xhtml"), 'w')
                output.write(illustrations_file)
                output.close()
                # Resize images
                sys.stderr.write("Resizing images for " + os.path.dirname(dirpath) + "\n")
                try:
                    os.mkdir(os.path.normpath(os.path.dirname(dirpath) + os.sep + "Converted Images"))
                except:
                    if delete == "N":
                        sys.exit(1)
                    elif delete == "Y":
                        old_files = glob.glob(os.path.normpath(os.path.dirname(dirpath) + os.sep + "Converted Images" + os.sep + '*'))
                        if old_files:
                            for f in old_files:
                                os.unlink(f)
                for image in sorted(filenames):
                    commands = [irfanview_bin_path, '"' + os.path.normpath(dirpath + os.sep + image) + '"', "/resize=(1000,1000)", "/resample", "/aspectratio", "/convert=" + '"' + os.path.normpath(os.path.dirname(dirpath) + os.sep + "Converted Images" + os.sep + image) + '"']
                    subprocess.call(" ".join(commands))

# Every folder has been handled: announce it, keep the console window open
# until the user presses Enter, then leave with a success status.
completion_notice = "Conversion complete! Press Enter to exit."
sys.stderr.write(completion_notice)
raw_input()
raise SystemExit(0)