Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import codecs, glob, os, re, subprocess, sys
# Command used to launch IrfanView.  The double quotes are part of the value
# because the path contains spaces and the value is pasted into a command line
# as-is.  Raw string: the backslashes are path separators, not escapes.
irfanview_bin_path = r'"C:\Program Files (x86)\IrfanView\i_view32.exe"'


def _xhtml_page_top(title):
    """Return the opening of a generated XHTML page titled *title*.

    The result covers the XML prolog, the XHTML 1.1 doctype, the <head>
    (title plus stylesheet link) and the opening <body> line, ending with a
    newline.
    """
    return (
        u'<?xml version="1.0" encoding="utf-8" standalone="no"?>\n'
        u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n'
        u'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
        u'\n'
        u'<html xmlns="http://www.w3.org/1999/xhtml">\n'
        u'<head>\n'
        u'<title>' + title + u'</title>\n'
        u'<link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />\n'
        u'</head>\n'
        u'\n'
        u'<body>\n'
    )


# Cover page: <top> + image file name + <bottom>.
cover_file_top = (
    _xhtml_page_top(u'Cover')
    + u'<p class="cover"><img alt="" class="imagefit" src="../Images/'
)
cover_file_bottom = (
    u'" /></p>\n'
    u'</body>\n'
    u'</html>\n'
)

# Illustrations page: <top> + one insert per image + <bottom>.
illustrations_file_top = _xhtml_page_top(u'Illustrations')
illustrations_file_bottom = (
    u'</body>\n'
    u'</html>\n'
)

# One illustration paragraph: <front> + image file name + <back>.
illustrations_insert_front = u'<p class="illustration"><img alt="" class="imagefit" src="../Images/'
illustrations_insert_back = u'" /></p>\n'

# Contents of the generated stylesheet.css (no trailing newline).
stylesheet_file = (
    u'.cover {\n'
    u'display: block\n'
    u'}\n'
    u'.illustration {\n'
    u'display: block\n'
    u'}\n'
    u'.imagefit {\n'
    u'max-height: 100%;\n'
    u'max-width: 100%\n'
    u'}'
)
# Ask once, up front, whether output folders left by an earlier run may be
# emptied.  The answer is normalised so that "y" / " Y " are accepted too;
# the rest of the script compares `delete` against exactly "Y" or "N".
delete = raw_input("Overwrite existing converted files? (Y/N) ").strip().upper()
# Exact membership test rather than a regex such as 'Y|N$': that alternation
# means "starts with Y" OR "is N", so answers like "Yes" would pass validation
# and then match neither branch later on.
if delete not in ("Y", "N"):
    sys.stderr.write("ERROR: Input not valid! Press Enter to exit.")
    # Keep the console window open until the user has seen the message.
    raw_input()
    sys.exit(1)
def derive_output_name(source_name):
    """Return the ``.xhtml`` file name to write for the saved page *source_name*.

    The series prefix and the " - Baka-Tsuki.htm" suffix are removed when the
    name follows either known spelling of the series title; any other name is
    used as it is, so every ``.htm`` file gets a well-defined output name.  A
    leading volume label is dropped and a "Chapter 3"-style title is
    normalised to "Chapter_03".
    """
    title = source_name
    for series_pattern in (
        '(?<=Toaru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)',
        '(?<=To Aru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)',
    ):
        found = re.search(series_pattern, source_name)
        if found:
            title = found.group(1)
            break
    title = re.sub(
        r'NT Volume\d+ |SS Norse Mythology |RailgunSS\d+ |Volume\d+ |Volume SP |SSVolume\d+ ',
        '', title)
    chapter = re.search(r'(Chapter|Main|Sub|Period)[\s\.]?(\d+)', title)
    if chapter:
        title = chapter.group(1) + "_" + chapter.group(2).zfill(2)
    return os.path.splitext(title)[0] + ".xhtml"


def clean_page_markup(markup, title):
    """Return *markup* (a saved wiki page) reduced to its chapter content.

    *title* becomes the page <title> and, when the page has no headline of
    its own, the text and id of a generated <h2>.  The substitutions run in a
    fixed order because later ones rely on what earlier ones left behind.
    """
    # Replace the content of <head>.  A function is used as the replacement
    # so that the title is inserted literally, never parsed as a template.
    head = (u' <title>' + title + u'</title>\n'
            u' <link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />\n')
    markup = re.compile(r'(?<=<head>\n).+(?=</head>)', re.DOTALL).sub(
        lambda match: head, markup)

    # Demote stray References / Notes headers and repair the "refs" variant.
    markup = re.sub(
        r'<h2>[^\n]*(<span class="mw-headline" id="References">\s*References\s*</span>)</h2>',
        r'<h3>\g<1></h3>', markup)
    markup = re.sub(
        r'<h2>[^\n]*<span class="mw-headline" id="refs">\s*refs\s*</span></h2>',
        u'<h3><span class="mw-headline" id="References">References</span></h3>', markup)
    markup = re.sub(
        r'<h2>[^\n]*(<span class="mw-headline" id="Notes">\s*Notes\s*</span>)</h2>',
        r'<h3>\g<1></h3>', markup)

    # Remove everything between the start of the body and the chapter header.
    # Three page layouts are handled; each pattern is a no-op once an earlier
    # one has already removed the "mw-page-base" marker.
    page_base = r'\s+<div id="mw-page-base" class="noprint"></div>.+'
    preamble_end = (r'(<!-- bodytext -->\s+|</table>\s+'
                    r'|[^\n]+Please read[^\n]+Template:PREVIEW.+)')
    markup = re.compile(
        page_base + preamble_end + r'(?=<h2>\s?<span class="mw-headline")',
        re.DOTALL).sub(u'\n', markup)
    markup = re.compile(
        page_base + preamble_end
        + r'([^\n]*<h2>)<span class="editsection">[^\n]+(?=<span class="mw-headline")',
        re.DOTALL).sub(r'\n\g<2>', markup)
    generated_header = (u'\n<h2 class="mw-headline" id="' + title + u'">'
                        + title + u'</h2>')
    markup = re.compile(page_base + r'<!-- bodytext -->', re.DOTALL).sub(
        lambda match: generated_header, markup)

    # Remove the "[edit]" links next to section headers.
    markup = re.sub(
        r'<span class="editsection">\[<a href="[^"]+" title="[^"]+">edit</a>]</span>',
        u'', markup)

    # Remove everything from the first trailing table / parser report up to
    # (not including) </body>, and the cache marker comment.
    markup = re.compile(
        r'(<table class="wikitable"|<table border="1").+(?=</body>)',
        re.DOTALL).sub(u'', markup)
    markup = re.compile(
        r'<!--\s*NewPP limit report.+(?=</body>)', re.DOTALL).sub(u'', markup)
    markup = re.sub(r'<!-- Cached \d+ -->', u'', markup)

    # Colons are not allowed in ids (NCName); drop them.  Plain string
    # replacement keeps the attribute text from being parsed as a pattern.
    for id_attribute in set(re.findall(r'(id="[^"]+")', markup)):
        if u':' in id_attribute:
            markup = markup.replace(id_attribute, id_attribute.replace(u':', u''))

    # Drop the "Illustrations" section with its gallery: the images get a
    # page of their own.
    if re.search(
            r'<h2>\s*<span class="mw-headline" id="Illustrations">\s*Illustrations\s*</span></h2>\n',
            markup):
        markup = re.compile(
            r'<h2>\s*<span class="mw-headline" id="Illustrations">\s*Illustrations\s*</span></h2>'
            r'\s+<ul class="gallery">'
            r'(\s*<li[^\n]+\n|\s*<div[^\n]+\n|\s*<p[^\n]+\n'
            r'|\s*</li[^\n]+\n|\s*</div[^\n]+\n|\s*</p[^\n]+\n)+</ul>',
            re.DOTALL).sub(u'', markup)

    # Turn thumbnail blocks into plain illustration paragraphs.
    illustration = r'<p class="illustration"><img alt="" class="imagefit" src="../Images/\g<1>" /></p>'
    markup = re.sub(
        r'<div class="thumb[^\n]+File:([^"]+)[^\n]+</div></div></div></div>',
        illustration, markup)
    # Unwrap remaining galleries.  The quotes around the class value are
    # optional so that the quoted form used elsewhere in this file is matched
    # as well as the unquoted one.  NOTE(review): this also removes every
    # other </ul> in the page, as before; confirm that is intended.
    markup = re.sub(r'\s+<ul class="?gallery"?>\n|</ul>', u'', markup)
    markup = re.sub(
        r'\s+<li class="gallerybox"[^\n]+\n[^\n]+File:([^"]+)[^\n]+'
        r'\s+<div class="gallerytext">\s+</div>\s+</div></li>',
        illustration, markup)
    return markup


def build_illustrations_page(image_names, stop_after_008):
    """Return the Illustrations page listing *image_names* in sorted order.

    When *stop_after_008* is true the list ends with the first image whose
    name carries the ``_008`` / ``-008`` marker; otherwise every image is
    included.
    """
    inserts = []
    for image in sorted(image_names):
        inserts.append(illustrations_insert_front + image + illustrations_insert_back)
        if stop_after_008 and re.match(r'\w+[_\-]008\w*\.\w+', image):
            break
    return illustrations_file_top + u''.join(inserts) + illustrations_file_bottom


def prepare_output_dir(path, overwrite):
    """Create output folder *path*, or empty it when it already exists.

    An existing folder is emptied only when *overwrite* is "Y"; with any
    other answer the script reports the problem and exits with status 1.
    Errors from creating or emptying the folder are not swallowed.
    """
    if not os.path.isdir(path):
        os.mkdir(path)
        return
    if overwrite != "Y":
        sys.stderr.write("ERROR: " + path + " already exists and overwriting was declined.\n")
        sys.exit(1)
    for stale in glob.glob(os.path.join(path, '*')):
        # Only files are removed; os.unlink() cannot remove a sub-folder.
        if os.path.isfile(stale):
            os.unlink(stale)


def convert_page(folder, source_name, html_dir):
    """Convert the saved page *source_name* in *folder* into *html_dir*."""
    sys.stderr.write("Formatting HTML file " + source_name + "\n")
    target_name = derive_output_name(source_name)
    with codecs.open(os.path.normpath(os.path.join(folder, source_name)), 'r', 'utf-8') as source:
        markup = source.read()
    cleaned = clean_page_markup(markup, os.path.splitext(target_name)[0])
    with codecs.open(os.path.normpath(os.path.join(html_dir, target_name)), 'w', 'utf-8') as target:
        target.write(cleaned)


def process_book_folder(folder, filenames):
    """Handle the top-level folder: stylesheet, chapter pages and cover page."""
    sys.stderr.write("Processing folder " + folder + "\n")
    html_dir = os.path.normpath(os.path.join(folder, "Converted HTML"))
    prepare_output_dir(html_dir, delete)

    sys.stderr.write("Generating CSS stylesheet file\n")
    with open(os.path.join(html_dir, "stylesheet.css"), 'w') as sheet:
        sheet.write(stylesheet_file)

    for name in filenames:
        extension = os.path.splitext(name)[1]
        if extension == ".htm":
            convert_page(folder, name, html_dir)
        elif extension == ".png":
            # Each .png overwrites Cover.xhtml, so the last one listed wins.
            sys.stderr.write("Generating cover page for " + name + "\n")
            with codecs.open(os.path.join(html_dir, "Cover.xhtml"), 'w', 'utf-8') as page:
                page.write(cover_file_top + name + cover_file_bottom)


def process_images_folder(images_dir, filenames):
    """Handle an "Images" sub-folder: illustrations page and resized copies.

    Both outputs go next to the folder, i.e. into the parent's
    "Converted HTML" and "Converted Images" folders.
    """
    book_dir = os.path.dirname(images_dir)

    sys.stderr.write("Generating illustrations page for " + book_dir + "\n")
    markers = [name for name in sorted(filenames)
               if re.match(r'\w+[_\-](000|008)\w*\.\w+', name)]
    sys.stderr.write(', '.join(markers) + "\n")
    # Exactly two marker images (_000 and _008) means only the images up to
    # _008 are listed; otherwise all images are.
    page_text = build_illustrations_page(filenames, len(markers) == 2)
    page_path = os.path.normpath(
        os.path.join(book_dir, "Converted HTML", "Illustrations.xhtml"))
    # Written as UTF-8, which is the encoding the page declares.
    with codecs.open(page_path, 'w', 'utf-8') as page:
        page.write(page_text)

    sys.stderr.write("Resizing images for " + book_dir + "\n")
    resized_dir = os.path.normpath(os.path.join(book_dir, "Converted Images"))
    prepare_output_dir(resized_dir, delete)
    for image in sorted(filenames):
        commands = [
            irfanview_bin_path,
            '"' + os.path.normpath(os.path.join(images_dir, image)) + '"',
            "/resize=(1000,1000)",
            "/resample",
            "/aspectratio",
            "/convert=" + '"' + os.path.normpath(os.path.join(resized_dir, image)) + '"',
        ]
        # Passed as one pre-quoted command-line string on purpose: only the
        # path part of /convert="..." is quoted, which a list argument would
        # not reproduce.
        subprocess.call(" ".join(commands))


def main():
    """Convert every folder named on the command line, then wait for Enter.

    Arguments that are not folders are ignored.  For each folder the
    top-level files are processed first; below it only sub-folders named
    "Images" are processed.
    """
    for target in sys.argv[1:]:
        if not os.path.isdir(target):
            continue
        at_top_level = True
        for dirpath, dirnames, filenames in os.walk(target):
            if at_top_level:
                at_top_level = False
                process_book_folder(dirpath, filenames)
            elif os.path.basename(dirpath) == "Images":
                process_images_folder(dirpath, filenames)
    sys.stderr.write("Conversion complete! Press Enter to exit.")
    raw_input()
    sys.exit(0)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement