Advertisement
AgvirtheSilent

Process Index Light Novels

Apr 15th, 2012
137
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.20 KB | None | 0 0
  #!/usr/bin/python
"""Convert saved Baka-Tsuki "Toaru Majutsu no Index" pages to cleaned XHTML.

Usage: pass one or more folders on the command line.  After asking whether
existing output may be overwritten, the script does the following for every
folder argument:

* creates (or empties) a "Converted HTML" sub-folder and writes
  ``stylesheet.css`` into it,
* rewrites every ``*.htm`` file found directly in the folder into a cleaned
  ``*.xhtml`` file,
* writes ``Cover.xhtml`` for each ``*.png`` file found directly in the folder
  (they all share that one output name, so the last one processed wins),
* for every sub-folder named "Images", writes ``Illustrations.xhtml`` next to
  the other pages and resizes the images into "Converted Images" by calling
  IrfanView.

The script was written for Python 2.  The two small aliases below only exist
so that the module can also be imported under Python 3.
"""
import codecs
import glob
import os
import re
import subprocess
import sys

try:
    text_type = unicode
    read_line = raw_input
except NameError:
    text_type = str
    read_line = input

# Command used to resize images.  The embedded double quotes are required
# because the command line is assembled as a single string (see
# resize_images).
irfanview_bin_path = r'"C:\Program Files (x86)\IrfanView\i_view32.exe"'

HTML_DIR_NAME = "Converted HTML"
IMAGES_DIR_NAME = "Converted Images"


def _xhtml_top(title):
    """Return the XHTML prolog and <head> section for a page called *title*."""
    return (
        u'<?xml version="1.0" encoding="utf-8" standalone="no"?>\n'
        u'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n'
        u' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
        u'\n'
        u'<html xmlns="http://www.w3.org/1999/xhtml">\n'
        u'<head>\n'
        u' <title>' + title + u'</title>\n'
        u' <link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />\n'
        u'</head>\n'
        u'\n'
        u'<body>\n'
    )


cover_file_top = (
    _xhtml_top(u'Cover')
    + u' <p class="cover"><img alt="" class="imagefit" src="../Images/'
)
cover_file_bottom = (
    u'" /></p>\n'
    u'</body>\n'
    u'</html>\n'
)
illustrations_file_top = _xhtml_top(u'Illustrations')
illustrations_file_bottom = (
    u'</body>\n'
    u'</html>\n'
)
illustrations_insert_front = (
    u'<p class="illustration"><img alt="" class="imagefit" src="../Images/'
)
illustrations_insert_back = u'" /></p>\n'
stylesheet_file = (
    u'.cover {\n'
    u'   display: block\n'
    u'   }\n'
    u'.illustration {\n'
    u'   display: block\n'
    u'   }\n'
    u'.imagefit {\n'
    u'   max-height: 100%;\n'
    u'   max-width: 100%\n'
    u'   }'
)

# --- File-name handling ----------------------------------------------------
_TITLE_PATTERNS = (
    re.compile(r'(?<=Toaru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)'),
    re.compile(r'(?<=To Aru Majutsu no Index )(.+)(?= - Baka-Tsuki.htm)'),
)
_VOLUME_PREFIX = re.compile(
    r'NT Volume\d+ |SS Norse Mythology |RailgunSS\d+ |Volume\d+ |Volume SP |SSVolume\d+ '
)
_CHAPTER = re.compile(r'(Chapter|Main|Sub|Period)[\s\.]?(\d+)')

# --- Page clean-up patterns (applied in this order by convert_page) ---------
_HEAD_BODY = re.compile(u'(?<=<head>\n).+(?=</head>)', re.DOTALL)
_REFERENCES_H2 = re.compile(
    r'<h2>[^\n]*(<span class="mw-headline" id="References">\s*References\s*</span>)</h2>'
)
_REFS_H2 = re.compile(
    r'<h2>[^\n]*<span class="mw-headline" id="refs">\s*refs\s*</span></h2>'
)
_NOTES_H2 = re.compile(
    r'<h2>[^\n]*(<span class="mw-headline" id="Notes">\s*Notes\s*</span>)</h2>'
)
_PAGE_BASE = r'\s+<div id="mw-page-base" class="noprint"></div>.+'
_LEAD_IN = _PAGE_BASE + (
    r'(<!-- bodytext -->\s+|</table>\s+|[^\n]+Please read[^\n]+Template:PREVIEW.+)'
)
_LEAD_IN_TO_HEADLINE = re.compile(
    _LEAD_IN + r'(?=<h2>\s?<span class="mw-headline")', re.DOTALL
)
_LEAD_IN_TO_EDITSECTION = re.compile(
    _LEAD_IN
    + r'([^\n]*<h2>)<span class="editsection">[^\n]+(?=<span class="mw-headline")',
    re.DOTALL,
)
_LEAD_IN_TO_BODYTEXT = re.compile(_PAGE_BASE + r'<!-- bodytext -->', re.DOTALL)
_EDIT_SECTION = re.compile(
    r'<span class="editsection">\[<a href="[^"]+" title="[^"]+">edit</a>]</span>'
)
_TRAILING_TABLES = re.compile(
    r'(<table class="wikitable"|<table border="1").+(?=</body>)', re.DOTALL
)
_LIMIT_REPORT = re.compile(r'<!--\s*NewPP limit report.+(?=</body>)', re.DOTALL)
_CACHED_COMMENT = re.compile(r'<!-- Cached \d+ -->')
_ID_ATTRIBUTE = re.compile(r'(id="[^"]+")')
_ILLUSTRATIONS_H2 = (
    r'<h2>\s*<span class="mw-headline" id="Illustrations">\s*Illustrations\s*</span></h2>'
)
_ILLUSTRATIONS_HEADING = re.compile(_ILLUSTRATIONS_H2 + r'\n')
_ILLUSTRATIONS_GALLERY = re.compile(
    _ILLUSTRATIONS_H2
    + r'\s+<ul class="gallery">(\s*<li[^\n]+\n|\s*<div[^\n]+\n|\s*<p[^\n]+\n'
    r'|\s*</li[^\n]+\n|\s*</div[^\n]+\n|\s*</p[^\n]+\n)+</ul>',
    re.DOTALL,
)
_THUMB = re.compile(
    r'<div class="thumb[^\n]+File:([^"]+)[^\n]+</div></div></div></div>'
)
# The quotes around "gallery" are optional: the original pattern had none,
# while the markup matched by _ILLUSTRATIONS_GALLERY has them.
_GALLERY_TAGS = re.compile(r'\s+<ul class="?gallery"?>\n|</ul>')
_GALLERY_ITEM = re.compile(
    r'\s+<li class="gallerybox"[^\n]+\n[^\n]+File:([^"]+)[^\n]+\s+'
    r'<div class="gallerytext">\s+</div>\s+</div></li>'
)
_ILLUSTRATION_REPL = (
    r'<p class="illustration"><img alt="" class="imagefit" src="../Images/\g<1>" /></p>'
)

# --- Image selection -------------------------------------------------------
_MARKER_IMAGE = re.compile(r'\w+[_\-](000|008)\w*\.\w+')
_LAST_MARKER_IMAGE = re.compile(r'\w+[_\-]008\w*\.\w+')


def is_valid_overwrite_answer(answer):
    """Return True only when *answer* is exactly "Y" or "N"."""
    return answer in ("Y", "N")


def output_name_for(source_name):
    """Return the ``.xhtml`` file name used for the saved page *source_name*.

    The series title and the " - Baka-Tsuki.htm" suffix are removed when
    present, a leading volume tag is dropped, and chapter-like names are
    normalised to e.g. "Chapter_03".  A name that matches neither title
    pattern is used as it is (minus its extension) instead of failing or
    silently reusing the previous file's name.
    """
    title = source_name
    for pattern in _TITLE_PATTERNS:
        found = pattern.search(source_name)
        if found:
            title = found.group(1)
            break
    title = _VOLUME_PREFIX.sub('', title)
    chapter = _CHAPTER.search(title)
    if chapter:
        title = chapter.group(1) + "_" + chapter.group(2).zfill(2)
    return os.path.splitext(title)[0] + ".xhtml"


def convert_page(entire_file, title):
    """Return the saved wiki page *entire_file* cleaned up as a book page.

    *title* is inserted as the page <title> and, when the page has no
    headline of its own, as a generated <h2> headline.  The substitutions
    depend on each other's results and must stay in this order.
    """
    # Replace the contents of <head> with a title and the stylesheet link.
    head = (
        u'  <title>' + title + u'</title>\n'
        u'  <link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css" />\n'
    )
    entire_file = _HEAD_BODY.sub(head, entire_file)
    # Demote stray References/Notes headers and repair the "refs" variant.
    entire_file = _REFERENCES_H2.sub(r'<h3>\g<1></h3>', entire_file)
    entire_file = _REFS_H2.sub(
        u'<h3><span class="mw-headline" id="References">References</span></h3>',
        entire_file,
    )
    entire_file = _NOTES_H2.sub(r'<h3>\g<1></h3>', entire_file)
    # Strip everything between the page base and the first chapter header;
    # the last variant covers pages without one and inserts a headline.
    entire_file = _LEAD_IN_TO_HEADLINE.sub(u'\n', entire_file)
    entire_file = _LEAD_IN_TO_EDITSECTION.sub(u'\n\\g<2>', entire_file)
    entire_file = _LEAD_IN_TO_BODYTEXT.sub(
        u'\n<h2 class="mw-headline" id="' + title + u'">' + title + u'</h2>',
        entire_file,
    )
    # Strip the "[edit]" links.
    entire_file = _EDIT_SECTION.sub(u'', entire_file)
    # Strip the bottom of the page.
    entire_file = _TRAILING_TABLES.sub(u'', entire_file)
    entire_file = _LIMIT_REPORT.sub(u'', entire_file)
    entire_file = _CACHED_COMMENT.sub(u'', entire_file)
    # Remove colons from id attributes (not allowed in an NCName).  Plain
    # string replacement is used so the id text is never treated as a regex
    # replacement template.
    for id_attribute in _ID_ATTRIBUTE.findall(entire_file):
        if u':' in id_attribute:
            entire_file = entire_file.replace(
                id_attribute, id_attribute.replace(u':', u'')
            )
    # Drop the Illustrations gallery; a separate illustrations page exists.
    if _ILLUSTRATIONS_HEADING.search(entire_file):
        entire_file = _ILLUSTRATIONS_GALLERY.sub(u'', entire_file)
    # Convert thumbnails and remaining gallery entries to plain <img>.
    entire_file = _THUMB.sub(_ILLUSTRATION_REPL, entire_file)
    # NOTE(review): this removes every </ul> in the page, not only those
    # closing a gallery - kept as in the original; confirm it is intended.
    entire_file = _GALLERY_TAGS.sub(u'', entire_file)
    entire_file = _GALLERY_ITEM.sub(_ILLUSTRATION_REPL, entire_file)
    return entire_file


def prepare_output_dir(path, delete):
    """Create the folder *path*, or empty it if it already exists.

    An existing folder is only emptied when *delete* is "Y"; otherwise the
    script reports the problem and exits with status 1.
    """
    if not os.path.isdir(path):
        os.mkdir(path)
        return
    if delete != "Y":
        sys.stderr.write(
            "ERROR: " + path + " already exists and overwriting was declined.\n"
        )
        sys.exit(1)
    for old_file in glob.glob(os.path.join(path, '*')):
        os.unlink(old_file)


def convert_html_file(dirpath, source_name, html_dir):
    """Convert the page *source_name* in *dirpath* into a file in *html_dir*."""
    sys.stderr.write("Formatting HTML file " + source_name + "\n")
    target_name = output_name_for(source_name)
    source_path = os.path.normpath(os.path.join(dirpath, source_name))
    with codecs.open(source_path, 'r', 'utf-8') as source:
        entire_file = source.read()
    title = text_type(os.path.splitext(target_name)[0])
    entire_file = convert_page(entire_file, title)
    target_path = os.path.normpath(os.path.join(html_dir, target_name))
    with codecs.open(target_path, 'w', 'utf-8') as output:
        output.write(entire_file)


def write_cover(image_name, html_dir):
    """Write ``Cover.xhtml`` in *html_dir*, showing the image *image_name*."""
    sys.stderr.write("Generating cover page for " + image_name + "\n")
    cover_path = os.path.normpath(os.path.join(html_dir, "Cover.xhtml"))
    with codecs.open(cover_path, 'w', 'utf-8') as output:
        output.write(cover_file_top + text_type(image_name) + cover_file_bottom)


def process_top_dir(dirpath, filenames, delete):
    """Write the stylesheet, chapter pages and cover for the folder *dirpath*."""
    sys.stderr.write("Processing folder " + dirpath + "\n")
    html_dir = os.path.normpath(os.path.join(dirpath, HTML_DIR_NAME))
    prepare_output_dir(html_dir, delete)
    sys.stderr.write("Generating CSS stylesheet file\n")
    with open(os.path.join(html_dir, "stylesheet.css"), 'w') as output:
        output.write(stylesheet_file)
    for name in filenames:
        extension = os.path.splitext(name)[1]
        if extension == ".htm":
            convert_html_file(dirpath, name, html_dir)
        elif extension == ".png":
            write_cover(name, html_dir)


def write_illustrations_page(book_dir, sorted_images):
    """Write ``Illustrations.xhtml`` for *book_dir* from *sorted_images*.

    When exactly two file names carry a 000 or 008 number, only the images
    from the start of the sorted list up to and including the first 008
    image are listed; otherwise every image is listed.
    """
    image_check = [x for x in sorted_images if _MARKER_IMAGE.match(x)]
    sys.stderr.write(', '.join(image_check) + "\n")
    if len(image_check) == 2:
        selected = []
        for image in sorted_images:
            selected.append(image)
            if _LAST_MARKER_IMAGE.match(image):
                break
    else:
        selected = sorted_images
    entries = [
        illustrations_insert_front + text_type(image) + illustrations_insert_back
        for image in selected
    ]
    page = illustrations_file_top + u''.join(entries) + illustrations_file_bottom
    # The page declares utf-8, so it is written through codecs like the
    # cover page instead of relying on the default encoding.
    page_path = os.path.normpath(
        os.path.join(book_dir, HTML_DIR_NAME, "Illustrations.xhtml")
    )
    with codecs.open(page_path, 'w', 'utf-8') as output:
        output.write(page)


def resize_images(images_dir, book_dir, filenames, delete):
    """Resize every image of *images_dir* into "Converted Images" in *book_dir*."""
    target_dir = os.path.normpath(os.path.join(book_dir, IMAGES_DIR_NAME))
    prepare_output_dir(target_dir, delete)
    for image in sorted(filenames):
        source_path = os.path.normpath(os.path.join(images_dir, image))
        target_path = os.path.normpath(os.path.join(target_dir, image))
        commands = [
            irfanview_bin_path,
            '"' + source_path + '"',
            "/resize=(1000,1000)",
            "/resample",
            "/aspectratio",
            "/convert=" + '"' + target_path + '"',
        ]
        # Kept as one pre-quoted command string, as in the original, so
        # that IrfanView receives /convert="..." exactly in this form.
        subprocess.call(" ".join(commands))


def process_images_dir(dirpath, filenames, delete):
    """Build the illustrations page and resized images for an Images folder.

    Output goes to the parent of *dirpath*, which is expected to already
    contain the "Converted HTML" folder.
    """
    book_dir = os.path.dirname(dirpath)
    sys.stderr.write("Generating illustrations page for " + book_dir + "\n")
    write_illustrations_page(book_dir, sorted(filenames))
    sys.stderr.write("Resizing images for " + book_dir + "\n")
    resize_images(dirpath, book_dir, filenames, delete)


def process_tree(top_dir, delete):
    """Process *top_dir* itself and every "Images" folder found below it."""
    first_pass = True
    for dirpath, _dirnames, filenames in os.walk(top_dir):
        if first_pass:
            # os.walk yields the top folder first; only its own files are
            # treated as pages and covers.
            first_pass = False
            process_top_dir(dirpath, filenames, delete)
        elif os.path.basename(dirpath) == "Images":
            process_images_dir(dirpath, filenames, delete)


def main(argv):
    """Ask for the overwrite policy, then process every folder in *argv*."""
    delete = read_line("Overwrite existing converted files? (Y/N) ")
    if not is_valid_overwrite_answer(delete):
        sys.stderr.write("ERROR: Input not valid! Press Enter to exit.")
        read_line()
        sys.exit(1)

    # Arguments that are not folders are ignored.
    for argument in argv[1:]:
        if os.path.isdir(argument):
            process_tree(argument, delete)

    sys.stderr.write("Conversion complete! Press Enter to exit.")
    read_line()
    sys.exit(0)


if __name__ == "__main__":
    main(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement