tigsource devlog archiver

a guest
Jul 29th, 2014
Python
# Processes the tigsource printpage html file,
# grabs all the images and dumps out a new html file.
# Written for Python 2 (it uses HTMLParser, urlparse and urllib from the 2.x stdlib).

from __future__ import print_function
import os, sys, shutil, re, math
from datetime import date
from HTMLParser import HTMLParser
from urlparse import urlparse
import urllib
import uuid
from bs4 import BeautifulSoup

IMAGE_EXTENSIONS = [".png", ".gif", ".jpg", ".jpeg"]
VALID_URL_CHARS = r"A-Za-z0-9\-._~:/?#\[\]@!$&'*+,;="

# Create a subclass and override the handler methods. The printpage shows
# inline links and images as bare URLs wrapped in parentheses, so handle_data()
# scans the text for "(...)" spans and sorts any http/https URLs it finds into
# images and plain links.
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        # Instance attributes, so state isn't shared between parser instances.
        self.images = []
        self.links = []

    def handle_starttag(self, tag, attrs):
        pass

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        for ref in re.findall(r'\([' + VALID_URL_CHARS + r']*\)', data):
            ref = ref.strip('()')
            url = urlparse(ref)
            if url.scheme == "http" or url.scheme == "https":
                url = url.geturl()
                # Anything ending in a known image extension counts as an image.
                isImage = any(url.endswith(ext) for ext in IMAGE_EXTENSIONS)
                if isImage:
                    if url not in self.images:
                        self.images.append(url)
                elif url not in self.links:
                    self.links.append(url)

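# A minimal illustration of what handle_data() extracts, assuming the printpage
# text contains parenthesised URLs (the sample string is hypothetical):
#
#   >>> re.findall(r'\([' + VALID_URL_CHARS + r']*\)',
#   ...            "New build today (http://example.com/shot.png) enjoy!")
#   ['(http://example.com/shot.png)']
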
def process(file):
    assert os.path.isfile(file), "Input path isn't a file"
    dir_name = "devlog_" + os.path.splitext(file)[0]
    print("Making dir: ", dir_name)
    shutil.rmtree(dir_name, ignore_errors=True)
    try:
        os.mkdir(dir_name)
    except OSError:
        pass

    # Instantiate the parser and feed it the whole printpage HTML.
    parser = MyHTMLParser()
    f = open(file)
    content = f.read()
    f.close()
    parser.feed(content)

    # Turn the parenthesised plain links back into anchor tags.
    for link in parser.links:
        content = content.replace("(" + link + ")", '<a href="%s">%s</a>' % (link, link))

    # Download each image under a unique local name and rewrite its
    # parenthesised URL in the page as an <img> tag pointing at the local copy.
    # TODO: depending on size, shrink the image down and show a thumbnail
    count = 0
    total = len(parser.images)
    for image in parser.images:
        count = count + 1
        print("image %d/%d" % (count, total))
        ext = image.rsplit(".", 1)[-1]
        newname = str(uuid.uuid4()) + "." + ext
        urllib.urlretrieve(image, dir_name + "/" + newname)
        content = content.replace("(" + image + ")", '<img src="%s"/>' % (newname))

    # Now split it all up: the printpage separates posts with <hr size="2">
    # rules, so each post is the run of elements between two of those tags.
    soup = BeautifulSoup(content, "html.parser")
    pages = []
    hrtags = soup.find_all('hr', attrs={'size': re.compile("2")})

    def next_element(elem):
        # Find the next sibling element, skipping NavigableString objects.
        while elem is not None:
            elem = elem.next_sibling
            if hasattr(elem, 'name'):
                return elem

    for hrtag in hrtags:
        page = [str(hrtag)]
        elem = next_element(hrtag)
        while elem:
            if elem.name == 'hr' and 'size' in elem.attrs:
                break
            page.append(str(elem))
            elem = next_element(elem)
        pages.append(''.join(page))

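    # A small illustration of the split above, on hypothetical input: a body like
    #
    #   <hr size="2"/><div>first post</div><hr size="2"/><div>second post</div>
    #
    # yields two entries in `pages`, each starting with its <hr/> separator and
    # holding the tags that follow it up to the next separator. Bare text nodes
    # sitting directly between separators are skipped by next_element().
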
    # Write the posts out in batches of `skip` per output HTML page.
    i = 0
    skip = 40
    max_pages = len(pages)
    # Round up so the trailing posts still get a page of their own.
    num_pages = max(1, int(math.ceil(max_pages / float(skip))))

    while i < num_pages:
        page = []
        page.append("""
<!DOCTYPE html>
<html>
<head>
<title>Moonman Devlog Archive</title>
<style type="text/css">body{color:black;background-color:white;}body,td,.normaltext{font-family:Verdana,arial,helvetica,serif;font-size:small;}*,a:link,a:visited,a:hover,a:active{color:black!important;}table{empty-cells:show;}.code{font-size:x-small;font-family:monospace;border:1px solid black;margin:1px;padding:1px;}.quote{font-size:x-small;border:1px solid black;margin:1px;padding:1px;}.smalltext,.quoteheader,.codeheader{font-size:x-small;}.largetext{font-size:large;}hr{height:1px;border:0;color:black;background-color:black;}</style>
</head>
<body>
<h1>Moonman Devlog Archive</h1>
<h2>Archived on %s</h2>
""" % (date.today()))
        # Write links to the other pages at the top...
        page.append("<br /><div>Page ")
        for p in range(num_pages):
            page.append('<a href="page_%d.html">%d </a>' % (p, p))
        page.append("</div><br />\n")

        # ...then this page's slice of posts...
        for j in range(i * skip, min(max_pages, i * skip + skip)):
            page.append(pages[j])

        # ...then the page links again at the bottom.
        page.append("<br /><div>Page ")
        for p in range(num_pages):
            page.append('<a href="page_%d.html">%d </a>' % (p, p))
        page.append("</div><br />\n")

        page.append("</body></html>")

        output_file = open(dir_name + "/page_%d.html" % (i), 'w')
        output_file.write(''.join(page))
        output_file.close()
        i = i + 1

if len(sys.argv) != 2:
    print("Usage: %s file" % sys.argv[0])
else:
    print("Processing devlog \"" + sys.argv[1] + "\"")
    process(sys.argv[1])
    print("Success")
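
# Usage sketch (file names below are hypothetical): save this script as, say,
# tigsource_devlog_archiver.py, save the devlog thread's "print page" from the
# TIGSource forum next to it as printpage.html, then run it under Python 2 with
# BeautifulSoup 4 installed (pip install beautifulsoup4):
#
#   python2 tigsource_devlog_archiver.py printpage.html
#
# The archive is written to devlog_printpage/ as page_0.html, page_1.html, ...
# alongside the downloaded images.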