# Processes the tigsource printpage html file,
# grabs all the images and dumps out a new html file
from __future__ import print_function
import os, sys, shutil, re, math
import os.path
from datetime import date
from HTMLParser import HTMLParser
from urlparse import urlparse
import urllib
import uuid
from bs4 import BeautifulSoup

IMAGE_EXTENSIONS = [".png", ".gif", ".jpg", ".jpeg"]
VALID_URL_CHARS = r"A-Za-z0-9\-._~:/?#\[\]@!$&'*+,;="
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        # collect image URLs and ordinary links separately
        self.images = []
        self.links = []

    def handle_starttag(self, tag, attrs):
        # print("Encountered a start tag:", tag, attrs)
        pass

    def handle_endtag(self, tag):
        # print("Encountered an end tag :", tag)
        pass

    def handle_data(self, data):
        # print("Encountered some data :", data)
        # The printpage dumps URLs as plain text in parentheses, so scan the
        # text nodes for "(...)" groups that look like URLs.
        for ref in re.findall(r'\([' + VALID_URL_CHARS + r']*\)', data):
            ref = ref.strip('()')
            url = urlparse(ref)
            if url.scheme == "http" or url.scheme == "https":
                url = url.geturl()
                isImage = False
                for ext in IMAGE_EXTENSIONS:
                    if url.endswith(ext):
                        isImage = True
                if isImage:
                    if url not in self.images:
                        self.images.append(url)
                else:
                    self.links.append(url)
def process(file):
    assert os.path.isfile(file), "Input path isn't a file"
    dir_name = "devlog_" + os.path.splitext(file)[0]
    print("Making dir: ", dir_name)
    shutil.rmtree(dir_name, ignore_errors=True)
    try:
        os.mkdir(dir_name)
    except OSError:
        pass

    # instantiate the parser and feed it the whole HTML file
    parser = MyHTMLParser()
    f = open(file)
    content = f.read()
    f.close()
    parser.feed(content)
    # Turn plain-text "(url)" references back into real anchors
    for link in parser.links:
        content = content.replace("(" + link + ")", '<a href="%s">%s</a>' % (link, link))

    # Download every referenced image and point the page at the local copy
    count = 0
    total = len(parser.images)
    for image in parser.images:
        count = count + 1
        print("image %d/%d" % (count, total))
        # TODO: depending on size, shrink the downloaded image and show a
        # thumbnail (see the make_thumbnail sketch above)
        # generate a new name...
        ext = image.rsplit(".", 1)[-1]
        newname = str(uuid.uuid4()) + "." + ext
        urllib.urlretrieve(image, dir_name + "/" + newname)
        content = content.replace("(" + image + ")", '<img src="%s"/>' % (newname))
    # Now split it all up: each post on the printpage is separated by an
    # <hr size="2">, so collect everything between consecutive separators.
    soup = BeautifulSoup(content, "html.parser")
    pages = []
    hrtags = soup.find_all('hr', attrs={'size': re.compile("2")})

    def next_element(elem):
        # Find the next sibling tag, skipping NavigableString objects
        while elem is not None:
            elem = elem.next_sibling
            if hasattr(elem, 'name'):
                return elem

    for hrtag in hrtags:
        page = [str(hrtag)]
        elem = next_element(hrtag)
        while elem:
            if elem.name == 'hr' and 'size' in elem.attrs:
                break
            page.append(str(elem))
            elem = next_element(elem)
        pages.append(''.join(page))
    # Write the posts out in chunks of `skip` per output HTML page
    i = 0
    skip = 40
    max_pages = len(pages)
    num_pages = max(1, int(math.ceil(max_pages / float(skip))))
    while i < num_pages:
        page = []
        page.append("""
<!DOCTYPE html>
<html>
<head>
<title>Moonman Devlog Archive</title>
<style type="text/css">body{color:black;background-color:white;}body,td,.normaltext{font-family:Verdana,arial,helvetica,serif;font-size:small;}*,a:link,a:visited,a:hover,a:active{color:black!important;}table{empty-cells:show;}.code{font-size:x-small;font-family:monospace;border:1px solid black;margin:1px;padding:1px;}.quote{font-size:x-small;border:1px solid black;margin:1px;padding:1px;}.smalltext,.quoteheader,.codeheader{font-size:x-small;}.largetext{font-size:large;}hr{height:1px;border:0;color:black;background-color:black;}</style>
</head>
<body>
<h1>Moonman Devlog Archive</h1>
<h2>Archived on %s</h2>
""" % (date.today()))

        # Write links to the other pages
        page.append("<br /><div>Page ")
        for p in range(num_pages):
            page.append('<a href="page_%d.html">%d </a>' % (p, p))
        page.append("</div><br />\n")

        # Write this chunk of posts
        for j in range(i * skip, min(max_pages, i * skip + skip)):
            page.append(pages[j])

        # Repeat the page links at the bottom
        page.append("<br /><div>Page ")
        for p in range(num_pages):
            page.append('<a href="page_%d.html">%d </a>' % (p, p))
        page.append("</div><br />\n")

        page.append("</body></html>")
        output_file = open(dir_name + "/page_%d.html" % (i), 'w')
        output_file.write(''.join(page))
        output_file.close()
        i = i + 1
if len(sys.argv) != 2:
    print("Usage: %s file" % sys.argv[0])
else:
    print("Processing devlog \"" + sys.argv[1] + "\"")
    process(sys.argv[1])
    print("Success")