tigsource devlog archiver

a guest
Jul 29th, 2014
Python
# Processes the tigsource printpage html file,
# grabs all the images and dumps out a new html file.
# Written for Python 2 (it uses HTMLParser, urlparse and urllib from the 2.x stdlib).

from __future__ import print_function
import os, sys, shutil, re, math
from datetime import date
from HTMLParser import HTMLParser
from urlparse import urlparse
import urllib
import uuid
from bs4 import BeautifulSoup

IMAGE_EXTENSIONS = [".png", ".gif", ".jpg", ".jpeg"]
VALID_URL_CHARS = r"A-Za-z0-9\-._~:/?#\[\]@!$&'*+,;="

# Create a subclass and override the handler methods. The printpage shows
# inline links and images as bare URLs wrapped in parentheses, so handle_data()
# scans the text for "(...)" spans and sorts any http/https URLs it finds into
# images and plain links.
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        # Instance attributes, so state isn't shared between parser instances.
        self.images = []
        self.links = []

    def handle_starttag(self, tag, attrs):
        pass

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        for ref in re.findall(r'\([' + VALID_URL_CHARS + r']*\)', data):
            ref = ref.strip('()')
            url = urlparse(ref)
            if url.scheme == "http" or url.scheme == "https":
                url = url.geturl()
                # Anything ending in a known image extension counts as an image.
                isImage = any(url.endswith(ext) for ext in IMAGE_EXTENSIONS)
                if isImage:
                    if url not in self.images:
                        self.images.append(url)
                elif url not in self.links:
                    self.links.append(url)

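# A minimal illustration of what handle_data() extracts, assuming the printpage
# text contains parenthesised URLs (the sample string is hypothetical):
#
#   >>> re.findall(r'\([' + VALID_URL_CHARS + r']*\)',
#   ...            "New build today (http://example.com/shot.png) enjoy!")
#   ['(http://example.com/shot.png)']
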
def process(file):
    assert os.path.isfile(file), "Input path isn't a file"
    dir_name = "devlog_" + os.path.splitext(file)[0]
    print("Making dir: ", dir_name)
    shutil.rmtree(dir_name, ignore_errors=True)
    try:
        os.mkdir(dir_name)
    except OSError:
        pass

    # Instantiate the parser and feed it the whole printpage HTML.
    parser = MyHTMLParser()
    f = open(file)
    content = f.read()
    f.close()
    parser.feed(content)

    # Turn the parenthesised plain links back into anchor tags.
    for link in parser.links:
        content = content.replace("(" + link + ")", '<a href="%s">%s</a>' % (link, link))

    # Download each image under a unique local name and rewrite its
    # parenthesised URL in the page as an <img> tag pointing at the local copy.
    # TODO: depending on size, shrink the image down and show a thumbnail
    count = 0
    total = len(parser.images)
    for image in parser.images:
        count = count + 1
        print("image %d/%d" % (count, total))
        ext = image.rsplit(".", 1)[-1]
        newname = str(uuid.uuid4()) + "." + ext
        urllib.urlretrieve(image, dir_name + "/" + newname)
        content = content.replace("(" + image + ")", '<img src="%s"/>' % (newname))

    # Now split it all up: the printpage separates posts with <hr size="2">
    # rules, so each post is the run of elements between two of those tags.
    soup = BeautifulSoup(content, "html.parser")
    pages = []
    hrtags = soup.find_all('hr', attrs={'size': re.compile("2")})

    def next_element(elem):
        # Find the next sibling element, skipping NavigableString objects.
        while elem is not None:
            elem = elem.next_sibling
            if hasattr(elem, 'name'):
                return elem

    for hrtag in hrtags:
        page = [str(hrtag)]
        elem = next_element(hrtag)
        while elem:
            if elem.name == 'hr' and 'size' in elem.attrs:
                break
            page.append(str(elem))
            elem = next_element(elem)
        pages.append(''.join(page))

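    # A small illustration of the split above, on hypothetical input: a body like
    #
    #   <hr size="2"/><div>first post</div><hr size="2"/><div>second post</div>
    #
    # yields two entries in `pages`, each starting with its <hr/> separator and
    # holding the tags that follow it up to the next separator. Bare text nodes
    # sitting directly between separators are skipped by next_element().
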
    # Write the posts out in batches of `skip` per output HTML page.
    i = 0
    skip = 40
    max_pages = len(pages)
    # Round up so the trailing posts still get a page of their own.
    num_pages = max(1, int(math.ceil(max_pages / float(skip))))

    while i < num_pages:
        page = []
        page.append("""
<!DOCTYPE html>
<html>
<head>
<title>Moonman Devlog Archive</title>
<style type="text/css">body{color:black;background-color:white;}body,td,.normaltext{font-family:Verdana,arial,helvetica,serif;font-size:small;}*,a:link,a:visited,a:hover,a:active{color:black!important;}table{empty-cells:show;}.code{font-size:x-small;font-family:monospace;border:1px solid black;margin:1px;padding:1px;}.quote{font-size:x-small;border:1px solid black;margin:1px;padding:1px;}.smalltext,.quoteheader,.codeheader{font-size:x-small;}.largetext{font-size:large;}hr{height:1px;border:0;color:black;background-color:black;}</style>
</head>
<body>
<h1>Moonman Devlog Archive</h1>
<h2>Archived on %s</h2>
""" % (date.today()))
        # Write links to the other pages at the top...
        page.append("<br /><div>Page ")
        for p in range(num_pages):
            page.append('<a href="page_%d.html">%d </a>' % (p, p))
        page.append("</div><br />\n")

        # ...then this page's slice of posts...
        for j in range(i * skip, min(max_pages, i * skip + skip)):
            page.append(pages[j])

        # ...then the page links again at the bottom.
        page.append("<br /><div>Page ")
        for p in range(num_pages):
            page.append('<a href="page_%d.html">%d </a>' % (p, p))
        page.append("</div><br />\n")

        page.append("</body></html>")

        output_file = open(dir_name + "/page_%d.html" % (i), 'w')
        output_file.write(''.join(page))
        output_file.close()
        i = i + 1

if len(sys.argv) != 2:
    print("Usage: %s file" % sys.argv[0])
else:
    print("Processing devlog \"" + sys.argv[1] + "\"")
    process(sys.argv[1])
    print("Success")
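
# Usage sketch (file names below are hypothetical): save this script as, say,
# tigsource_devlog_archiver.py, save the devlog thread's "print page" from the
# TIGSource forum next to it as printpage.html, then run it under Python 2 with
# BeautifulSoup 4 installed (pip install beautifulsoup4):
#
#   python2 tigsource_devlog_archiver.py printpage.html
#
# The archive is written to devlog_printpage/ as page_0.html, page_1.html, ...
# alongside the downloaded images.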