# TextWeb Sublime Text Plugin

# Rough WIP text-based web view plugin for Sublime Text
# Usage: select one or more URLs, then in the console run
# run_command("textweb"). For each selected URL, a new file view is opened
# containing a plain text version of the requested webpage.

import html
import html.parser
import re
import urllib.request

import sublime
import sublime_plugin

def is_whitespace(text):
    """Match a run of whitespace at the start of ``text``.

    Returns the ``re`` match object for the leading whitespace (truthy), or
    ``None`` when ``text`` does not begin with whitespace. Callers in this
    file pass a single character and use the result only for its truthiness.
    """
    leading_whitespace = re.compile(r'\s+')
    return leading_whitespace.match(text)

def clean_whitespace(text):
    """Return ``text`` trimmed, with every whitespace run collapsed to a space."""
    trimmed = text.strip()
    collapsed = re.sub(r'\s+', " ", trimmed)
    return collapsed

def clean_lines(text):
    """Strip leading and trailing spaces from every line of ``text``.

    Only the space character is removed; line breaks and the text between
    the outer spaces of each line are kept as they are.
    """
    # (?m) makes ^ and $ anchor at each line, so the capture group is the
    # line's content without its surrounding spaces.
    line_pattern = re.compile(r'(?m)^ *(.*?) *$')
    return line_pattern.sub(r'\1', text)

def get_attr(name, attrs):
    """Look up an attribute value in a sequence of ``(name, value)`` pairs.

    Returns the value of the first pair whose name equals ``name``, or
    ``None`` when no pair matches.
    """
    matching_values = (attr[1] for attr in attrs if attr[0] == name)
    return next(matching_values, None)

class TextWebNode(object):
    """Base text accumulator; renders as exactly the text appended to it."""

    def __init__(self):
        self.text = ""

    def __str__(self):
        return self.text

    def append(self, text):
        """Add ``text`` to the end of this node's text.

        A single space is inserted at the join when both sides are non-empty
        and neither the last existing character nor the first new character
        is whitespace.
        """
        needs_space = (
            bool(text)
            and bool(self.text)
            and not is_whitespace(self.text[-1])
            and not is_whitespace(text[0])
        )
        joiner = " " if needs_space else ""
        self.text = self.text + joiner + text

class TextWebIgnoreNode(TextWebNode):
    """Node that discards everything appended to it and renders as nothing.

    The parser in this file pushes it for ``script`` and ``style`` tags.
    """

    def __init__(self):
        super().__init__()

    def __str__(self):
        return ""

    def append(self, text):
        # Deliberately drop the text.
        return None

class TextWebHeaderNode(TextWebNode):
    """Heading node rendered as ``level`` '#' characters, a space, the text
    and a blank line."""

    def __init__(self, level):
        super().__init__()
        self.level = level

    def __str__(self):
        marker = "#" * self.level
        return "".join((marker, " ", self.text, "\n\n"))

class TextWebHyperlinkNode(TextWebNode):
    """Link node rendered as its text followed by ``[url: <href>]``.

    When ``href`` is ``None`` or empty, only the text is rendered.
    """

    def __init__(self, href):
        super().__init__()
        self.href = href

    def __str__(self):
        if not self.href:
            return self.text
        return self.text + " [url: " + self.href + "]"

class TextWebInlineNode(TextWebNode):
    """Inline node; renders as its plain text, like the base node.

    It exists as a distinct type so the parser can match the end tags of the
    elements it pushes this node for.
    """

    def __init__(self):
        super().__init__()

class TextWebParagraphNode(TextWebNode):
    """Block node rendered as its text followed by a blank line."""

    def __init__(self):
        super().__init__()

    def __str__(self):
        return "".join((self.text, "\n\n"))

class TextWebListItemNode(TextWebNode):
    """List item node rendered as a ``- `` bullet on its own line."""

    def __init__(self):
        super().__init__()

    def __str__(self):
        return "".join(("- ", self.text, "\n"))

class TextWebRowNode(TextWebNode):
    """Table row node rendered as its text between ``| `` and `` |`` on its
    own line."""

    def __init__(self):
        super().__init__()

    def __str__(self):
        return "".join(("| ", self.text, " |\n"))

def is_header_node(tag):
    """Return the heading level encoded in a two-character ``hN`` tag name.

    Args:
        tag: Tag name, e.g. ``"h2"``.

    Returns:
        The integer value of the second character when ``tag`` is ``"h"``
        followed by one digit (so ``"h2"`` gives 2), otherwise 0. A result
        of 0 is falsy, so the return value doubles as an "is this a header
        tag" test; note that ``"h0"`` therefore counts as not a header.
    """
    if len(tag) != 2 or tag[0] != "h":
        return 0
    try:
        return int(tag[1])
    except ValueError:
        # Second character is not a digit, e.g. "hr".
        return 0

class TextWebHTMLParser(html.parser.HTMLParser):
    """HTML parser that builds a rough plain-text rendering of a document.

    Open elements are tracked as a stack of TextWebNode objects. The first
    entry is the root node that collects the finished text and is never
    popped. When an element is closed, its rendered text is appended to the
    node below it on the stack.

    Feed the document with ``feed()``, call ``close()``, then read the
    result with ``get_content()``.
    """

    def __init__(self, *args, **kwargs):
        self.stack = [TextWebNode()]
        super(TextWebHTMLParser, self).__init__(*args, **kwargs)

    def push(self, node):
        """Open ``node`` by placing it on top of the stack."""
        self.stack.append(node)

    def _fold_top(self):
        """Close the top node and append its rendered text to its parent."""
        finished = self.stack.pop()
        self.stack[-1].append(str(finished))

    def pop(self, node_type):
        """Close and return the innermost open node of ``node_type``.

        Nodes opened after the matching node and never closed are folded
        into their parents first, so their text is kept rather than dropped.

        Returns:
            The removed node, or ``None`` when no node of ``node_type`` is
            open (a stray end tag); the stack is left untouched in that
            case.
        """
        match_index = None
        # Search from the top down; index 0 is the root and never matches.
        for index in range(len(self.stack) - 1, 0, -1):
            if isinstance(self.stack[index], node_type):
                match_index = index
                break
        if match_index is None:
            return None
        while len(self.stack) - 1 > match_index:
            self._fold_top()
        return self.stack.pop()

    def handle_starttag(self, tag, attrs):
        """Open a node for ``tag``, or emit text directly for hr and img.

        Tags not listed here are ignored; their text content still reaches
        the currently open node through ``handle_data``.
        """
        header = is_header_node(tag)
        if tag == "hr":
            self.stack[-1].append("\n\n---\n\n")
        elif header:
            self.push(TextWebHeaderNode(header))
        elif tag in ("p", "div"):
            self.push(TextWebParagraphNode())
        elif tag in ("span", "label", "td"):
            self.push(TextWebInlineNode())
        elif tag == "tr":
            self.push(TextWebRowNode())
        elif tag == "li":
            self.push(TextWebListItemNode())
        elif tag == "img":
            self.stack[-1].append("[image: %s]" % get_attr("src", attrs))
        elif tag == "a":
            self.push(TextWebHyperlinkNode(get_attr("href", attrs)))
        elif tag in ("script", "style"):
            self.push(TextWebIgnoreNode())

    def handle_endtag(self, tag):
        """Close the node matching ``tag`` and append it to its parent."""
        if is_header_node(tag):
            ended = self.pop(TextWebHeaderNode)
        elif tag in ("p", "div"):
            ended = self.pop(TextWebParagraphNode)
        elif tag in ("span", "label", "td"):
            ended = self.pop(TextWebInlineNode)
        elif tag == "tr":
            ended = self.pop(TextWebRowNode)
        elif tag == "li":
            ended = self.pop(TextWebListItemNode)
        elif tag == "a":
            ended = self.pop(TextWebHyperlinkNode)
        elif tag in ("script", "style"):
            ended = self.pop(TextWebIgnoreNode)
        else:
            ended = None
        # ended is None for unhandled tags and for stray end tags.
        if ended is not None:
            self.stack[-1].append(str(ended))

    def handle_startendtag(self, tag, attrs):
        """Treat a self-closing tag as a start tag followed by an end tag.

        This makes ``<img ... />`` and ``<hr />`` render the same way as
        their unclosed forms.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_data(self, data):
        """Append whitespace-normalised text to the innermost open node."""
        self.stack[-1].append(clean_whitespace(data))

    def close(self):
        """Finish parsing and fold still-open nodes into the root.

        Without this, text inside elements that were never closed would
        never reach the root node that ``get_content`` reads.
        """
        super(TextWebHTMLParser, self).close()
        while len(self.stack) > 1:
            self._fold_top()

    def get_content(self):
        """Return the root text with blank-line runs collapsed to one blank
        line and the spaces around each line removed."""
        collapsed = re.sub(r'\n\n+', "\n\n", self.stack[0].text + "\n")
        return clean_lines(collapsed)

class TextWebCommand(sublime_plugin.TextCommand):
    """Text command that opens a plain-text rendering of each selected URL
    in a new file view."""

    def name(self):
        """Return the command name, as used in run_command("textweb")."""
        return "textweb"

    def run(self, edit):
        """Open one new file view per non-empty selected region.

        The text of each selection region is used as the URL, with
        surrounding whitespace removed. Regions that are empty or contain
        only whitespace are skipped.
        """
        for region in self.view.sel():
            url = self.view.substr(region).strip()
            if not url:
                # Nothing selected in this region, so there is no URL to load.
                # TODO: Prompt for a web address?
                continue
            view = sublime.active_window().new_file()
            self.load_url(view, url)

    def load_url(self, view, url):
        """Fetch ``url``, convert the page to plain text and append it to
        ``view``.

        The body is decoded with the charset declared in the response's
        Content-Type header, falling back to UTF-8 when none is declared or
        the declared name is unknown. Undecodable bytes are replaced rather
        than raising.

        Raises:
            Whatever ``urllib.request.urlopen`` raises for a malformed or
            unreachable URL; these are not caught here.
        """
        print("Loading url:", url)
        # The with-block closes the connection once the body has been read.
        with urllib.request.urlopen(url) as response:
            raw = response.read()
            charset = response.info().get_content_charset() or "utf-8"
        try:
            content = raw.decode(charset, "replace")
        except LookupError:
            # The server declared a codec name Python does not know.
            content = raw.decode("utf-8", "replace")
        parser = TextWebHTMLParser()
        parser.feed(content)
        parser.close()
        # https://stackoverflow.com/a/43582267/3478907
        view.run_command("append", {
            "characters": "Textweb " + url + "\n" + parser.get_content(),
            "scroll_to_end": False
        })