Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Rough WIP text-based web view plugin for Sublime Text.
# In the console, run view.run_command("textweb") with one or more URLs
# selected; for each selected URL this opens a new file view containing a
# plain-text version of the requested web page.
import html
import html.parser
import re
import urllib.request

import sublime
import sublime_plugin
def is_whitespace(text):
    """Check whether ``text`` begins with whitespace.

    Returns the ``re`` match object covering the leading run of whitespace
    (truthy), or ``None`` when ``text`` is empty or starts with any other
    character.
    """
    leading_whitespace = re.match(r'\s+', text)
    return leading_whitespace
def clean_whitespace(text):
    """Trim ``text`` and squeeze every inner whitespace run to one space."""
    trimmed = text.strip()
    collapsed = re.sub(r'\s+', " ", trimmed)
    return collapsed
def clean_lines(text):
    """Strip leading and trailing space characters from every line.

    Only literal spaces are removed; tabs and the line breaks themselves are
    left in place.
    """
    # Multiline pattern: capture each line's content without the surrounding
    # spaces, then substitute the whole line with just that capture.
    line_pattern = r'(?m)^ *(.*?) *$'
    content_only = r'\1'
    return re.sub(line_pattern, content_only, text)
def get_attr(name, attrs):
    """Look up an attribute value in a sequence of ``(name, value)`` pairs.

    Returns the value of the first pair whose name equals ``name``, or
    ``None`` when no pair matches.
    """
    matching_values = (pair[1] for pair in attrs if pair[0] == name)
    return next(matching_values, None)
class TextWebNode(object):
    """Base node: accumulates text and renders it unchanged."""

    def __init__(self):
        self.text = ""

    def __str__(self):
        return self.text

    def append(self, text):
        """Add ``text`` to this node's accumulated text.

        A single space is inserted between the existing text and the new
        fragment when both are non-empty and neither the last existing
        character nor the first new character is whitespace.
        """
        needs_separator = (
            len(text)
            and len(self.text)
            and not is_whitespace(self.text[-1])
            and not is_whitespace(text[0])
        )
        separator = " " if needs_separator else ""
        self.text = self.text + separator + text
class TextWebIgnoreNode(TextWebNode):
    """Node that swallows everything appended to it and renders as nothing.

    The inherited constructor already initialises ``text`` to an empty
    string, so no constructor is defined here.
    """

    def append(self, text):
        """Discard ``text``."""
        return None

    def __str__(self):
        return ""
class TextWebHeaderNode(TextWebNode):
    """Heading node rendered as ``level`` ``#`` characters, a space and the text."""

    def __init__(self, level):
        super(TextWebHeaderNode, self).__init__()
        self.level = level

    def __str__(self):
        marker = "#" * self.level
        # The trailing blank line separates the heading from what follows.
        return marker + " " + self.text + "\n\n"
class TextWebHyperlinkNode(TextWebNode):
    """Link node rendered as its text followed by ``[url: <href>]``."""

    def __init__(self, href):
        super(TextWebHyperlinkNode, self).__init__()
        self.href = href

    def __str__(self):
        # A missing (None) or empty href renders as the bare link text.
        if not self.href:
            return self.text
        return self.text + " [url: " + self.href + "]"
class TextWebInlineNode(TextWebNode):
    """Inline node: collects text and renders it without decoration.

    It adds no behaviour to the base node; the distinct type lets the parser
    tell it apart from other open nodes.
    """
class TextWebParagraphNode(TextWebNode):
    """Paragraph node rendered as its text followed by a blank line."""

    def __str__(self):
        paragraph_break = "\n\n"
        return self.text + paragraph_break
class TextWebListItemNode(TextWebNode):
    """List item node rendered as a ``- `` bullet on its own line."""

    def __str__(self):
        bullet = "- "
        return bullet + self.text + "\n"
class TextWebRowNode(TextWebNode):
    """Table row node rendered between ``| `` and `` |`` on its own line."""

    def __str__(self):
        row_start = "| "
        row_end = " |\n"
        return row_start + self.text + row_end
def is_header_node(tag):
    """Return the heading level encoded in an ``h<digit>`` tag name.

    Args:
        tag: Tag name, e.g. ``"h2"``.

    Returns:
        The integer following the ``h`` (``2`` for ``"h2"``), or ``0`` when
        ``tag`` is not an ``h`` followed by exactly one digit, so ``"hr"``,
        ``"h10"`` and ``"p"`` all yield ``0``.
    """
    if len(tag) != 2 or tag[0] != "h":
        return 0
    try:
        return int(tag[1])
    except (TypeError, ValueError):
        # Only a failed conversion means "not a heading"; a bare ``except``
        # here would also swallow KeyboardInterrupt and SystemExit.
        return 0
class TextWebHTMLParser(html.parser.HTMLParser):
    """HTML parser that renders a page as Markdown-like plain text.

    Open elements are tracked on ``self.stack``. Index 0 is the root node,
    which accumulates the finished document and is never removed. When an
    element ends, its node is popped and its rendering is appended to the
    node below it.
    """

    def __init__(self, *args, **kwargs):
        self.stack = [TextWebNode()]
        super(TextWebHTMLParser, self).__init__(*args, **kwargs)

    def push(self, node):
        """Open ``node`` as the innermost element."""
        self.stack.append(node)

    def _close_top(self):
        """Pop the innermost open node and append its rendering to its parent."""
        finished = self.stack.pop()
        self.stack[-1].append(str(finished))

    def pop(self, node_type):
        """Close and return the innermost open node of ``node_type``.

        Nodes opened after the match (elements whose end tag was omitted) are
        closed implicitly and their rendering is folded into their parent, so
        their text is kept rather than thrown away.

        Returns:
            The removed node, or ``None`` when no node of ``node_type`` is
            open. The root node is never removed or returned; returning it
            would make the caller append the document to itself.
        """
        # Search from the top of the stack down to, but excluding, the root.
        for index in range(len(self.stack) - 1, 0, -1):
            if isinstance(self.stack[index], node_type):
                break
        else:
            # Stray end tag: leave the stack untouched.
            return None
        while len(self.stack) - 1 > index:
            self._close_top()
        return self.stack.pop()

    def handle_starttag(self, tag, attrs):
        """Open a node for ``tag`` or emit inline text for ``hr``/``img``."""
        level = is_header_node(tag)
        if tag == "hr":
            self.stack[-1].append("\n\n---\n\n")
        elif level:
            self.push(TextWebHeaderNode(level))
        elif tag in ("p", "div"):
            self.push(TextWebParagraphNode())
        elif tag in ("span", "label", "td"):
            self.push(TextWebInlineNode())
        elif tag == "tr":
            self.push(TextWebRowNode())
        elif tag == "li":
            self.push(TextWebListItemNode())
        elif tag == "img":
            self.stack[-1].append("[image: %s]" % get_attr("src", attrs))
        elif tag == "a":
            self.push(TextWebHyperlinkNode(get_attr("href", attrs)))
        elif tag in ("script", "style"):
            self.push(TextWebIgnoreNode())

    def handle_endtag(self, tag):
        """Close the node opened for ``tag`` and append it to its parent."""
        if is_header_node(tag):
            node_type = TextWebHeaderNode
        elif tag in ("p", "div"):
            node_type = TextWebParagraphNode
        elif tag in ("span", "label", "td"):
            node_type = TextWebInlineNode
        elif tag == "tr":
            node_type = TextWebRowNode
        elif tag == "li":
            node_type = TextWebListItemNode
        elif tag == "a":
            node_type = TextWebHyperlinkNode
        elif tag in ("script", "style"):
            node_type = TextWebIgnoreNode
        else:
            # Tags that never open a node have nothing to close.
            return
        ended = self.pop(node_type)
        if ended is not None:
            self.stack[-1].append(str(ended))

    def handle_startendtag(self, tag, attrs):
        """Handle XHTML-style self-closing tags such as ``<img ... />``.

        Only ``hr`` and ``img`` are rendered, because they emit text without
        opening a node. Every other self-closing tag is ignored.
        """
        if tag in ("hr", "img"):
            self.handle_starttag(tag, attrs)

    def handle_data(self, data):
        """Append whitespace-normalised text to the innermost open node."""
        self.stack[-1].append(clean_whitespace(data))

    def close(self):
        """Finish parsing and fold any still-open nodes into the root.

        Without this, text inside elements that were never closed would not
        reach the root node that ``get_content`` reads.
        """
        super(TextWebHTMLParser, self).close()
        while len(self.stack) > 1:
            self._close_top()

    def get_content(self):
        """Return the root node's text with blank-line runs and line padding cleaned.

        Runs of two or more newlines are collapsed to one blank line and
        leading/trailing spaces are stripped from every line.
        """
        document = self.stack[0].text + "\n"
        return clean_lines(re.sub(r'\n\n+', "\n\n", document))
class TextWebCommand(sublime_plugin.TextCommand):
    """Text command that opens each selected URL as plain text in a new view."""

    def name(self):
        """Return the command name, ``"textweb"``."""
        return "textweb"

    def run(self, edit):
        """Open one new file view per selected URL and load the page into it.

        Selections that are empty or contain only whitespace are skipped, so
        no blank tab is opened for them.
        """
        # TODO: Prompt for a web address when nothing usable is selected?
        for region in self.view.sel():
            url = self.view.substr(region).strip()
            if not url:
                continue
            view = sublime.active_window().new_file()
            self.load_url(view, url)

    def load_url(self, view, url):
        """Fetch ``url``, convert the HTML to text and append it to ``view``.

        Errors raised by ``urllib.request.urlopen`` propagate to the caller.
        """
        print("Loading url:", url)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(url) as response:
            raw = response.read()
            charset = response.info().get_content_charset() or "utf-8"
        try:
            # Undecodable bytes are replaced rather than aborting the page.
            content = raw.decode(charset, errors="replace")
        except LookupError:
            # The server declared a charset Python does not know.
            content = raw.decode("utf-8", errors="replace")
        parser = TextWebHTMLParser()
        parser.feed(content)
        parser.close()
        # https://stackoverflow.com/a/43582267/3478907
        view.run_command("append", {
            "characters": "Textweb " + url + "\n" + parser.get_content(),
            "scroll_to_end": False
        })
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement