Advertisement
pineapplemachine

TextWeb Sublime Text Plugin

Jul 9th, 2020
217
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.86 KB | None | 0 0
  1. # Rough WIP text-based web view plugin for Sublime Text
  2. # In the console: run_command("textweb") with one or more URLs selected,
  3. # this will open a new file view with a plain text version of the
  4. # requested webpage, for each URL provided.
  5.  
import html
import html.parser
import re
import urllib.request

import sublime
import sublime_plugin
  12.  
  13. def is_whitespace(text):
  14.     return re.match(r'\s+', text)
  15.    
  16. def clean_whitespace(text):
  17.     return re.sub(r'\s+', " ", text.strip())
  18.  
  19. def clean_lines(text):
  20.     return re.sub(r'(?m)^ *(.*?) *$', r'\1', text)
  21.  
  22. def get_attr(name, attrs):
  23.     for attr in attrs:
  24.         if attr[0] == name:
  25.             return attr[1]
  26.     return None
  27.  
  28. class TextWebNode(object):
  29.     def __init__(self):
  30.         self.text = ""
  31.     def __str__(self):
  32.         return self.text
  33.     def append(self, text):
  34.         if len(text) and len(self.text) and (
  35.             not is_whitespace(self.text[-1]) and
  36.             not is_whitespace(text[0])
  37.         ):
  38.             self.text = self.text + " " + text
  39.         else:
  40.             self.text = self.text + text
  41.  
  42. class TextWebIgnoreNode(TextWebNode):
  43.     def __init__(self):
  44.         self.text = ""
  45.     def append(self, text):
  46.         pass
  47.     def __str__(self):
  48.         return ""
  49.        
  50. class TextWebHeaderNode(TextWebNode):
  51.     def __init__(self, level):
  52.         self.level = level
  53.         self.text = ""
  54.     def __str__(self):
  55.         return ("#" * self.level) + " " + self.text + "\n\n"
  56.  
  57. class TextWebHyperlinkNode(TextWebNode):
  58.     def __init__(self, href):
  59.         self.text = ""
  60.         self.href = href
  61.     def __str__(self):
  62.         if self.href is not None and len(self.href):
  63.             return self.text + " [url: " + self.href + "]"
  64.         else:
  65.             return self.text
  66.        
  67. class TextWebInlineNode(TextWebNode):
  68.     def __init__(self):
  69.         self.text = ""
  70.  
  71. class TextWebParagraphNode(TextWebNode):
  72.     def __init__(self):
  73.         self.text = ""
  74.     def __str__(self):
  75.         return self.text + "\n\n"
  76.  
  77. class TextWebListItemNode(TextWebNode):
  78.     def __init__(self):
  79.         self.text = ""
  80.     def __str__(self):
  81.         return "- " + self.text + "\n"
  82.  
  83. class TextWebRowNode(TextWebNode):
  84.     def __init__(self):
  85.         self.text = ""
  86.     def __str__(self):
  87.         return "| " + self.text + " |\n"
  88.  
  89. def is_header_node(tag):
  90.     if len(tag) == 2 and tag[0] == "h":
  91.         try:
  92.             return int(tag[1])
  93.         except:
  94.             return 0
  95.     else:
  96.         return 0
  97.  
  98. class TextWebHTMLParser(html.parser.HTMLParser):
  99.     def __init__(self, *args, **kwargs):
  100.         self.stack = [TextWebNode()]
  101.         super(TextWebHTMLParser, self).__init__(*args, **kwargs)
  102.    
  103.     def push(self, node):
  104.         self.stack.append(node)
  105.    
  106.     def pop(self, node_type):
  107.         while len(self.stack) > 1 and not isinstance(self.stack[-1], node_type):
  108.             self.stack.pop()
  109.         if len(self.stack) > 1:
  110.             return self.stack.pop()
  111.         else:
  112.             return self.stack[0]
  113.        
  114.     def handle_starttag(self, tag, attrs):
  115.         header = is_header_node(tag)
  116.         if tag == "hr":
  117.             self.stack[-1].append("\n\n---\n\n")
  118.         elif header:
  119.             self.push(TextWebHeaderNode(header))
  120.         elif tag == "p" or tag == "div":
  121.             self.push(TextWebParagraphNode())
  122.         elif tag == "span" or tag == "label" or tag == "td":
  123.             self.push(TextWebInlineNode())
  124.         elif tag == "tr":
  125.             self.push(TextWebRowNode())
  126.         elif tag == "li":
  127.             self.push(TextWebListItemNode())
  128.         elif tag == "img" and len(self.stack):
  129.             self.stack[-1].append("[image: %s]" % get_attr("src", attrs))
  130.         elif tag == "a":
  131.             self.push(TextWebHyperlinkNode(get_attr("href", attrs)))
  132.         elif tag == "script" or tag == "style":
  133.             self.push(TextWebIgnoreNode())
  134.        
  135.     def handle_endtag(self, tag):
  136.         if is_header_node(tag):
  137.             ended = self.pop(TextWebHeaderNode)
  138.         elif tag == "p" or tag == "div":
  139.             ended = self.pop(TextWebParagraphNode)
  140.         elif tag == "span" or tag == "label" or tag == "td":
  141.             ended = self.pop(TextWebInlineNode)
  142.         elif tag == "tr":
  143.             ended = self.pop(TextWebRowNode)
  144.         elif tag == "li":
  145.             ended = self.pop(TextWebListItemNode)
  146.         elif tag == "a":
  147.             ended = self.pop(TextWebHyperlinkNode)
  148.         elif tag == "script" or tag == "style":
  149.             ended = self.pop(TextWebIgnoreNode)
  150.         else:
  151.             ended = None
  152.         if ended is not None and len(self.stack):
  153.             self.stack[-1].append(str(ended))
  154.            
  155.     def handle_startendtag(self, tag, attrs):
  156.         pass
  157.    
  158.     def handle_data(self, data):
  159.         if len(self.stack):
  160.             self.stack[-1].append(clean_whitespace(data))
  161.    
  162.     def get_content(self):
  163.         return clean_lines(
  164.             re.sub(r'\n\n+', "\n\n", str(self.stack[0].text + "\n"))
  165.         )
  166.  
  167. class TextWebCommand(sublime_plugin.TextCommand):
  168.     def name(self):
  169.         return "textweb"
  170.     def run(self, edit):
  171.         selection = self.view.sel()
  172.         if not len(selection):
  173.             return # TODO: Prompt for a web address?
  174.         else:
  175.             for region in selection:
  176.                 view = sublime.active_window().new_file()
  177.                 self.load_url(view, self.view.substr(region))
  178.     def load_url(self, view, url):
  179.         print("Loading url:", url)
  180.         content = urllib.request.urlopen(url).read().decode("utf-8")
  181.         parser = TextWebHTMLParser()
  182.         parser.feed(content)
  183.         parser.close()
  184.         # https://stackoverflow.com/a/43582267/3478907
  185.         view.run_command("append", {
  186.             "characters": "Textweb " + url + "\n" + parser.get_content(),
  187.             "scroll_to_end": False
  188.         })
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement