Advertisement
UberMouse

Untitled

Feb 22nd, 2012
388
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.56 KB | None | 0 0
  1. import re
  2. import urllib2
  3.  
  4. __author__ = 'UberMouse'
  5.  
  6. class LinkExtractor:
  7.  
  8.     links = list()
  9.  
  10.     def __init__(self, page):
  11.         regex = r'<a.*href=?"(.[^ "]*).*>(.*)</a>'
  12.         match = None
  13.         while re.search(regex, page) is not None:
  14.             match = re.search(regex, page)
  15.             if match is not None:
  16.                 self.links.append((match.group(1), match.group(2)))
  17.             page = page.replace(match.group(), "")
  18.  
  19. print LinkExtractor(urllib2.urlopen('http://www.timeanddate.com/worldclock/astronomy.html?n=78').read()).links
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement