Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import urllib2
- __author__ = 'UberMouse'
class LinkExtractor:
    """Collect the anchors found in a chunk of HTML text.

    After construction, ``links`` is a list with one ``(href, text)`` tuple
    for every ``<a ... href="...">text</a>`` element matched in ``page``,
    in the order the anchors appear. The list belongs to the instance, so
    separate extractors never see each other's results.

    This is a lightweight regex scan, not an HTML parser: matching is
    case-sensitive, only double-quoted ``href`` values are recognised, and
    the link text must sit on a single line.
    """

    # The quantifiers are non-greedy (and the attribute parts are confined
    # to a single tag via ``[^>]``) so that several anchors on one line are
    # matched individually instead of being swallowed by one giant match.
    # Group 1 is the href up to the first space or closing quote; group 2 is
    # the raw text between the opening tag and the nearest ``</a>``.
    _ANCHOR_RE = re.compile(r'<a\b[^>]*?href=?"([^ "]+)[^>]*>(.*?)</a>')

    def __init__(self, page):
        """Scan ``page`` and store the extracted links on ``self.links``.

        Args:
            page: HTML source to scan, as a string.
        """
        # With two capture groups, findall() returns (group 1, group 2)
        # tuples in document order, in a single pass over the page.
        self.links = self._ANCHOR_RE.findall(page)
# Download the timeanddate.com astronomy page and print every
# (href, text) pair that LinkExtractor finds in it.
print (
    LinkExtractor(
        urllib2.urlopen(
            'http://www.timeanddate.com/worldclock/astronomy.html?n=78'
        ).read()
    ).links
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement