Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import urllib2

# NOTE(review): "WEBSITE" looks like a placeholder -- replace it with the full
# URL (including the scheme) of the page to download.
website = "WEBSITE"

# Download the page and print its raw HTML.
openwebsite = urllib2.urlopen(website)
try:
    # The response object is `openwebsite`; the original read from an
    # undefined name (`getwebsite`), which raised NameError.
    html = openwebsite.read()
finally:
    # Release the connection even if the read fails.
    openwebsite.close()
print(html)
from BeautifulSoup import BeautifulSoup
import urllib2
import re

# Download the page and hand the response to BeautifulSoup for parsing.
html_page = urllib2.urlopen("http://www.yourwebsite.com")
soup = BeautifulSoup(html_page)

# Print the href attribute of every anchor tag in the document.
for link in soup.findAll('a'):
    print(link.get('href'))

# Select only the anchors whose href begins with "http://". The original
# evaluated this expression and discarded the result; it is bound to a name
# here so the filtered list can actually be used.
absolute_links = soup.findAll('a', attrs={'href': re.compile("^http://")})
try:
    from HTMLParser import HTMLParser  # module name used by the original
except ImportError:
    from html.parser import HTMLParser  # same class under its newer name


class MyHTMLParser(HTMLParser):
    """Parser that prints the ``href`` attribute of every anchor start tag."""

    def handle_starttag(self, tag, attrs):
        """Print ``href = <value>`` for each href attribute of an ``a`` tag.

        Args:
            tag: Name of the start tag being handled.
            attrs: Iterable of ``(name, value)`` pairs, one per attribute
                on the tag.
        """
        # Only parse the 'anchor' tag.
        if tag != "a":
            return
        # Check the list of defined attributes.
        for name, value in attrs:
            # If href is defined, print it.
            if name == "href":
                # One pre-formatted string keeps the output identical whether
                # print is a statement or a function.
                print("%s = %s" % (name, value))
# Feed an HTML document to the parser.
# NOTE(review): `your_html_string` is not defined anywhere in this snippet --
# presumably a placeholder for the HTML text to scan; define it before running.
parser = MyHTMLParser()
parser.feed(your_html_string)
import BeautifulSoup

# NOTE(review): `html` is not defined within this snippet -- presumably the
# page source obtained elsewhere; confirm it is set before running.
soup = BeautifulSoup.BeautifulSoup(html)

# Print the href attribute of every anchor tag in the parsed document.
for link in soup.findAll("a"):
    print(link.get("href"))
import urllib


def _extract_links(page):
    """Return the double-quoted strings following each "href" in *page*.

    This is a plain substring scan, not an HTML parser: after every
    occurrence of ``href`` it takes the text between the next pair of
    double quotes, and keeps it when it starts with ``http`` or ``www``.
    Scanning stops at the first "href" that has no complete double-quoted
    string after it.

    Args:
        page: Text of the document to scan.

    Returns:
        List of the matching strings, in document order.
    """
    links = []
    pos = 0
    while True:
        pos = page.find("href", pos)
        if pos < 0:
            break
        opening = page.find('"', pos)
        if opening < 0:
            # No quote left at all: the original kept re-finding the same
            # "href" here and never terminated.
            break
        closing = page.find('"', opening + 1)
        if closing < 0:
            break
        candidate = page[opening + 1:closing]
        # startswith needs a tuple to test several prefixes; the original
        # `"http" or "www"` evaluates to just "http".
        if candidate.startswith(("http", "www")):
            links.append(candidate)
        # Resume after the closing quote so an "href" inside the extracted
        # value is not scanned again.
        pos = closing + 1
    return links


# Download the page, making sure the connection is released afterwards.
response = urllib.urlopen("http://www.google.com")
try:
    test = response.read()
finally:
    response.close()

# Collect the links and print them one per line.
needlestack = _extract_links(test)
for item in needlestack:
    print(item)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement