Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# listlinks.py -- almost a webcrawler
import re  # regex
import urllib  # urllib

try:  # Python 3 keeps urlopen in urllib.request
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
# Example start page; it is passed to listlinks() at the bottom of the file.
keyurl = r'http://www.deviantart.com/'  # Example URL
def _trim(target):
    """Return target without its trailing '/' and backslash characters."""
    return target.rstrip('/\\')


def _extract_hrefs(html):
    """Return the unique href values found in html, in sorted order.

    NOTE: the pattern only recognises anchors written as ``<a href="..."``
    immediately followed by a whitespace character; anchors in any other
    form are not reported.
    """
    return sorted(set(re.findall(r'<a href=\"(.*?)\"\s', html)))


def _classify_links(base, hrefs):
    """Split hrefs into internal and external links relative to base.

    Returns a pair ``(internal, external)``.  ``internal`` is a list of
    ``(link, marker)`` tuples: marker ``'++'`` for hrefs starting with '/'
    (joined onto base) and ``'+'`` for hrefs that already start with base.
    ``external`` is a list of every other kept link.  Both lists keep
    first-seen order and contain no duplicates.
    """
    internal, external = [], []
    seen_internal, seen_external = set(), set()
    for href in hrefs:
        # Drop empty hrefs and anything containing a quote or a '#'.
        if not href or any(ch in href for ch in '"\'#'):
            continue
        if href.startswith('/'):
            link, marker = _trim(base + href), '++'
        elif href.startswith(base):
            link, marker = _trim(href), '+'
        elif href.startswith(('java', 'mailto')):
            # javascript: and mailto: targets are not pages.
            continue
        else:
            link = _trim(href)
            if link not in seen_external:
                seen_external.add(link)
                external.append(link)
            continue
        # Trimming can make two different hrefs identical, so de-duplicate
        # after normalising.
        if link not in seen_internal:
            seen_internal.add(link)
            internal.append((link, marker))
    return internal, external


def listlinks(keyURL):
    """Fetch keyURL and print the links found on that page.

    Internal links are printed first, one per line, followed by ``++`` (href
    started with '/' and was joined onto the trimmed keyURL) or ``+`` (href
    already started with the trimmed keyURL).  A ``$$$$$$$$$$ External
    Links`` banner follows, then every external link followed by ``*``, then
    a ``$$$$$$$$$$ Done!`` banner.  Each link is printed once.

    Args:
        keyURL: address of the page to scan; trailing '/' and backslash
            characters are ignored.

    Returns:
        None.  The result is only written to standard output.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    base = _trim(keyURL)

    response = urlopen(base)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    if not isinstance(html, str):
        # Python 3 hands back bytes; the href pattern needs text.
        html = html.decode('utf-8', 'replace')

    # Every extracted href is classified, including the last one in the
    # sorted list.
    internal, external = _classify_links(base, _extract_hrefs(html))

    for link, marker in internal:
        print('%s %s' % (link, marker))
    print('\n' + '$' * 10 + ' External Links')
    for link in external:
        print('%s *' % link)
    print('\n' + '$' * 10 + ' Done!')
# Crawl the example URL only when this file is run as a script, so that
# importing the module does not trigger a network request.
if __name__ == '__main__':
    listlinks(keyurl)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement