Advertisement
here2share

# listlinks.py -- almost a webcrawler

Jul 23rd, 2015
343
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.40 KB | None | 0 0
  1. # listlinks.py -- almost a webcrawler
  2.  
  3. import urllib   #urllib
  4. import re   #regex
  5. keyurl=r'http://www.deviantart.com/'   # Example URL: the start page handed to listlinks() by the call at the bottom of this script
  6.  
  7. def listlinks(keyURL):
  8.     url=keyURL
  9.     def trim(target):
  10.         try:
  11.             while target[-1] in '/\\':
  12.                 target=target[:-1]
  13.         except: pass
  14.         return target
  15.     url=trim(url)
  16.     addlinks=[]   # This is a list of all useful urls to keep
  17.     external=[]
  18.     html=urllib.urlopen(url).read()
  19.     def filterHTML():
  20.         links=re.findall(r'<a href=\"(.*?)\"\s',html)
  21.         return sorted(list(set(links)))
  22.     links=filterHTML()
  23.     for x in xrange(len(links)-1):  # loops over filters
  24.         for i in list('''"'#'''):
  25.             if i in links[x]:
  26.                 links[x]=None
  27.                 break
  28.         if links[x]:
  29.             if links[x][0] == '/':  # internal link? If yes, then keep it and add full path
  30.                 links[x]=trim(url+links[x])
  31.                 if links[x] not in addlinks:
  32.                     addlinks.append(links[x])
  33.                     print links[x],'++'
  34.             elif links[x].startswith(url): # internal link
  35.                 links[x]=trim(links[x])
  36.                 if links[x] not in addlinks:
  37.                     addlinks.append(links[x])
  38.                     print links[x],'+'
  39.             elif links[x][:4] == 'java': # skip javascript
  40.                 next
  41.             elif links[x][:6] == 'mailto':
  42.                 next
  43.             else: # external link
  44.                 links[x]=trim(links[x])
  45.                 if links[x] not in external:
  46.                     external.append(links[x])
  47.     print '\n','$'*10, 'External Links'
  48.     for link in external:
  49.         print link,'*'
  50.     print '\n','$'*10, 'Done!'
  51. listlinks(keyurl)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement