Advertisement
thiagobodruk

Python Crawler

May 20th, 2013
719
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.72 KB | None | 0 0
  1. #!/usr/bin/python
  2. import re, urllib
  3. links = []
  4. myurl = raw_input("\nWeb Crawler v1.0\nType your URL including 'http://' or 'ftp://'\nURL: ")
  5. mylog = raw_input("Type your log file name: ")
  6. log = open(mylog + '.txt','w+')
  7. log.write(myurl + '\n')
  8. def crawler(url):
  9.     print ''
  10.     links.append(url)
  11.     for i in re.findall('''href=["'](?!javascript:)(.[^"']+)["']''', urllib.urlopen(url).read(), re.I):
  12.         check = re.match('(?!http|ftp)', i);
  13.         if(check):
  14.             i = myurl + i
  15.         print i
  16.         links.append(i)
  17. try:
  18.     crawler(myurl)
  19.     links = list(set(links))
  20.     links.sort()
  21.     for n in links:
  22.         #print n
  23.         log.write(n +'\n')
  24.     log.close()
  25.     print "\n%i crawled links\nURL: %s" % (len(links)+1, myurl)
  26. except:
  27.     print "Error crawling: " + myurl
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement