Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
"""Simple one-level web "crawler".

Prompts for a start URL and a log-file name, fetches that single page,
extracts every href link (skipping javascript: pseudo-links), resolves
relative links by naive prefixing with the start URL, and writes the
sorted, de-duplicated link list to <name>.txt.

NOTE(review): despite the original name `crawler`, the script never
recursed — it only processes the start page. That behavior is kept.
"""
import re
import urllib.request

# href="..." or href='...', rejecting javascript: pseudo-links.
# (Original pattern had a stray leading `.` inside the capture group that
# silently required 2+ characters; removed.)
_HREF_RE = re.compile(r'''href=["'](?!javascript:)([^"']+)["']''', re.I)


def extract_links(html, base_url):
    """Return the list of href targets found in *html*.

    Targets that do not start with http/ftp are treated as relative and
    prefixed with *base_url* — preserving the original script's naive
    resolution (it deliberately does not use urljoin).
    """
    found = []
    for target in _HREF_RE.findall(html):
        if not re.match('(?:http|ftp)', target):
            target = base_url + target
        found.append(target)
    return found


def crawl(url):
    """Fetch *url* and return [url] + every link found on that page."""
    # errors='replace' so a page with a wonky encoding cannot crash the run.
    html = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
    return [url] + extract_links(html, url)


def main():
    url = input("\nWeb Crawler v1.0\nType your URL including 'http://' or 'ftp://'\nURL: ")
    log_name = input("Type your log file name: ")
    try:
        links = sorted(set(crawl(url)))
    except (OSError, ValueError) as exc:
        # Narrowed from the original bare `except:` — only network/URL
        # failures are "crawl errors"; programming errors now surface.
        print("Error crawling: " + url)
        print("  (%s)" % exc)
        return
    # Open the log only after a successful crawl so the error path cannot
    # leak an open handle (the original opened it before the try and only
    # closed it on success).
    with open(log_name + '.txt', 'w') as log:
        log.write(url + '\n')
        for link in links:
            print(link)
            log.write(link + '\n')
    # `links` already contains the start URL, so len(links) is the true
    # count — the original's len+1 double-counted it.
    print("\n%i crawled links\nURL: %s" % (len(links), url))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement