Advertisement
Guest User

Untitled

a guest
Oct 1st, 2014
180
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.32 KB | None | 0 0
  1. import sys
  2. import re
  3. import urllib2
  4. import urlparse
  5. tocrawl = set(["http://www.facebook.com/"])
  6. crawled = set([])
  7. keywordregex = re.compile('<metasname=["']keywords["']scontent=["'](.*?)["']s/>')
  8. linkregex = re.compile('<as*href=['|"](.*?)['"].*?>')
  9.  
  10. while 1:
  11. try:
  12. crawling = tocrawl.pop()
  13. print crawling
  14. except KeyError:
  15. raise StopIteration
  16. url = urlparse.urlparse(crawling)
  17. try:
  18. response = urllib2.urlopen(crawling)
  19. except:
  20. continue
  21. msg = response.read()
  22. startPos = msg.find('<title>')
  23. if startPos != -1:
  24. endPos = msg.find('</title>', startPos+7)
  25. if endPos != -1:
  26. title = msg[startPos+7:endPos]
  27. print title
  28. keywordlist = keywordregex.findall(msg)
  29. if len(keywordlist) > 0:
  30. keywordlist = keywordlist[0]
  31. keywordlist = keywordlist.split(", ")
  32. print keywordlist
  33. links = linkregex.findall(msg)
  34. crawled.add(crawling)
  35. for link in (links.pop(0) for _ in xrange(len(links))):
  36. if link.startswith('/'):
  37. link = 'http://' + url[1] + link
  38. elif link.startswith('#'):
  39. link = 'http://' + url[1] + url[2] + link
  40. elif not link.startswith('http'):
  41. link = 'http://' + url[1] + '/' + link
  42. if link not in crawled:
  43. tocrawl.add(link)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement