Advertisement
Guest User

Untitled

a guest
Nov 27th, 2014
172
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.21 KB | None | 0 0
  1. def check_domains(url):
  2. global num_websites,domain_queue,domains,doc_queue,stanford_tagger
  3. the_domain = re.match(r'^(:?https?://[^.]*.)?([^/#?&]+).*$',url)
  4. if the_domain is not None:
  5. if the_domain.groups(0)[1] not in domains.keys():
  6. domains[the_domain.groups(0)[1]] = website(doc_queue,the_domain.groups(0)[1])
  7. domains[the_domain.groups(0)[1]].add_initial_url(url)
  8. domain_queue.append(domains[the_domain.groups(0)[1]])
  9. num_websites = num_websites + 1
  10. else:
  11. domains[the_domain.groups(0)[1]].add_url(url)
  12.  
  13. File "web_crawler.py", line 178, in getdoc
  14. check_domains(check)
  15. File "web_crawler.py", line 133, in check_domains
  16. the_domain = re.match(r'^(:?https?://[^.]*.)?([^/#?&]+).*$',url)
  17. File "/usr/local/lib/python2.7/re.py", line 137, in match
  18. return _compile(pattern, flags).match(string)
  19. TypeError: expected string or buffer
  20.  
  21. >>> def check_domains(url):
  22. ... the_domain = re.match(r'^(:?https?://[^.]*.)?([^/#?&]+).*$',url) #right here
  23. ... if the_domain is not None:
  24. ... print the_domain.groups(0)[1]
  25. ... else:
  26. ... print "NOOOO!!!!!"
  27. ...
  28. >>>
  29. >>> check_domains("http://www.hulu.com/watch/6704")
  30. hulu.com
  31. >>> check_domains("https://docs.python.org/2/library/datetime.html")
  32. python.org
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement