Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def check_domains(url):
    """Register *url* under the domain extracted from it.

    The domain key is the second capture group of the URL pattern below
    (e.g. "http://www.hulu.com/watch/6704" yields "hulu.com").  The first
    time a domain is seen, a ``website`` object is created for it, given
    the URL via ``add_initial_url``, appended to ``domain_queue`` and
    counted in ``num_websites``.  For an already-known domain the URL is
    handed to that object's ``add_url``.

    Values that are not strings (for example ``None``) and strings the
    pattern does not match are ignored.

    Args:
        url: The URL to file under its domain.

    Returns:
        None.
    """
    # Only num_websites is rebound here; the other module-level names
    # (domains, domain_queue, doc_queue) are merely read or mutated.
    global num_websites

    try:
        # NOTE(review): "(:?" looks like a typo for "(?:" and the "." after
        # "[^.]*" is unescaped; the pattern is left untouched so that group
        # indices and the domain keys already in use stay the same.
        the_domain = re.match(r'^(:?https?://[^.]*.)?([^/#?&]+).*$', url)
    except TypeError:
        # re.match raises "TypeError: expected string or buffer" when the
        # caller passes a non-string; skip that value instead of aborting
        # the whole crawl.
        return

    if the_domain is None:
        return

    # Group index 1 of groups() is the second capture group: the domain.
    domain = the_domain.groups(0)[1]

    if domain not in domains:
        site = website(doc_queue, domain)
        domains[domain] = site
        site.add_initial_url(url)
        domain_queue.append(site)
        num_websites = num_websites + 1
    else:
        domains[domain].add_url(url)
- File "web_crawler.py", line 178, in getdoc
- check_domains(check)
- File "web_crawler.py", line 133, in check_domains
- the_domain = re.match(r'^(:?https?://[^.]*.)?([^/#?&]+).*$',url)
- File "/usr/local/lib/python2.7/re.py", line 137, in match
- return _compile(pattern, flags).match(string)
- TypeError: expected string or buffer
- >>> def check_domains(url):
- ... the_domain = re.match(r'^(:?https?://[^.]*.)?([^/#?&]+).*$',url) #right here
- ... if the_domain is not None:
- ... print the_domain.groups(0)[1]
- ... else:
- ... print "NOOOO!!!!!"
- ...
- >>>
- >>> check_domains("http://www.hulu.com/watch/6704")
- hulu.com
- >>> check_domains("https://docs.python.org/2/library/datetime.html")
- python.org
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement