Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python2
- # -*- coding:utf-8 -*-
- import httplib
- from pyquery import PyQuery as pq
- import re
# Patterns for pulling apart an absolute "http://host/path" redirect target:
# rHTTPHost captures the host part, rHTTPURI everything from the first "/"
# that follows the host.
rHTTPHost = re.compile(r"^http://([^/]+)")
rHTTPURI = re.compile(r"^http://[^/]+(/.*)$")


class HTTPConnection(object):
    """Keep-alive HTTP client that reconnects on errors and follows 301/302.

    Attributes:
        host: Host the current connection points at.
        conn: The underlying ``httplib.HTTPConnection`` object.
    """

    def __init__(self, host):
        """Create the client and point it at *host*."""
        self.conn = None
        self.reset(host)

    def reset(self, host):
        """Drop the current connection and start a fresh one to *host*.

        ``self.host`` is updated as well, so that a later reconnect (after an
        I/O error) goes to the host we were last redirected to rather than
        to the one the object was created with.
        """
        stale = getattr(self, "conn", None)
        if stale is not None:
            # Release the old socket instead of leaving it to the GC.
            stale.close()
        self.host = host
        self.conn = httplib.HTTPConnection(host)

    def _resolve_redirect(self, location):
        """Split a ``Location`` header value into ``(host, uri)``.

        Absolute ``http://`` URLs yield their own host and path (``"/"`` when
        the URL has no path); values starting with ``"/"`` are taken as a
        path on the current host.

        Raises:
            httplib.HTTPException: the header is missing or is of a form
                this client cannot follow (e.g. another scheme).
        """
        if not location:
            raise httplib.HTTPException("redirect without a Location header")
        host_match = rHTTPHost.match(location)
        if host_match:
            uri_match = rHTTPURI.match(location)
            # "http://host" with nothing after the host means the root path.
            return host_match.group(1), uri_match.group(1) if uri_match else "/"
        if location.startswith("/"):
            return self.host, location
        raise HTTPException_unsupported(location)

    def get_html(self, uri, method="GET", max_hops=10):
        """Request *uri* and return the response body.

        Args:
            uri: Path (plus query string) to request on the current host.
            method: HTTP method to send.
            max_hops: How many reconnects and redirects, combined, may be
                spent on this call before giving up.

        Returns:
            The body as returned by ``response.read()`` for a 200 response,
            or ``None`` for any status other than 200, 301 and 302.

        Raises:
            httplib.HTTPException: a redirect could not be followed, or
                *max_hops* was exhausted.
        """
        for _ in range(max_hops + 1):
            try:
                # The request itself can fail on a stale keep-alive socket,
                # so it is retried together with reading the response.
                self.conn.request(method, uri,
                                  headers={"Connection": "keep-alive"})
                response = self.conn.getresponse()
            except (IOError, httplib.HTTPException):
                self.reset(self.host)
                continue

            if response.status == 200:
                return response.read()
            if response.status in (301, 302):
                host, uri = self._resolve_redirect(
                    response.getheader("Location"))
                self.reset(host)
                continue

            # Any other status: the body is left unread, so replace the
            # connection to keep the next request from tripping over it.
            self.reset(self.host)
            return None

        raise httplib.HTTPException(
            "giving up on %r after %d attempts" % (uri, max_hops + 1))


def HTTPException_unsupported(location):
    """Build the error raised for a redirect target we cannot follow."""
    return httplib.HTTPException(
        "unsupported redirect target: %r" % (location,))
# Crawl script: list the movies tagged "2014" on m.douban.com, open each
# movie page, follow its links whose href contains "comment", and print
# every comment followed by its star rating (or the raw author line when no
# rating can be parsed out of it).
connection = HTTPConnection("m.douban.com")

# Captures a digit 0-5 that is followed by exactly one character and a
# closing parenthesis at the very end of the string.
rStar = re.compile(r'\(([0-5]).\)$')


def _fetch_doc(uri):
    """Fetch *uri* over the shared connection and parse it with PyQuery.

    Returns None when the fetch produced no body, so callers can skip the
    page instead of handing None to the parser.
    """
    html = connection.get_html(uri)
    if not html:
        return None
    return pq(html)


def _print_comments(page):
    """Print each comment found in the first ``#bd > .list`` of *page*."""
    lists = page("#bd > .list ")
    if not lists:
        return
    # The comments are the first .list element.
    for cmt_wrapper in pq(lists[0])(".item"):
        item = pq(cmt_wrapper)
        spans = item("span")
        if not spans:
            continue
        print(spans[0].text)
        mark = item("span.author").text()
        if mark:
            stars = rStar.findall(mark)
            # Fall back to the whole author line when it carries no rating.
            print(stars[0] if stars else mark)


douban = _fetch_doc('/movie/tag/movies?tag=2014')
if douban is None:
    raise SystemExit("could not fetch the movie listing from m.douban.com")

for mlink in douban(".movie-items.list").find("a"):
    movie_uri = pq(mlink).attr("href")
    if not movie_uri:
        continue
    movie = _fetch_doc(movie_uri)
    if movie is None:
        continue
    for clink in movie("#bd > .itm > a"):
        cmt_uri = pq(clink).attr("href")
        # Anchors without an href yield None; skip them along with links
        # that do not lead to a comment page.
        if not cmt_uri or "comment" not in cmt_uri:
            continue
        comment_page = _fetch_doc(cmt_uri)
        if comment_page is None:
            continue
        _print_comments(comment_page)
- # vim: ts=4 sw=4 sts=4 expandtab
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement