Advertisement
Guest User

Untitled

a guest
Oct 23rd, 2014
140
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.96 KB | None | 0 0
  1. #!/usr/bin/env python2
  2. # -*- coding:utf-8 -*-
  3. import httplib
  4. from pyquery import PyQuery as pq
  5. import re
  6.  
  7. rHTTPHost = re.compile(r"^http://([^/]+)")
  8. rHTTPURI = re.compile(r"^http://[^/]+(/.*)$")
  9.  
  10.  
  11. class HTTPConnection(object):
  12.  
  13.     def __init__(self, host):
  14.         self.host = host
  15.         self.conn = httplib.HTTPConnection(host)
  16.  
  17.     def reset(self, host):
  18.         self.conn = httplib.HTTPConnection(host)
  19.  
  20.     def get_html(self, uri, method="GET"):
  21.         self.conn.request(method, uri, headers={"Connection": "keep-alive"})
  22.         try:
  23.             r = self.conn.getresponse()
  24.         except (IOError, httplib.HTTPException):
  25.             self.reset(self.host)
  26.             return self.get_html(uri, method)
  27.  
  28.         if r.status == 200:
  29.             return r.read()
  30.         elif r.status in (301, 302):
  31.             l = r.getheader("Location")
  32.             host = rHTTPHost.findall(l)[0]
  33.             uri = rHTTPURI.findall(l)[0]
  34.             self.reset(host)
  35.             return self.get_html(uri, method)
  36.  
  37.  
  38. connection = HTTPConnection("m.douban.com")
  39. _html = connection.get_html('/movie/tag/movies?tag=2014')
  40. douban = pq(_html)
  41.  
  42. rStar = re.compile(r'\(([0-5]).\)$')
  43.  
  44. for mlink in douban(".movie-items.list").find("a"):
  45.     movie_uri = pq(mlink).attr("href")
  46.  
  47.     _html = connection.get_html(movie_uri)
  48.     movie = pq(_html)
  49.  
  50.     for clink in movie("#bd > .itm > a"):
  51.         cmt_uri = pq(clink).attr("href")
  52.         if "comment" not in cmt_uri:
  53.             continue
  54.  
  55.         _html = connection.get_html(cmt_uri)
  56.         # commets is the first .list element
  57.         comments = pq(_html)("#bd > .list ")[0]
  58.         for cmt_wrapper in pq(comments)(".item"):
  59.             print pq(cmt_wrapper)("span")[0].text
  60.             mark = pq(cmt_wrapper)("span.author").text()
  61.             if mark:
  62.                 try:
  63.                     print rStar.findall(mark)[0]
  64.                 except:
  65.                     print mark
  66.  
  67. # vim: ts=4 sw=4 sts=4 expandtab
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement