Advertisement
Guest User

Untitled

a guest
Jan 24th, 2016
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.86 KB | None | 0 0
  1. import json, pdb, logging, re, requests
  2. from scrapy import Request
  3. from scrapy.spiders.init import InitSpider
  4.  
  5. class aCrawler(InitSpider):
  6. name = "crawler"
  7. allowed_domains = ["myweb.com"]
  8. start_urls = [
  9. 'http://myweb.com/providers',
  10. ]
  11. provinceId = 217
  12. category = {
  13. 'quan-an': 3
  14. }
  15. login_page = 'http://myweb.com/dang-nhap'
  16.  
  17. def __init__(self, user=None, password=None, *args, **kwargs):
  18. super(aCrawler, self).__init__(*args, **kwargs)
  19. self.password = password
  20. self.user = user
  21. msg = 'The account will be used ' + user + ' ' + password
  22. self.log(msg, level=logging.INFO)
  23.  
  24. def init_request(self):
  25. """This function is called before crawling starts."""
  26. msg = {'email': self.user, 'password': self.password,
  27. 'reCaptchaResponse': '', 'rememberMe': 'true'}
  28. headers = {'Host': 'www.myweb.com',
  29. 'Accept': 'application/json, text/javascript, */*; q=0.01',
  30. 'Accept-Encoding': 'gzip, deflate',
  31. 'Accept-Language': 'en-US,vi;q=0.5',
  32. 'Referer': 'http://www.myweb.com',
  33. 'X-Requested-With': 'XMLHttpRequest',
  34. 'Content-Type': 'application/json'}
  35. yield Request(self.login_page, method='POST', body=json.dumps(msg), headers=headers,
  36. callback=self.check_login_response)
  37.  
  38. def check_login_response(self, response):
  39. """Check the response returned by a login request to see if we are
  40. successfully logged in.
  41. """
  42. if json.loads(response.body)['isSuccess']:
  43. self.log("Successfully logged in!", level=logging.INFO)
  44.  
  45. for url in self.start_urls:
  46. yield self.make_requests_from_url(url)
  47.  
  48. self.initialized()
  49. else:
  50. self.log("Can't login", level=logging.ERROR)
  51.  
  52. def parse(self, response):
  53. page_name_ = re.sub('http://myweb.com/ho-chi-minh/', '', response.url)
  54. category_id = self.category[page_name_]
  55. url_ = '''
  56. http://myweb.com/ho-chi-minh/dia-diem?ds=Restaurant&vt=row&st=1&provinceId={}&categoryId={}&append=false
  57. '''.format(self.provinceId, category_id)
  58. headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
  59. 'Accept-Encoding': 'gzip, deflate',
  60. 'Accept-Language': 'en-US,vi;q=0.5',
  61. 'Referer': response.url,
  62. 'X-Requested-With': 'XMLHttpRequest'}
  63. yield (Request(url=url_, headers=headers,
  64. callback=self.parse_provider))
  65.  
  66. def parse_providers(self, response):
  67. self.log('parse_provider')
  68.  
  69. for url in self.start_urls:
  70. yield self.make_requests_from_url(url)
  71.  
  72. yield (Request(url=url_, headers=headers,
  73. callback=self.parse_provider))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement