Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json, pdb, logging, re, requests
- from scrapy import Request
- from scrapy.spiders.init import InitSpider
class aCrawler(InitSpider):
    """Spider that logs in to myweb.com before crawling provider listings.

    Credentials are supplied on the command line:
        scrapy crawl crawler -a user=<email> -a password=<secret>

    Flow: InitSpider calls init_request() first (POST login as JSON),
    check_login_response() verifies the login JSON and releases the
    start_urls, parse() maps the category page to the site's AJAX
    listing endpoint, and parse_providers() receives that response.
    """

    name = "crawler"
    allowed_domains = ["myweb.com"]
    start_urls = [
        'http://myweb.com/providers',
    ]
    # Hard-coded site identifiers: province 217, and the category slugs
    # this spider knows how to map to the site's numeric category ids.
    provinceId = 217
    category = {
        'quan-an': 3,
    }
    login_page = 'http://myweb.com/dang-nhap'

    def __init__(self, user=None, password=None, *args, **kwargs):
        super(aCrawler, self).__init__(*args, **kwargs)
        self.password = password
        self.user = user
        # BUGFIX: the old message concatenated user/password directly, which
        # raised TypeError when either -a argument was omitted (None), and it
        # wrote the plaintext password into the log.  Mask the secret.
        msg = 'The account will be used %s %s' % (
            user, '***' if password else '<no password>')
        self.log(msg, level=logging.INFO)

    def init_request(self):
        """Called by InitSpider before crawling starts: submit the login."""
        # The site expects the credentials as a JSON body, not form data.
        msg = {'email': self.user, 'password': self.password,
               'reCaptchaResponse': '', 'rememberMe': 'true'}
        headers = {'Host': 'www.myweb.com',
                   'Accept': 'application/json, text/javascript, */*; q=0.01',
                   'Accept-Encoding': 'gzip, deflate',
                   'Accept-Language': 'en-US,vi;q=0.5',
                   'Referer': 'http://www.myweb.com',
                   'X-Requested-With': 'XMLHttpRequest',
                   'Content-Type': 'application/json'}
        yield Request(self.login_page, method='POST', body=json.dumps(msg),
                      headers=headers,
                      callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        if json.loads(response.body)['isSuccess']:
            self.log("Successfully logged in!", level=logging.INFO)
            # Manually schedule the start_urls now that the session cookie
            # is set; initialized() flags the spider as ready.
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
            self.initialized()
        else:
            self.log("Can't login", level=logging.ERROR)

    def parse(self, response):
        """Map a category page URL to the site's AJAX listing endpoint."""
        page_name_ = re.sub('http://myweb.com/ho-chi-minh/', '', response.url)
        category_id = self.category[page_name_]
        # BUGFIX: the old triple-quoted string embedded a leading newline and
        # indentation into the request URL; build it on one line instead.
        url_ = ('http://myweb.com/ho-chi-minh/dia-diem?ds=Restaurant&vt=row'
                '&st=1&provinceId={}&categoryId={}&append=false'
                .format(self.provinceId, category_id))
        headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
                   'Accept-Encoding': 'gzip, deflate',
                   'Accept-Language': 'en-US,vi;q=0.5',
                   'Referer': response.url,
                   'X-Requested-With': 'XMLHttpRequest'}
        # BUGFIX: callback pointed at self.parse_provider, which does not
        # exist (the method below is parse_providers) -> AttributeError.
        yield Request(url=url_, headers=headers,
                      callback=self.parse_providers)

    def parse_providers(self, response):
        """Callback for the provider-listing request issued by parse().

        BUGFIX: the original body referenced undefined names (url_, headers)
        and re-queued start_urls, raising NameError on first call.  The
        response is only logged until real extraction logic is added.
        """
        self.log('parse_provider')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement