Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
####
## In settings.py
####
BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'

# Splash rendering service endpoint (hosted Scrapinghub instance).
# (Removed the stale commented-out local-Docker URL that used to shadow this.)
SPLASH_URL = 'http://3prg5eay-splash.scrapinghub.com'

# Route requests through Splash. HttpAuthMiddleware (order 300) runs before
# SplashMiddleware (order 725), so the spider's http_user/http_pass are
# attached to the request that reaches the Splash host.
DOWNLOADER_MIDDLEWARES = {
    'scrapyjs.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
}

# Splash-aware dedup filter and cache storage so requests that differ only
# in their Splash arguments are not wrongly collapsed into one fingerprint.
DUPEFILTER_CLASS = 'scrapyjs.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapyjs.SplashAwareFSCacheStorage'

# Crawlera proxy service.
# SECURITY NOTE(review): hard-coded API key committed to source — this key
# should be rotated and loaded from an environment variable instead.
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '0ec13d2a4b264e1a85173e0beda6f2eb'

CONCURRENT_REQUESTS = 10
DOWNLOAD_TIMEOUT = 3600  # seconds; generous because Splash rendering is slow
- ####
- ## Spider
- ####
- import json
- import scrapy
- import requests
- import scrapyjs
- from tutorial.items import ProxyTestItem
class SplashSpider(scrapy.Spider):
    """Fetch whatismyip.org through Splash (behind Crawlera) and scrape the
    IP the page reports, to verify the proxy setup works end to end."""

    # HTTP auth credentials for the hosted Splash instance; picked up by
    # HttpAuthMiddleware (enabled in settings.py).
    http_user = '5b0056be4659470ab4aaabfaf71239b1'
    http_pass = ''

    name = "proxy_test"
    download_delay = 2
    start_urls = [
        "http://whatismyip.org"
    ]

    def start_requests(self):
        """Yield one Splash-rendered request per start URL."""
        for url in self.start_urls:
            # BUG FIX: the original targeted the 'execute' endpoint, which
            # requires a 'lua_source' script argument and answers HTTP 400
            # without one. parse() only needs the rendered HTML, so the
            # 'render.html' endpoint (which takes just 'url') is correct.
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {
                        'url': url,
                    },
                },
            })

    def parse(self, response):
        """Extract the IP text shown on the page into a ProxyTestItem.

        NOTE(review): the absolute XPath is brittle — any markup change on
        the page breaks it; an id/class-based selector would be safer.
        """
        item = ProxyTestItem()
        item['test'] = response.xpath('/html/body/div[2]/span/text()').extract()
        yield item
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement