Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Scrapy project settings for the "flaticontest" crawler.

# Be polite: identify the bot, slow down, and honour robots.txt.
USER_AGENT = ' *companyname* TUTORIAL BOT - (*myemail*) | No content Generated will be used - For Educational Purpose'
DOWNLOAD_DELAY = 5.0           # delay between consecutive requests
AUTOTHROTTLE_ENABLED = True    # let Scrapy adapt the delay to server load
HTTPCACHE_ENABLED = True       # cache responses so re-runs do not re-hit the site

BOT_NAME = 'flaticontest'
SPIDER_MODULES = ['flaticontest.spiders']
NEWSPIDER_MODULE = 'flaticontest.spiders'

# Directory the images pipeline writes downloaded files to.
IMAGES_STORE = '/home/scriptso/Desktop/flattetstn1'
ROBOTSTXT_OBEY = True

# NOTE(review): only the stock ImagesPipeline is enabled here; the
# CustomImageNamePipeline class is not registered -- confirm whether that
# is intended.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
import scrapy


class FlaticontestItem(scrapy.Item):
    """Container for the data scraped for a single icon."""

    # Filled in by the images pipeline with the download results.
    images = scrapy.Field()
    # URLs the images pipeline is asked to download.
    image_urls = scrapy.Field()
    title = scrapy.Field()
    # NOTE(review): spelled "pachName" here while the spider emits a
    # "pach-name" key; both spellings are kept as-is -- confirm which one
    # is intended before relying on this field.
    pachName = scrapy.Field()
    # Base file name the custom image pipeline uses when saving the image.
    image_name = scrapy.Field()
from scrapy.http import Request

try:
    from scrapy.pipelines.images import ImagesPipeline
except ImportError:  # older Scrapy releases only ship the scrapy.contrib path
    from scrapy.contrib.pipeline.images import ImagesPipeline
class FlaticontestPipeline(object):
    """Pass-through item pipeline: every item is handed on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, unmodified.
        """
        return item
class CustomImageNamePipeline(ImagesPipeline):
    """Images pipeline that stores each image under the item's ``image_name``."""

    def get_media_requests(self, item, info):
        """Build one download request per URL in ``item['image_urls']``.

        The item's ``image_name`` travels in ``request.meta`` so that
        ``file_path`` can use it later.  A missing name is passed on as
        ``None`` instead of raising ``KeyError``.

        Args:
            item: The scraped item (dict or ``scrapy.Item``).
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Returns:
            A list of ``Request`` objects; empty when there are no URLs.
        """
        image_name = item.get('image_name')
        # Selector ``.re()`` results are lists; use the first match so the
        # file is not named after the list's repr (e.g. "['laptop'].jpg").
        if isinstance(image_name, (list, tuple)):
            image_name = image_name[0] if image_name else None
        return [
            Request(url, meta={'image_name': image_name})
            for url in item.get('image_urls', [])
        ]

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path (relative to ``IMAGES_STORE``) for a download.

        Args:
            request: The download request created by ``get_media_requests``.
            response: The download response, when available.
            info: Pipeline bookkeeping object supplied by Scrapy.
            item: The originating item; newer Scrapy versions pass it as a
                keyword argument.  Accepted for compatibility, not used.

        Returns:
            ``'<image_name>.jpg'`` when a name was attached to the request,
            otherwise the parent pipeline's default path.
        """
        image_name = request.meta.get('image_name')
        if not image_name:
            # No usable name: fall back to the stock naming scheme rather
            # than failing the download with a KeyError.
            return super(CustomImageNamePipeline, self).file_path(
                request, response=response, info=info)
        return '%s.jpg' % image_name
import scrapy
from flaticontest.items import FlaticontestItem


class FltspiSpider(scrapy.Spider):
    """Spider for the paginated Flaticon "computing" icon listing."""

    name = "fltSpi"
    allowed_domains = ["flaticon.com"]
    # Listing pages 1 through 1999, built in a single expression.
    start_urls = [
        "http://www.flaticon.com/free-icons/computing_23394/" + str(num)
        for num in range(1, 2000)
    ]

    def parse(self, response):
        """Yield one dict per ``.icon`` element found on a listing page.

        Every value is the list of regex matches returned by the selector's
        ``.re()`` method, so a field with no match is an empty list.

        Args:
            response: The downloaded listing page.

        Yields:
            dict with the keys ``title``, ``image_urls``, ``pach-name`` and
            ``image_name``.
        """
        for icon in response.css('.icon'):
            images = icon.css('img')
            # The title doubles as the image name, so extract it only once.
            titles = images.re('title="(.*?)"')
            yield {
                'title': titles,
                'image_urls': images.re('set="(.*?) 4x'),
                # Non-greedy capture: stop at the attribute's own closing
                # quote instead of running on into later attributes.
                # The key spelling is kept because exported data uses it.
                'pach-name': icon.css('li').re('data-pack="(.*?)"'),
                # Separate list object so the two fields do not alias.
                'image_name': list(titles),
            }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement