Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import scrapy
- from scrapy.exceptions import CloseSpider
- # from scrapy.loader import ItemLoader
- # from tutorial.items import TutorialItem
class TutorialItem(scrapy.Item):
    """Container for the scraped quote data.

    The spider fills each field with a list of strings:
    Tags (tag labels), Authors (author names), Text (quote bodies).
    """

    Tags = scrapy.Field()
    Authors = scrapy.Field()
    Text = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com page by page, accumulating tags, authors,
    and quote texts into one shared TutorialItem and dumping it to tags.txt.

    The crawl ends when a page yields no authors (see _content_checker).
    """

    name = "quotes"
    # Shared accumulator for everything scraped across all pages.
    # NOTE(review): class-level mutable state survives across spider runs
    # in the same process — confirm this is intended.
    result = TutorialItem()
    urls = ['http://quotes.toscrape.com/page/1/']

    def start_requests(self):
        """Lazily yield one request per page: /page/1/, /page/2/, ...

        Scrapy consumes this generator on demand, so the otherwise
        unbounded page sequence is cut short when parse() raises
        CloseSpider.  (Replaces the original pattern of appending to
        self.urls while iterating it, which also grew the list forever.)
        """
        base = self.urls[0][:-2]  # strip trailing "1/" -> ".../page/"
        page = 1
        while True:
            yield scrapy.Request(url=base + str(page) + '/', callback=self.parse)
            page += 1

    def _tags_parse(self, tags):
        """Append every tag in *tags* to result['Tags'] (created on first use)."""
        if 'Tags' not in self.result:
            self.result['Tags'] = []
        self.result['Tags'].extend(tags)

    def _authors_parse(self, authors):
        """Append every author in *authors* to result['Authors']."""
        if 'Authors' not in self.result:
            self.result['Authors'] = []
        self.result['Authors'].extend(authors)

    def _text_parse(self, text):
        """Append every quote in *text* to result['Text']."""
        if 'Text' not in self.result:
            self.result['Text'] = []
        self.result['Text'].extend(text)

    def _content_checker(self, content):
        """Stop the whole crawl once a page has no scraped content."""
        if not content:
            raise CloseSpider('No more pages with contents!')

    def parse(self, response):
        """Extract tags, authors, and quote texts from one page, then
        persist the cumulative result to tags.txt.

        Raises CloseSpider (via _content_checker) on an empty page.
        """
        tags = set(response.xpath('//a[@class="tag"]//text()').getall())
        authors = set(response.xpath('//small[@class="author"]//text()').getall())
        text = response.xpath('//div[@class="col-md-8"]//span[@class="text"]//text()').getall()
        self._content_checker(authors)
        self._tags_parse(tags)
        self._authors_parse(authors)
        self._text_parse(text)
        # 'w' mode deliberately rewrites the file each page with the
        # cumulative result accumulated so far.
        with open('tags.txt', 'w') as f:
            f.write(str(self.result))
Advertisement
Add Comment
Please sign in to add a comment.