Advertisement
Guest User

Untitled

a guest
Apr 25th, 2017
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.46 KB | None | 0 0
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3.  
  4. from scrapy.spiders import CrawlSpider, Rule
  5. from scrapy.linkextractors import LinkExtractor
  6.  
  7. from wiki_spider.items import WikiSpiderItem
  8. import re
  9.  
  10.  
  11. class WikiSpider(CrawlSpider):
  12.     name = 'wiki_spider'
  13.  
  14.     start_urls = ['https://en.wikipedia.org/wiki/Minsk',
  15.                   'https://en.wikipedia.org/wiki/Python_(programming_language)']
  16.  
  17.     links_xpath = '(//div[@id="bodyContent"]/div/p/a)[position()<100]'
  18.     allow_re = '/wiki/' \
  19.                '(?!((File|Talk|Category|Portal|Special|Template' \
  20.                '|Template_talk|Wikipedia|Help|Draft):|Main_Page)).+'
  21.     compiled_allow_re = re.compile('/wiki/'
  22.                                    '(?!((File|Talk|Category|Portal|Special|Template'
  23.                                    '|Template_talk|Wikipedia|Help|Draft):|Main_Page)).+')
  24.  
  25.     rules = (
  26.         Rule(LinkExtractor(restrict_xpaths=links_xpath,
  27.                            deny='#.*',
  28.                            allow=allow_re),
  29.              callback='parse_item', follow=True),
  30.     )
  31.  
  32.     def parse_start_url(self, response):
  33.         return self.parse_item(response)
  34.  
  35.     def parse_item(self, response):
  36.         item = WikiSpiderItem()
  37.         item['url'] = response.url
  38.         item['links'] = [response.urljoin(link) for link in response.xpath(self.links_xpath).xpath('@href').extract()
  39.                          if self.compiled_allow_re.match(link)]
  40.         return item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement