Advertisement
Guest User

Untitled

a guest
Oct 30th, 2014
184
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.95 KB | None | 0 0
  1. class DemoSpider(CrawlSpider):
  2. name = 'sample_recursive'
  3. allowed_domains = ['www.example.org']
  4. start_urls = [
  5. "http://example.org"
  6. ]
  7.  
  8. rules = [Rule(SgmlLinkExtractor(allow=(r'/.org/site/ID/'), deny=(r'/.org$', r'/site/ID/home')), callback='parse_start_url', follow=True)]
  9. def parse_start_url(self, response):
  10. items = []
  11. item = DemoSampleItem()
  12. item["source_url"] = response.url
  13. item["title"] = response.xpath('//div[@class="content-title"]/h2/text()')[0].extract()
  14. item["breadcrumb"] = response.xpath("//ul[@class='breadcrumbs']")[0].extract()
  15. item["content"] = response.xpath("//div[@class='main_col']")[0].extract()
  16. item["right_col"] = response.xpath("//div[@class='right_col']").extract()
  17. item["left_col"] = response.xpath("//div[@class='left_col']")[0].extract()
  18. item["depth"] = response.meta.get('depth', 0)
  19. items.append(item)
  20.  
  21. return items
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement