Advertisement
Guest User

Untitled

a guest
Jun 28th, 2016
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.73 KB | None | 0 0
  1. import re
  2. from scrapy import Spider
  3. from scrapy.selector import Selector
  4.  
  5. from stack.items import StackItem
  6.  
  7.  
  8. class StackSpider(Spider):
  9. name = "stack"
  10. allowed_domains = ["pokepedia.fr"]
  11. start_urls = [
  12. "http://www.pokepedia.fr/Pikachu",
  13. ]
  14.  
  15. def unicodize(seg):
  16. if re.match(r'\u[0-9a-f]{4}', seg):
  17. return seg.decode('unicode-escape')
  18.  
  19. return seg.decode('utf-8')
  20.  
  21. def parse(self, response):
  22. pokemon = Selector(response).xpath('//*[@id="mw-content-text"]/table[2]')
  23.  
  24. for question in pokemon:
  25. item = StackItem()
  26. item['title'] = question.xpath(
  27. '//*[@id="mw-content-text"]/table[2]/tbody/tr[1]/th[2]/text()').extract()[0]
  28. yield item
  29.  
  30. scrapy crawl stack -o items.json -t json
  31.  
  32. [
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement