Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- from scrapy import Spider
- from scrapy.selector import Selector
- from stack.items import StackItem
class StackSpider(Spider):
    """Spider that scrapes a title cell from the Pikachu page on pokepedia.fr.

    For each table selected on the start page it yields one ``StackItem``
    whose ``title`` field holds the text of the second header cell of the
    table's first row.
    """

    name = "stack"
    allowed_domains = ["pokepedia.fr"]
    start_urls = [
        "http://www.pokepedia.fr/Pikachu",
    ]

    @staticmethod
    def unicodize(seg):
        r"""Return *seg* as decoded text.

        A segment that starts with a literal ``\uXXXX`` escape (backslash,
        ``u``, four lowercase hex digits) is decoded with the
        ``unicode-escape`` codec. Any other byte string is decoded as UTF-8,
        and any other text string is returned unchanged.

        Declared as a static method because it takes no ``self``; it can be
        called as ``StackSpider.unicodize(seg)`` or on an instance.
        """
        if isinstance(seg, bytes):
            # The backslash must be escaped in the pattern: a bare ``\u`` does
            # not match a literal backslash (and is rejected by Python 3's re).
            if re.match(br'\\u[0-9a-f]{4}', seg):
                return seg.decode('unicode-escape')
            return seg.decode('utf-8')
        if re.match(r'\\u[0-9a-f]{4}', seg):
            # Text has no ``decode``; round-trip through ASCII so the
            # unicode-escape codec can interpret the escape sequences while
            # any non-ASCII characters survive as escapes of their own.
            return seg.encode('ascii', 'backslashreplace').decode('unicode-escape')
        return seg

    def parse(self, response):
        """Yield a ``StackItem`` with the title found in each selected table.

        Tables without a matching header cell are skipped (with a log
        message) instead of aborting the crawl with an ``IndexError``.
        """
        tables = Selector(response).xpath('//*[@id="mw-content-text"]/table[2]')
        for table in tables:
            # Query relative to the current table rather than re-running an
            # absolute ``//`` search over the whole document. ``tbody`` is
            # optional because browsers add it to the DOM while the raw HTML
            # that Scrapy receives frequently does not contain it.
            titles = table.xpath(
                './tbody/tr[1]/th[2]/text() | ./tr[1]/th[2]/text()').extract()
            if not titles:
                self.log("No title cell found in table on %s" % response.url)
                continue
            item = StackItem()
            item['title'] = titles[0]
            yield item
- scrapy crawl stack -o items.json -t json
- [
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement