Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class DemoSpider(CrawlSpider):
    """Recursively crawl example.org and emit one item per visited page.

    Links are followed according to ``rules``. The rule's callback is
    ``parse_start_url``, so the responses for ``start_urls`` and for every
    followed link are turned into items by the same method.
    """

    name = 'sample_recursive'
    # The bare domain also covers ``www.`` and any other subdomain. The
    # previous value ('www.example.org') did not cover the start URL's own
    # host, so extracted links to ``example.org`` were filtered as offsite.
    allowed_domains = ['example.org']
    start_urls = [
        "http://example.org",
    ]
    # NOTE(review): the dots in these patterns are unescaped, so ``.``
    # matches any character, and '/.org/' looks unlikely to occur in a real
    # URL. Patterns are left untouched; confirm the intended URLs.
    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=(r'/.org/site/ID/'),
                deny=(r'/.org$', r'/site/ID/home'),
            ),
            callback='parse_start_url',
            follow=True,
        ),
    ]

    @staticmethod
    def _first_or_none(response, query):
        """Return the first node matched by ``query``, extracted, or None."""
        matches = response.xpath(query)
        # Indexing an empty result would raise IndexError and abort the
        # callback, losing the whole item for pages that lack one section.
        return matches[0].extract() if matches else None

    def parse_start_url(self, response):
        """Build one ``DemoSampleItem`` describing ``response``.

        Args:
            response: The downloaded page.

        Returns:
            A one-element list holding the populated item. ``title``,
            ``breadcrumb``, ``content`` and ``left_col`` hold the first
            matching node, or None when the page has no such node.
            ``right_col`` holds the list of every matching node (possibly
            empty). ``depth`` is read from ``response.meta`` and defaults
            to 0.
        """
        first = self._first_or_none

        item = DemoSampleItem()
        item["source_url"] = response.url
        item["title"] = first(
            response, '//div[@class="content-title"]/h2/text()')
        item["breadcrumb"] = first(response, "//ul[@class='breadcrumbs']")
        item["content"] = first(response, "//div[@class='main_col']")
        # Unlike the other columns, every right-column match is kept.
        item["right_col"] = response.xpath("//div[@class='right_col']").extract()
        item["left_col"] = first(response, "//div[@class='left_col']")
        item["depth"] = response.meta.get('depth', 0)
        return [item]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement