Advertisement
Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
- import datetime
- import urlparse
- import socket
- import scrapy
- from scrapy.loader.processors import MapCompose, Join
- from scrapy.loader import ItemLoader
- from ..items import TermsItem
class BasicSpider(scrapy.Spider):
    """Scrape one item per property page listed in ``todo.urls.txt``.

    For each response, loads the primary fields (``title``, ``detail``)
    via XPath and attaches housekeeping/provenance fields (``url``,
    ``project``, ``spider``, ``server``, ``date``) to the item.
    """
    name = "basic"
    allowed_domains = ["web"]

    # Read the work list once at class-definition time. The context manager
    # closes the file; the original leaked the handle from a bare open().
    with open('todo.urls.txt') as _todo:
        start_urls = [line.strip() for line in _todo]

    def parse(self, response):
        """Parse a property page into a TermsItem and return it.

        Scrapy treats the returned loaded item as the scraped output
        for this response.
        """
        # Create the loader bound to this response
        loader = ItemLoader(item=TermsItem(), response=response)
        # Primary fields via XPath. str.strip/str.title replace the
        # Python-2-only `unicode` methods, which raise NameError on
        # Python 3 (required by current Scrapy releases).
        loader.add_xpath('title', '//h1[@class="foo"]/span/text()',
                         MapCompose(str.strip, str.title))
        loader.add_xpath('detail', '//*[@class="bar"]//text()',
                         MapCompose(str.strip))
        # Housekeeping fields: record where and when the item was scraped
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())
        return loader.load_item()
- from scrapy.item import Item, Field
class TermsItem(Item):
    """Item carrying the scraped terms plus provenance metadata."""

    # Primary fields extracted from the page content
    title = Field()
    detail = Field()

    # Housekeeping fields recording where and when the item was produced
    url = Field()
    project = Field()
    spider = Field()
    server = Field()
    date = Field()
# NOTE(review): the three lines below are detached fragments from the paste —
# duplicates of statements inside BasicSpider (plus an alternate that uses
# response.request.url instead of response.url). They reference names
# (`l`, `response`) that do not exist at module level and are not runnable
# here; they appear to be leftover tutorial excerpts, not live code.
- start_urls = [i.strip() for i in open('todo.urls.txt').readlines()]
- l.add_value('url', response.url)
- l.add_value('url', response.request.url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement