import datetime
import urlparse
import socket
import scrapy

from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader

from ..items import TermsItem


class BasicSpider(scrapy.Spider):
    name = "basic"
    allowed_domains = ["web"]

    # Start on a property page (one URL per line in todo.urls.txt)
    start_urls = [i.strip() for i in open('todo.urls.txt').readlines()]

    def parse(self, response):
        # Create the loader using the response
        l = ItemLoader(item=TermsItem(), response=response)

        # Load fields using XPath expressions
        l.add_xpath('title', '//h1[@class="foo"]/span/text()',
                    MapCompose(unicode.strip, unicode.title))
        l.add_xpath('detail', '//*[@class="bar"]//text()',
                    MapCompose(unicode.strip))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()

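As a side note on the processors used above: MapCompose applies each function in turn to every extracted value and drops None results. A minimal standalone illustration (Python 2, matching the unicode usage in the spider; the sample strings are made up):

from scrapy.loader.processors import MapCompose

# Same processor chain as the 'title' field: strip whitespace, then title-case each value.
proc = MapCompose(unicode.strip, unicode.title)

print proc([u'  terms of use  ', u'privacy policy'])
# [u'Terms Of Use', u'Privacy Policy']
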
# items.py -- the TermsItem that the spider imports via ..items
from scrapy.item import Item, Field


class TermsItem(Item):
    # Primary fields
    title = Field()
    detail = Field()

    # Housekeeping fields
    url = Field()
    project = Field()
    spider = Field()
    server = Field()
    date = Field()

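Worth noting: with plain Field() declarations, ItemLoader leaves every loaded field as a list of collected values. If single strings per field are wanted, output processors can be declared on the fields. A sketch of that variant (an assumption about the desired output, not part of the original paste):

from scrapy.item import Item, Field
from scrapy.loader.processors import Join, TakeFirst


class TermsItem(Item):
    # Primary fields: join the stripped text fragments into one string per field
    title = Field(output_processor=Join())
    detail = Field(output_processor=Join())

    # Housekeeping fields: keep only the single value added in parse()
    url = Field(output_processor=TakeFirst())
    project = Field(output_processor=TakeFirst())
    spider = Field(output_processor=TakeFirst())
    server = Field(output_processor=TakeFirst())
    date = Field(output_processor=TakeFirst())
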
# Alternative for the 'url' housekeeping field: response.request.url is the URL of the
# request that generated the response, which (per the Scrapy docs) does not always
# equal response.url.
l.add_value('url', response.request.url)
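
Assuming the two modules above sit in a standard Scrapy project layout and todo.urls.txt holds one start URL per line, the spider is run with the stock Scrapy CLI, e.g. scrapy crawl basic -o items.json to write the loaded items out as JSON.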