Advertisement
Guest User

Untitled

a guest
Dec 19th, 2014
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.57 KB | None | 0 0
  1. from twisted.internet import reactor
  2.  
  3. from scrapy import log, signals
  4. from scrapy.crawler import Crawler
  5. from scrapy.xlib.pydispatch import dispatcher
  6. import logging
  7.  
  8.  
  9. from external_links.spiders.test import MySpider
  10. from scrapy.utils.project import get_project_settings
  11. settings = get_project_settings()
  12.  
  13. #manually set settings here
  14. settings.overrides['ITEM_PIPELINES'] = {'external_links.pipelines.FilterPipeline':100,'external_links.pipelines.CsvWriterPipeline': 200}
  15. settings.overrides['DEPTH_LIMIT'] = 1
  16.  
  17. def stop_reactor():
  18. reactor.stop()
  19.  
  20. dispatcher.connect(stop_reactor, signal=signals.spider_closed)
  21. spider = MySpider()
  22. crawler = Crawler(settings)
  23. crawler.configure()
  24. crawler.crawl(spider)
  25. crawler.start()
  26. log.start(loglevel=logging.DEBUG)
  27. log.msg('reactor running...')
  28. reactor.run()
  29. log.msg('Reactor stopped...')
  30.  
  31. import re
  32. from scrapy.exceptions import DropItem
  33.  
  34. class FilterPipeline(object):
  35. def process_item(self, item, spider):
  36.  
  37. domain = re.search(r'(?:http://)?(?:www.)(.+)',item['start_url']).group(1)
  38.  
  39. if bool(re.match('^(http|www)',item['all_links']))==True and bool(re.search(domain,item['all_links']))==False:
  40. return item
  41. else:
  42. raise DropItem("not an outside url link" % item)
  43.  
  44.  
  45. import csv
  46.  
  47. class CsvWriterPipeline(object):
  48.  
  49. def __init__(self):
  50. self.csvwriter = csv.writer(open('items2.csv', 'wb'))
  51.  
  52. def process_item(self, item, spider): #item needs to be second in this list otherwise get spider object
  53. print 'This Never Printed'
  54. self.csvwriter.writerow([item['all_links'], item['current_url'], item['start_url']])
  55.  
  56. return item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement