Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
from twisted.internet import reactor
from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.xlib.pydispatch import dispatcher
import logging

from external_links.spiders.test import MySpider
from scrapy.utils.project import get_project_settings

# Load the project's settings.py, then override a few values for this run.
# NOTE(review): settings.overrides is the legacy (pre-1.0 Scrapy) API; the
# rest of this script (scrapy.log, Crawler(settings)) matches that vintage.
settings = get_project_settings()
settings.overrides['ITEM_PIPELINES'] = {
    'external_links.pipelines.FilterPipeline': 100,
    'external_links.pipelines.CsvWriterPipeline': 200,
}
settings.overrides['DEPTH_LIMIT'] = 1


def stop_reactor():
    """Stop the Twisted event loop; wired to the spider_closed signal."""
    reactor.stop()


# spider_closed fires when the crawl finishes; shut the reactor down then,
# otherwise reactor.run() below would block forever.
dispatcher.connect(stop_reactor, signal=signals.spider_closed)

spider = MySpider()
crawler = Crawler(settings)
crawler.configure()
crawler.crawl(spider)
crawler.start()

log.start(loglevel=logging.DEBUG)
log.msg('reactor running...')
reactor.run()  # blocks here until stop_reactor() is called
log.msg('Reactor stopped...')
- import re
- from scrapy.exceptions import DropItem
class FilterPipeline(object):
    """Keep only items whose link points OUTSIDE the crawl's own domain.

    An item passes when ``item['all_links']`` is an absolute link (starts
    with ``http`` or ``www``) that does not mention the domain extracted
    from ``item['start_url']``; anything else is dropped.
    """

    def process_item(self, item, spider):
        # Extract the bare domain from the start URL.  The scheme and the
        # "www." prefix are both optional (the original pattern required
        # "www" followed by any character, so a start_url without it made
        # re.search return None and the .group(1) call blow up).
        domain = re.search(r'(?:http://)?(?:www\.)?(.+)', item['start_url']).group(1)
        # Absolute link?  (relative links can't be outside links)
        is_absolute = re.match(r'^(http|www)', item['all_links']) is not None
        # re.escape keeps the dots in the domain literal; unescaped they
        # would match any character and over-match foreign domains.
        is_internal = re.search(re.escape(domain), item['all_links']) is not None
        if is_absolute and not is_internal:
            return item
        # "%s" was missing in the original format string, so the item was
        # silently dropped from the message (or raised TypeError).
        raise DropItem("not an outside url link: %s" % item)
- import csv
class CsvWriterPipeline(object):
    """Append every scraped item as one row of ``items2.csv``."""

    def __init__(self):
        # Text mode with newline='' is what the csv module requires on
        # Python 3 (the original 'wb' binary mode makes csv.writer fail);
        # keep the handle so close_spider can release it.
        self.csvfile = open('items2.csv', 'w', newline='')
        self.csvwriter = csv.writer(self.csvfile)

    def process_item(self, item, spider):
        # item must be the first parameter after self — Scrapy calls
        # process_item(item, spider), so swapping them would hand us the
        # spider object instead of the item.
        self.csvwriter.writerow([item['all_links'], item['current_url'], item['start_url']])
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook when the crawl ends; flush and close the
        # CSV file so no buffered rows are lost.
        self.csvfile.close()
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement