Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- from scrapy.spiders import CrawlSpider, Rule
- from scrapy.linkextractors import LinkExtractor
- from scrapy.http import Request
# Captures the "<digits>-<digits>" id that follows a hyphen, e.g.
# "2015-0123456" out of ".../bugs/wooyun-2015-0123456".
# Used both on request URLs and on hrefs scraped from report pages.
RE_VULID = r'\-(\d+\-\d+)'
class WooyunSpider(CrawlSpider):
    """Crawl the WooYun public bug listing and scrape each report page.

    Listing pages (``/bugs/new_public/page/<n>``) are only followed for
    pagination; report pages (``/bugs/wooyun-<digits>-<digits>``) are passed
    to :meth:`parse_vul`, which yields one dict per report.
    """

    name = 'wooyun'
    allowed_domains = ['www.wooyun.org']
    start_urls = ['http://www.wooyun.org/bugs/new_public/page/1']

    rules = (
        # Pagination: no callback, so (per Scrapy's Rule defaults) the
        # extracted links are simply followed.
        Rule(LinkExtractor(allow=(r'\/bugs\/new_public\/page\/\d+', ), )),
        # Individual reports: parsed by parse_vul.
        Rule(LinkExtractor(allow=(r'\/bugs\/wooyun\-\d+\-\d+', )), callback='parse_vul'),
    )

    def __init__(self, *args, **kwarg):
        """Initialise the spider and the set of already-requested report ids."""
        super(WooyunSpider, self).__init__(*args, **kwarg)
        # Report ids (the RE_VULID capture) for which a Request was built.
        self.finished = set()

    def make_requests_from_url(self, url):
        """Build the Request for *url*, skipping report ids already seen.

        NOTE(review): in Scrapy this hook is presumably only invoked by the
        default ``start_requests()`` for ``start_urls`` and is deprecated in
        newer releases -- confirm against the installed Scrapy version.

        Args:
            url: URL to request.

        Returns:
            A ``Request`` for *url*, or ``None`` when *url* contains a report
            id that is already recorded in ``self.finished``.
        """
        # Use search() and keep the first id: unpacking the findall() result
        # into a single name raises ValueError as soon as the pattern matches
        # more than once in the URL.
        match = re.search(RE_VULID, url)
        if match:
            vulid = match.group(1)
            if vulid in self.finished:
                return None
            self.finished.add(vulid)
        return Request(url, dont_filter=False)

    def parse_vul(self, response):
        """Scrape one report page into a flat dict of strings.

        Each field is the concatenation of the whitespace-stripped fragments
        returned by its selector, so a selector that matches nothing produces
        an empty string.

        Args:
            response: Scrapy response for a report page.

        Yields:
            dict: a single item with the keys ``title``, ``vulid``,
            ``vendor``, ``author``, ``submitted``, ``published``, ``detail``,
            ``patch``, ``rank``, ``description``, ``vultype`` and ``level``.
        """
        # Regexes are raw strings so that \d, \S, \s and \- reach the regex
        # engine untouched instead of relying on invalid string escapes.
        timestamp = r'\t\t(\d+\-\d+\-\d+\s+\d+:\d+)'
        fragments_by_field = {
            'title': response.css('h3.wybug_title::text').re(r'\t\t(\S+)'),
            'vulid': response.xpath('//h3/a[starts-with(@href,"/bugs/wooyun")]/@href').re(RE_VULID),
            'vendor': response.css('h3.wybug_corp a::text').extract(),
            'author': response.css('h3.wybug_author a::text').extract(),
            'submitted': response.css('h3.wybug_date::text').re(timestamp),
            'published': response.css('h3.wybug_open_date::text').re(timestamp),
            'detail': response.css('.wybug_detail').xpath('./node()').extract(),
            'patch': response.css('.wybug_patch .detail').xpath('./node()').extract(),
            'rank': response.css('.bug_result .detail').re(r'Rank[\s\S]?(\d*)'),
            'description': response.css('p.detail.wybug_description::text').extract(),
            'vultype': response.css('h3.wybug_type::text').re(r'\t\t(\S+)'),
            # \uff1a is the fullwidth colon; written as a u'' literal (not the
            # Python-2-only ur'') so the module parses on Python 2.7 and 3.
            'level': response.css('.bug_result .detailTitle + p.detail::text').re(u'\uff1a(\\S+)'),
        }
        # items() instead of the Python-2-only iteritems().
        yield {
            field: ''.join(text.strip() for text in fragments)
            for field, fragments in fragments_by_field.items()
        }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement