Advertisement
Guest User

Wooyun

a guest
Apr 21st, 2016
43
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.15 KB | None | 0 0
  1. import re
  2.  
  3. from scrapy.spiders import CrawlSpider, Rule
  4. from scrapy.linkextractors import LinkExtractor
  5. from scrapy.http import Request
  6.  
# Regex capturing the "<digits>-<digits>" run that follows a hyphen, e.g. the
# "2016-0123456" in "wooyun-2016-0123456". Used both on request URLs and on
# the report page's own link to pull out the vulnerability id.
RE_VULID = r'\-(\d+\-\d+)'
  8.  
  9.  
  10. class WooyunSpider(CrawlSpider):
  11.     name = 'wooyun'
  12.     allowed_domains = ['www.wooyun.org']
  13.     start_urls = ['http://www.wooyun.org/bugs/new_public/page/1']
  14.     rules = (
  15.         Rule(LinkExtractor(allow=(r'\/bugs\/new_public\/page\/\d+', ), )),
  16.         Rule(LinkExtractor(allow=(r'\/bugs\/wooyun\-\d+\-\d+', )), callback='parse_vul'),
  17.     )
  18.  
  19.  
  20.     def __init__(self, *args, **kwarg):
  21.         super(WooyunSpider, self).__init__(*args, **kwarg)
  22.         self.finished = set()
  23.  
  24.  
  25.     def make_requests_from_url(self, url):
  26.         match = re.findall(RE_VULID, url)
  27.         if match:
  28.             vulid, = match
  29.             if vulid in self.finished:
  30.                 return
  31.             else:
  32.                 self.finished.add(vulid)
  33.  
  34.         return Request(url, dont_filter=False)
  35.  
  36.  
  37.     def parse_vul(self, response):
  38.         item = {key: ''.join([text.strip() for text in extracted]) for key, extracted in {
  39.             'title': response.css('h3.wybug_title::text').re(ur'\t\t(\S+)'),
  40.             'vulid': response.xpath('//h3/a[starts-with(@href,"/bugs/wooyun")]/@href').re(RE_VULID),
  41.             'vendor': response.css('h3.wybug_corp a::text').extract(),
  42.             'author': response.css('h3.wybug_author a::text').extract(),
  43.             'submitted': response.css('h3.wybug_date::text').re('\t\t(\d+\-\d+\-\d+\s+\d+:\d+)'),
  44.             'published': response.css('h3.wybug_open_date::text').re('\t\t(\d+\-\d+\-\d+\s+\d+:\d+)'),
  45.             'detail': response.css('.wybug_detail').xpath('./node()').extract(),
  46.             'patch': response.css('.wybug_patch .detail').xpath('./node()').extract(),
  47.             'rank': response.css('.bug_result .detail').re(r'Rank[\s\S]?(\d*)'),
  48.             'description': response.css('p.detail.wybug_description::text').extract(),
  49.             'vultype': response.css('h3.wybug_type::text').re('\t\t(\S+)'),
  50.             'level': response.css('.bug_result .detailTitle + p.detail::text').re(ur'\uff1a(\S+)'),
  51.         }.iteritems()}
  52.  
  53.         yield item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement