Advertisement
Drakia

Untitled

Apr 25th, 2012
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Ruby 4.78 KB | None | 0 0
  1. require 'rubygems'
  2. require 'nokogiri'
  3. require 'open-uri'
  4. require 'mysql'
  5.  
  6. # Load the config file
  7. require_relative 'lib/config.rb'
  8.  
  9. def fileUpdate(fUrl)
  10.   doc = Nokogiri::HTML(open(fUrl))
  11.   file = Hash.new
  12.  
  13.   file["name"] = doc.at_css('.main-body').at_css('h1').inner_text().strip()
  14.   file["changelog"] = doc.at_css('.content-box').at_css('p')
  15.   if (file["changelog"] != nil)
  16.     file["changelog"] = file["changelog"].text
  17.   else
  18.     file["changelog"] = ""
  19.   end
  20.   factBox = doc.at_css('.standard-date').parent().parent()
  21.   file["date"] = doc.at_css('.standard-date')['data-epoch']
  22.   file["url"] = factBox.at_css('a')['href']
  23.   file["filename"] = factBox.at_css('a').text
  24.   file["size"] = factBox.css('dd')[3].text
  25.   file["status"] = factBox.at_css('.file-status').text
  26.   file["type"] = factBox.at_css('.file-type').text
  27.   file["md5"] = factBox.css('dd')[4].text
  28.   file["downloads"] = Integer(factBox.css('dd')[7].text.gsub(/[^\d]/, ''))
  29.  
  30.   # List of builds
  31.   file["builds"] = Array.new
  32.   factBox.at_css('.comma-separated-list').css('li').each do |build|
  33.     file["builds"].push(build.text)
  34.   end
  35.   return file
  36. end
  37.  
  38. def pluginUpdate(slug)
  39.   pUrl = CONFIG[:site] + slug + "/files/"
  40.   doc = Nokogiri::HTML(open(pUrl))
  41.   files = Array.new
  42.   regEx = /\/files\/(.*)/
  43.   # Check if there are no files, return empty array if so
  44.   if (doc.at_css('.listing-none-found') != nil)
  45.     return files
  46.   end
  47.  
  48.   # Figure out if we're paginated
  49.   page = 1
  50.   lastPage = 1
  51.   pagination = doc.at_css('.listing-pagination')
  52.   if (pagination.at_css('a') != nil)
  53.     lastPage = Integer(pagination.at_css('.listing-pagination-pages-next').previous_sibling().text)
  54.   end
  55.   # Loop through the pages/files, fetch file info
  56.   while(page <= lastPage)
  57.     doc.css('td.col-file').each do |file|
  58.       fUrl = pUrl + regEx.match(file.at_css('a')['href'])[1]
  59.       files.push(fileUpdate(fUrl))
  60.       sleep(CONFIG[:sleep])
  61.     end
  62.     page = page + 1
  63.     if (page <= lastPage)
  64.       doc = Nokogiri::HTML(open("#{pUrl}?page=#{page}"))
  65.     end
  66.   end
  67.   puts files
  68.   return files
  69. end
  70.  
  71. def devBukkitUpdate(startPage, quick)
  72.   slugRegEx = /\/server-mods\/(.*?)\//
  73.   categoryRegEx = /.*?category=(.*)/
  74.  
  75.   page = startPage
  76.   lastPage = nil
  77.   # TODO: Get last update time if quick
  78.   lastUpdate = nil
  79.  
  80.   plugins = Array.new
  81.   while (lastPage == nil || page <= lastPage)
  82.     # Sort by name, so as to not miss plugins updated while we're running
  83.     # We only scrape the list first, to hopefully not miss any plugins
  84.     if (quick)
  85.       pageUrl = "#{CONFIG[:site]}?page=#{page}"
  86.     else
  87.       pageUrl = "#{CONFIG[:site]}?page=#{page}&sort=name"
  88.     end
  89.     puts "Scraping #{pageUrl}"
  90.     doc = Nokogiri::HTML(open(pageUrl))
  91.     # Get the maximum page count
  92.     if (lastPage == nil)
  93.       lastPage = Integer(doc.at_css('.listing-pagination-pages').at_css('.listing-pagination-pages-next').previous_sibling().text)
  94.       #lastPage = 1
  95.     end
  96.    
  97.     doc.css('.row-joined-to-next').each do |plugin|
  98.       pEntry = Hash.new
  99.       pEntry["updated"] = plugin.at_css('.col-date').at_css('.standard-date')['data-epoch']
  100.      
  101.       # Quick mode will stop scraping once we hit the last checked plugin
  102.       if (lastUpdate != nil && pEntry["updated"] < lastUpdate)
  103.         page = lastPage
  104.         break
  105.       end
  106.      
  107.       pEntry["name"] = plugin.at_css('.col-project').text
  108.       pEntry["slug"] = slugRegEx.match(plugin.at_css('.col-project').at_css('a')['href'])[1]
  109.       pEntry["stage"] = plugin.at_css('.col-status').text
  110.       pEntry["summary"] = plugin.next_sibling().at_css('.summary').inner_html
  111.       pEntry["downloads"] = 0
  112.       icon = plugin.at_css('.col-icon').at_css('a')
  113.       if (icon)
  114.         pEntry["banner"] = icon.at_css('img')['data-full-src']
  115.       else
  116.         pEntry["banner"] = ""
  117.       end
  118.      
  119.       # Fetch authors
  120.       pEntry["authors"] = Array.new
  121.       authorlist = plugin.at_css('.col-user')
  122.       authorlist.css('a').each do |author|
  123.         pEntry["authors"].push(author.text)
  124.       end
  125.      
  126.       # Fetch categories
  127.       pEntry["categories"] = Array.new
  128.       catlist = plugin.at_css('.col-category')
  129.       catlist.css('a').each do |category|
  130.         cat = Hash.new
  131.         cat["name"] = category.text
  132.         cat["slug"] = categoryRegEx.match(category['href'])[1]
  133.         cat["description"] = category['title']
  134.         pEntry["categories"].push(cat)
  135.       end
  136.       plugins.push(pEntry)
  137.       sleep(CONFIG[:sleep])
  138.     end
  139.     page = page + 1
  140.   end
  141.  
  142.   # Now we update the plugin files
  143.   plugins.each do |plugin|
  144.     plugin["files"] = pluginUpdate(plugin["slug"])
  145.     plugin["files"].each do |file|
  146.       plugin["downloads"] = plugin["downloads"] + file["downloads"]
  147.     end
  148.   end
  149.  
  150.   # TODO: Store the plugins in the database
  151.   return true
  152. end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement