Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Apr 23rd, 2012  |  syntax: None  |  size: 3.16 KB  |  hits: 16  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. How to crawl the right way?
  2. require 'rubygems'
  3. require 'nokogiri'
  4. require 'open-uri'
  5. require 'rexml/document'
  6. require 'csv'
  7. include REXML
  8.        
  9. @urls = Array.new
  10. @ID = Array.new
  11. @titleSv = Array.new
  12. @titleEn = Array.new
  13. @identifier = Array.new
  14. @typeOfLevel = Array.new
  15.        
  16. htmldoc = Nokogiri::HTML(open('http://testnavet.skolverket.se/SusaNavExport/EmilExporter?GetEvent&EMILVersion=1.1&NotExpired&EEFormOfStudy=normal&EIAcademicType=UoH&SelectEI'))
  17.  
  18. htmldoc.xpath('//a/@href').each do |links|
  19.   @urls << links.content
  20. end
  21.        
  22. @urls.each do |url|
  23.   # Loop through the XML files and grab element nodes
  24.   xmldoc = REXML::Document.new(open(url).read)
  25.   # Root element
  26.   root = xmldoc.root
  27.   # Fetches the info id
  28.   @ID << root.attributes["id"]
  29.   # TitleSv
  30.   xmldoc.elements.each("/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]"){
  31.     |e| m = e.text
  32.         m = m.to_s
  33.         next if m.empty?
  34.         @titleSv << m
  35.   }
  36.        
  37. CSV.open("eduction_normal.csv", "wb") do |row|
  38.     (0..@ID.length - 1).each do |index|
  39.       row << [@ID[index], @titleSv[index], @titleEn[index], @identifier[index], @typeOfLevel[index], @typeOfResponsibleBody[index], @courseTyp[index], @credits[index], @degree[index], @preAcademic[index], @subjectCodeVhs[index], @descriptionSv[index], @lastedited[index], @expires[index]]
  40.     end
  41.   end
  42.        
  43. items = Set.new
  44.  
  45. doc.xpath('//a/@href').each do |url|
  46.   item = {}
  47.   item[:url] = url.content
  48.   items << item
  49. end
  50.  
  51. items.each do |item|
  52.   xml = Nokogiri::XML(open(item[:url]))
  53.  
  54.   item[:id] = xml.root['id']
  55.   ...
  56. end
  57.        
  58. xmldoc.elements.each("/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]"){
  59.     |e| m = e.text
  60.      m = m.to_s
  61.      next if m.empty?
  62.      @titleSv << m
  63. }
  64.        
  # Returns the text of the last element matching +path+ whose text is not
  # empty, or an empty string when nothing matches or every match is empty.
  #
  # xml  - an object exposing +elements.each(path)+ that yields nodes
  #        responding to +text+ (the surrounding examples pass a parsed XML
  #        document).
  # path - the XPath expression forwarded to +elements.each+.
  def get_value(xml, path)
    value = ''
    xml.elements.each(path) do |element|
      text = element.text.to_s # +text+ may be nil; to_s turns that into ''
      # An empty match must not overwrite a value that was already found.
      value = text unless text.empty?
    end
    value
  end
  74.        
  75. xml_paths = {
  76.   :title_sv => "/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]",
  77.   :title_en => "/educationInfo/titles/title[2] | /ns:educationInfo/ns:titles/ns:title[2]",
  78.   ...
  79. }
  80.        
  81. item[:title_sv] = get_value(xml, xml_paths[:title_sv])
  82. item[:title_en] = get_value(xml, xml_paths[:title_en])
  83.        
  84. require 'rubygems'
  85. require 'pioneer'
  86. require 'nokogiri'
  87. require 'rexml/document'
  88. require 'csv'
  89.  
  # Pioneer crawler that fetches the export index page and extracts the
  # target of every link found on it.
  class Links < Pioneer::Base
    include REXML

    # The single index page whose links are harvested.
    def locations
      ["http://testnavet.skolverket.se/SusaNavExport/EmilExporter?GetEvent&EMILVersion=1.1&NotExpired&EEFormOfStudy=normal&EIAcademicType=UoH&SelectEI"]
    end

    # Parses the response body as HTML and returns an array with the content
    # of every <a href="..."> attribute.
    # NOTE(review): assumes Pioneer invokes this once per fetched location and
    # that +req.response.response+ is the raw body - confirm against Pioneer.
    def processing(req)
      doc = Nokogiri::HTML(req.response.response)
      # The parsed page lives in +doc+; there is no +htmldoc+ in this scope.
      doc.xpath('//a/@href').map do |link|
        link.content
      end
    end
  end
  103.  
  # Pioneer crawler that visits every URL collected by Links, parses each
  # response as XML and appends one CSV row per matching title element.
  class Crawler < Pioneer::Base
    include REXML

    # URLs to crawl: the flattened result of running the Links crawler.
    def locations
      Links.new.start.flatten
    end

    # Parses the response body with REXML, reads the root element's "id"
    # attribute and, for each first <title> under <titles> (with or without
    # the ns: prefix), appends a row to eduction_normal.csv.
    # NOTE(review): assumes +req.response.response+ is the raw XML body -
    # confirm against Pioneer.
    def processing(req)
      xmldoc = REXML::Document.new(req.response.response)
      root = xmldoc.root
      id = root.attributes["id"]
      xmldoc.elements.each("/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]") do |e|
        title = e.text.to_s
        # Append mode so rows written for earlier documents are kept.
        CSV.open("eduction_normal.csv", "a") do |f|
          # Add any further columns to this row.
          f << [id, title]
        end
      end
    end
  end
  122.  
  123. Crawler.start
  124. # or you can run 100 concurrent processes
  125. Crawler.start(concurrency: 100)