- How to crawl the right way?
- require 'rubygems'
- require 'nokogiri'
- require 'open-uri'
- require 'rexml/document'
- require 'csv'
- include REXML
- @urls = Array.new
- @ID = Array.new
- @titleSv = Array.new
- @titleEn = Array.new
- @identifier = Array.new
- @typeOfLevel = Array.new
- htmldoc = Nokogiri::HTML(open('http://testnavet.skolverket.se/SusaNavExport/EmilExporter?GetEvent&EMILVersion=1.1&NotExpired&EEFormOfStudy=normal&EIAcademicType=UoH&SelectEI'))
- htmldoc.xpath('//a/@href').each do |links|
- @urls << links.content
- end
- @urls.each do |url|
- # Loop through the XML files and grab element nodes
- xmldoc = REXML::Document.new(open(url).read)
- # Root element
- root = xmldoc.root
- # Fetch the info id
- @ID << root.attributes["id"]
- # TitleSv
- xmldoc.elements.each("/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]"){
- |e| m = e.text
- m = m.to_s
- next if m.empty?
- @titleSv << m
- }
- CSV.open("eduction_normal.csv", "wb") do |row|
- (0..@ID.length - 1).each do |index|
- row << [@ID[index], @titleSv[index], @titleEn[index], @identifier[index], @typeOfLevel[index], @typeOfResponsibleBody[index], @courseTyp[index], @credits[index], @degree[index], @preAcademic[index], @subjectCodeVhs[index], @descriptionSv[index], @lastedited[index], @expires[index]]
- end
- end
- items = Set.new
- doc.xpath('//a/@href').each do |url|
- item = {}
- item[:url] = url.content
- items << item
- end
- items.each do |item|
- xml = Nokogiri::XML(open(item[:url]))
- item[:id] = xml.root['id']
- ...
- end
- xmldoc.elements.each("/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]"){
- |e| m = e.text
- m = m.to_s
- next if m.empty?
- @titleSv << m
- }
# Returns the text of the last element matching +path+ whose text is non-empty.
#
# @param xml [#elements] object whose +elements.each(path)+ yields nodes that
#   respond to +text+ (the surrounding snippets pass a parsed XML document)
# @param path [String] XPath expression handed to +elements.each+
# @return [String] the matched text, or '' when nothing matches or every
#   matched node has nil/empty text
def get_value(xml, path)
  value = ''
  xml.elements.each(path) do |element|
    text = element.text.to_s
    # Check emptiness BEFORE storing: the earlier version assigned first and
    # then ran a no-op `next`, so an empty later match wiped a found value.
    value = text unless text.empty?
  end
  value
end
- xml_paths = {
- :title_sv => "/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]",
- :title_en => "/educationInfo/titles/title[2] | /ns:educationInfo/ns:titles/ns:title[2]",
- ...
- }
- item[:title_sv] = get_value(xml, xml_paths[:title_sv])
- item[:title_en] = get_value(xml, xml_paths[:title_en])
- require 'rubygems'
- require 'pioneer'
- require 'nokogiri'
- require 'rexml/document'
- require 'csv'
# Crawler for the export index page: collects the href of every link on it.
# NOTE(review): presumably Pioneer fetches each URL from +locations+ and calls
# +processing+ once per response - confirm against the pioneer gem README.
class Links < Pioneer::Base
  include REXML

  # @return [Array<String>] the single index URL to fetch
  def locations
    ["http://testnavet.skolverket.se/SusaNavExport/EmilExporter?GetEvent&EMILVersion=1.1&NotExpired&EEFormOfStudy=normal&EIAcademicType=UoH&SelectEI"]
  end

  # Parses the response body as HTML and extracts every anchor's href value.
  #
  # @param req request object; the body is read from +req.response.response+
  # @return [Array<String>] content of each +//a/@href+ attribute node
  def processing(req)
    doc = Nokogiri::HTML(req.response.response)
    # Query the document that was just parsed; the earlier version referenced
    # an undefined local (`htmldoc`) here and raised NameError.
    doc.xpath('//a/@href').map(&:content)
  end
end
# Crawler for the individual XML documents: appends one CSV row per matched
# title element to "eduction_normal.csv".
# NOTE(review): presumably Pioneer fetches each URL from +locations+ and calls
# +processing+ once per response - confirm against the pioneer gem README.
class Crawler < Pioneer::Base
  include REXML

  # @return [Array] the flattened result of running the Links crawler
  def locations
    Links.new.start.flatten
  end

  # Parses the response body as XML and appends [id, title] rows to the CSV.
  #
  # @param req request object; the body is read from +req.response.response+
  def processing(req)
    # `response` was misspelled as `respone` here, which raised NoMethodError.
    xmldoc = REXML::Document.new(req.response.response)
    id = xmldoc.root.attributes["id"]
    xmldoc.elements.each("/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]") do |e|
      title = e.text.to_s
      # Opened in append mode so rows from earlier documents are kept.
      CSV.open("eduction_normal.csv", "a") do |f|
        # The row previously ended in a literal `...` placeholder; append any
        # further extracted columns to this array.
        f << [id, title]
      end
    end
  end
end
- Crawler.start
- # or you can run 100 concurrent processes
- Crawler.start(concurrency: 100)