Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env ruby
- # encoding: utf-8
- =begin
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- The Software shall be used for Good, not Evil.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
- =end
- require 'rubygems'
- require 'nokogiri'
- require 'open-uri'
- require 'net/http'
# Scheme and host of the YLE site. Relative candidate hrefs and the listing
# path are appended to this value. Frozen so the shared constant cannot be
# mutated in place by accident.
BASE_URL = 'http://www.yle.fi'.freeze

# Create the output file, or empty it if it already exists, so each run starts
# from a blank file; print_and_save later reopens it in append mode.
File.open("yle_vaalikone.txt", 'w') { |f| f.write("") }
# Scrapes one candidate listing page and every candidate page it links to.
#
# Downloads +url+, takes the heading from the first <h2> inside
# div#em-headingbar (with a trailing ": Kaikki ehdokkaat" removed) as the
# district name, and calls scrape_candidate for each matching link found in
# the td.em-cell-name cells.
#
# url - String address of the listing page.
#
# Returns the Array of link nodes that were followed.
def scrape_constituency(url)
  # Kernel#open no longer accepts URLs on Ruby 3.0+. Opening through the URI
  # object is provided by open-uri on both old and new interpreters.
  list_of_cands = Nokogiri::HTML(URI.parse(url).open)

  district = list_of_cands.xpath("//div[@id='em-headingbar']/h2[1]").inner_html
  district = district.chomp(": Kaikki ehdokkaat")

  # to_a replaces the block-less `collect[0..-1]`: without a block, collect
  # returns an Enumerator on Ruby 1.9+, and Enumerator has no #[].
  links = list_of_cands.xpath("//td[@class='em-cell-name']/a").to_a

  # Keep only links matching "digit followed by whitespace". grep applies the
  # regexp to each node via its implicit string conversion -- presumably the
  # link text; confirm against the Nokogiri version in use.
  cand_links = links.grep(/\d\s/)

  cand_links.each do |el|
    full_url = BASE_URL + el["href"]
    scrape_candidate(full_url, district)
  end
end
# Scrapes a single candidate page, hands the collected data to print_and_save
# and then pauses for two seconds.
#
# The collected Hash has these keys:
#   :district - the +district+ argument, unchanged
#   :name     - inner HTML of the first <h3> in div.em-details
#   :party    - party cell contents up to the whitespace before its "<br>"
#   :age      - inner HTML of the age cell
#   :opts     - one Hash per container among the first 33
#               div.em-compare-container elements: {:q => question} plus
#               :a (answer text) and :n (1-based row number) when a row is
#               marked as chosen
#   :xopts    - for containers 34 and 35 (multiple choice):
#               {:q => question, :a => chosen texts joined with ", "}
#
# url      - String address of the candidate page.
# district - district name to store with the candidate.
#
# Returns whatever Kernel#sleep returns; callers in this file ignore it.
def scrape_candidate(url, district)
  cand = {:district => district}

  # Kernel#open no longer accepts URLs on Ruby 3.0+. Opening through the URI
  # object is provided by open-uri on both old and new interpreters.
  open_page = Nokogiri::HTML(URI.parse(url).open, nil, 'iso8859-1')

  # A row counts as chosen when its first cell contains an "acronym" element.
  # The check is on the markup because the bullet glyph shown there is awkward
  # to match as a character.
  chosen = lambda { |row| row.xpath("td[1]").to_html.include?("acronym") }

  # SCRAPE: BASIC INFO
  cand_info = open_page.xpath("//div[@class='em-details']")
  cand[:name] = cand_info.xpath("h3[1]").inner_html

  # Party: taken from dd[2] when dt[2] is the 'Puolue' label, else from dd[1].
  party_label = cand_info.xpath("dl[1]/dt[2]").to_s
  party_cell =
    if party_label.include?('Puolue')
      cand_info.xpath("dl[1]/dd[2]")
    else
      cand_info.xpath("dl[1]/dd[1]")
    end
  party_html = party_cell.inner_html
  # The regexp yields nil when the cell has no "<br>"; fall back to the raw
  # cell contents instead of raising NoMethodError in the middle of a run.
  cand[:party] = (party_html[/(.*)\s<br>/, 1] || party_html).strip

  # Age: taken from dd[4] when dt[4] contains 'Ik', else from dd[3].
  age_label = cand_info.xpath("dl[1]/dt[4]").to_s
  age_cell =
    if age_label.include?('Ik')
      cand_info.xpath("dl[1]/dd[4]")
    else
      cand_info.xpath("dl[1]/dd[3]")
    end
  cand[:age] = age_cell.inner_html

  # SCRAPE: ANSWERS
  cand[:opts] = []  # numeric answers
  cand[:xopts] = [] # multiple choice

  # to_a replaces the block-less `collect[0..32]`: without a block, collect
  # returns an Enumerator on Ruby 1.9+, and Enumerator has no #[].
  containers = open_page.xpath("//div[@class='em-compare-container']").to_a[0..32]
  containers.each do |container|
    question = container.xpath("h3").inner_html.strip
    qa = {:q => question}
    container.xpath("table[1]/tr").each_with_index do |row, opt_count|
      next unless chosen.call(row)
      # The candidate picked this option: keep its text and also its
      # numeric value (1-5), i.e. the 1-based row position.
      cleartxt = row.xpath("td[4]").inner_html.strip
      qa.update({:a => cleartxt, :n => opt_count + 1})
    end
    # Unanswered questions are recorded too (with only :q set).
    cand[:opts] << qa
  end

  ### MULTIPLE CHOICE
  # q34 - "In my opinion the next government must include:"
  # q35 - "My favourite for prime minister of the next government is (only one):"
  (34..35).each do |position|
    container = open_page.xpath("//div[@class='em-compare-container'][%i]" % [position])
    question = container.xpath("h3").inner_html.strip
    picked = []
    container.xpath("table[1]/tr").each do |row|
      picked << row.xpath("td[4]").inner_html.strip if chosen.call(row)
    end
    cand[:xopts] << {:q => question, :a => picked.join(', ')}
  end

  ### PRINT AND SAVE
  print_and_save cand
  sleep 2 # don't put too much load on YLE's servers
end
# Appends one candidate record to "yle_vaalikone.txt" and prints a readable
# version of the same candidate to standard output.
#
# File record layout:
#   ***
#   name<TAB>district<TAB>party<TAB>age
#   comma-separated :n values of cand[:opts] (empty field when :n is missing)
#   one "question answer" line per entry of cand[:xopts]
#
# cand - Hash with :name, :district, :party, :age and the Arrays :opts and
#        :xopts, whose entries are Hashes using the keys :q, :a and :n.
#
# Returns the combined Array cand[:opts] + cand[:xopts].
def print_and_save(cand)
  header = [cand[:name], cand[:district], cand[:party], cand[:age]]

  # Block form so the file handle is closed even if one of the writes raises.
  File.open("yle_vaalikone.txt", 'a') do |f|
    f.write "***\n"
    f.write header.join("\t")
    f.write "\n"
    # numeric answers, separated by commas
    f.write cand[:opts].map { |h| h[:n] }.join(',')
    f.write "\n"
    f.write cand[:xopts].map { |h| "%s %s" % [h[:q], h[:a]] }.join("\n")
    f.write "\n"
  end

  ### PRINT TO SCREEN
  puts((header + ['=============================================']).join("\n"))
  (cand[:opts] + cand[:xopts]).each do |h|
    # plain-language printout: question followed by the answer text
    puts "%s %s\n\n" % [h[:q], h[:a]]
  end
end
# Script entry point: visit the listing page for each district id from 1
# through 16, in ascending order, and scrape every candidate found there.
1.upto(16) do |district_id|
  listing_url = format("%s/vaalikone11/index.php?emp=l-1.d-%i.s-3.q-1.ps-1", BASE_URL, district_id)
  scrape_constituency(listing_url)
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement