- require 'pdfkit'
- require 'nokogiri'
- require 'open-uri'
# Plain value holder for one scraped article.
#
# Every attribute is read-only and is taken verbatim from the options hash
# given to the constructor; no conversion or validation is performed.
class Article
  # Names of the attributes an article carries. Each one is both a reader
  # method and a recognised key of the constructor's options hash.
  FIELDS = %i[url title content blog author published tags comments favorites rating].freeze

  attr_reader(*FIELDS)

  # @param options [Hash{Symbol => Object}] attribute values keyed by the
  #   names in FIELDS. Missing keys leave the attribute nil; keys that are
  #   not in FIELDS are ignored.
  def initialize(options = {})
    FIELDS.each do |field|
      instance_variable_set(:"@#{field}", options[field])
    end
  end
end
# Scrapes the list of favorite articles of one habrahabr.ru user.
#
# Nothing is downloaded on construction. #pages and #favorites fetch on
# first use and memoize their result for the lifetime of the object.
class Habr
  # Captures the number that ends a pagination link such as ".../page7/".
  PAGE_NUMBER = %r{(\d+)/$}

  attr_reader :user

  # @param user [String] account name whose favorites are read
  # @raise [RuntimeError] when user is nil or false
  def initialize(user)
    raise 'User is not set' unless user

    @user = user
  end

  # @return [String] root URL of the site
  def url
    'http://habrahabr.ru'
  end

  # @return [String] URL of the first page of the user's favorites
  def favorites_url
    "#{url}/users/#{user}/favorites/"
  end

  # Number of pages in the favorites listing, read from the page number of
  # the last pagination link on the first page.
  #
  # @return [Integer] page count; 1 when the page has no pagination links
  #   or the last link carries no page number
  def pages
    @pages ||= begin
      # `a:last-child` can match several anchors (one per parent element),
      # so take the last match rather than the first.
      last_link = fetch_document(favorites_url).css('#nav-pages a:last-child').last
      number = last_link && last_link['href'].to_s[PAGE_NUMBER, 1]
      # NOTE(review): a listing without pagination is assumed to be a
      # single page - confirm against the live markup.
      number ? number.to_i : 1
    end
  end

  # Every favorite article of the user, in listing order, across all pages.
  # Performs one request per page on the first call.
  #
  # @return [Array<Article>]
  def favorites
    @favorites ||= (1..pages).flat_map do |page|
      listing = fetch_document("#{favorites_url}page#{page}/")
      listing.css('.post').map { |post| build_article(post) }
    end
  end

  private

  # Downloads and parses one page. Requests are sent with an empty
  # User-Agent header. URI.open is used because Kernel#open no longer
  # opens URLs on Ruby 3.0+.
  def fetch_document(address)
    Nokogiri::HTML(URI.open(address, 'User-Agent' => ''))
  end

  # Builds an Article from one `.post` node of a listing page. The article
  # body is not part of the listing, so :content is always nil.
  def build_article(post)
    title_link = post.at_css('.title .post_title')
    options = {
      # Nokogiri exposes element attributes through #[]; there is no #href.
      url: title_link && title_link['href'],
      title: title_link && title_link.text,
      content: nil,
      blog: text_at(post, '.title .blog_title'),
      author: text_at(post, '.infopanel .author'),
      published: text_at(post, '.infopanel .published'),
      tags: post.css('.tags li').map(&:text).join(', '),
      comments: text_at(post, '.infopanel .comments .all'),
      favorites: text_at(post, '.infopanel .favs_count'),
      rating: text_at(post, '.infopanel .score')
    }
    Article.new(options)
  end

  # Text of the first element matching selector under node, or nil when
  # nothing matches.
  def text_at(node, selector)
    found = node.at_css(selector)
    found && found.text
  end
end
# Script: collect the links of a user's favorite articles from habrahabr.ru,
# build one HTML document out of the selected articles and render it to
# out.pdf with PDFKit.

username = 'gmile'
habr_url = "http://habrahabr.ru/users/#{username}/favorites/"

# Downloads and parses one page. Requests are sent with an empty User-Agent
# header. URI.open is used because Kernel#open no longer opens URLs on
# Ruby 3.0+.
fetch_html = ->(address) { Nokogiri::HTML(URI.open(address, 'User-Agent' => '')) }

favorites_start_page = fetch_html.call(habr_url)

# The page count is the number that ends the last pagination link.
# NOTE(review): a listing without pagination links is assumed to be a single
# page - confirm against the live markup.
last_page_link = favorites_start_page.css('ul#nav-pages a:last-child').last
last_page_number = last_page_link && last_page_link['href'].to_s[%r{(\d+)/$}, 1]
pages = last_page_number ? last_page_number.to_i : 1

puts "Fetching links from #{pages} pages..."

# Article links are the `.post_title` anchors inside each `.post` entry of
# every listing page (pages are addressed as ".../favorites/page<N>/").
links = (1..pages).flat_map do |page|
  listing = fetch_html.call("#{habr_url}page#{page}/")
  listing.css('.post .title .post_title').map { |anchor| anchor['href'] }
end

puts links.inspect

# NOTE(review): only the second link is rendered, as in the original
# `links[1..1]`; this looks like a debugging limit - confirm before widening.
# `|| []` covers the case where no links were found at all.
selected_links = links[1..1] || []

builder = Nokogiri::HTML::Builder.new(:encoding => 'UTF-8') do |doc|
  doc.html do
    doc.head do
      doc.title "Interesting"
    end
    doc.body do
      selected_links.each do |link|
        article = fetch_html.call(link)
        doc.h2 article.at_css('.title').text.strip
        doc.h4 article.at_css('.author').text.strip
        doc.h4 article.at_css('.published').text.strip
        # Attach the article's own content node under the element currently
        # being built (<body>).
        doc.parent << article.at_css('.content')
      end
    end
  end
end

puts 'HTML generation is done'

PDFKit.new(builder.doc.to_s, :page_size => 'A4').to_file('out.pdf')