- # WikiStance. A Wikipedia distance meter.
- # Based on the alt text of http://xkcd.org/903/
- # Takes a Wikipedia URL and measures the distance from that page to the Philosophy article by repeatedly following the
- # first link that is neither in parentheses nor in italics.
- #
- # Author:: Alejandro Fernández (mailto:antarticonorte@gmail.com)
- # Copyright:: Copyright (c) 2011 Alejandro Fernández
- # License:: GPL
- #
- # =Usage=
- #
- # require 'wikistance'
- #
- # url = 'http://en.wikipedia.org/wiki/Scrubs_%28TV_series%29'
- # ws = WikiStance.new(url)
- # ws.trace # Go through all the pages until we reach Philosophy
- # ws.distance # => 22
- # ws.breadcrumbs # => ["List of characters on Scrubs", "NBC", "United States", ..., "Philosophy"]
- require 'rubygems'
- require 'mechanize'
# Measures how many "first link" clicks separate an English Wikipedia article
# from the Philosophy article.
#
# Starting from the URL given to the constructor, #trace repeatedly follows the
# first link of the article body that is neither inside parentheses nor in
# italics, recording every visited page title in #breadcrumbs.
class WikiStance
  # Prefix every accepted URL must have. \A (rather than ^) anchors at the very
  # start of the string, so a valid-looking URL on a later line of a multi-line
  # string is rejected. Both http and https are accepted.
  URL_PATTERN = %r{\Ahttps?://en\.wikipedia\.org/wiki/}.freeze

  # Title of the article at which #trace stops.
  TARGET_TITLE = 'Philosophy'.freeze

  # Splits serialized HTML into whole tags and runs of text.
  HTML_TOKEN = /<[^>]*>|[^<]+/.freeze
  # Opening <i> tag, with or without attributes (but not <img>, <ins>, ...).
  ITALIC_OPEN = /\A<i[\s>]/i.freeze
  ITALIC_CLOSE = %r{\A</i\s*>}i.freeze
  # Opening <a> tag; capture 1 is the complete, still entity-encoded href.
  ANCHOR_HREF = /\A<a\s(?:[^>]*?\s)?href\s*=\s*"([^"]*)"/i.freeze
  # Entities that can show up in a serialized attribute value.
  HTML_ENTITY = /&(?:amp|lt|gt|quot|#39);/.freeze
  HTML_ENTITIES = {
    '&amp;' => '&', '&lt;' => '<', '&gt;' => '>', '&quot;' => '"', '&#39;' => "'"
  }.freeze

  # title::       title of the starting page
  # breadcrumbs:: titles of all visited pages, starting page first
  attr_reader :title, :breadcrumbs

  # Creates the meter and loads the starting page.
  #
  # url:: an English Wikipedia article URL (http or https)
  #
  # Raises ArgumentError when +url+ is not an en.wikipedia.org/wiki/ URL.
  def initialize(url)
    unless url.to_s =~ URL_PATTERN
      raise ArgumentError, "You should use a valid wikipedia link"
    end

    @url = url
    # Wikipedia returns 403 with the default user agent
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari'
    reset
  end

  # Reloads the starting page and forgets every page visited so far.
  # Returns true.
  def reset
    @page = @agent.get(@url)
    @title = page_title
    @breadcrumbs = [@title]
    true
  end

  # Returns the title (text of #firstHeading) of the page currently loaded.
  def page_title
    @page.at('#firstHeading').text
  end

  # Follows first links until the Philosophy article is reached, appending
  # each visited title to #breadcrumbs. Returns true.
  #
  # Raises RuntimeError when a page is visited twice (the walk would loop
  # forever) or when a page has no link that can be followed.
  def trace
    until page_title == TARGET_TITLE
      click_first_link
      visited = page_title
      # Avoid entering in an infinite loop
      if @breadcrumbs.include?(visited)
        raise "We are repeating ourselves! We already visited \"#{visited}\""
      end
      @breadcrumbs << visited
    end
    true
  end

  # Number of clicks performed so far. The breadcrumbs hold the initial page,
  # so starting on Philosophy yields 0.
  def distance
    @breadcrumbs.length - 1
  end

  private

  # Replaces @page with the page behind the first followable link.
  #
  # div#bodyContent is where Wikipedia shows the article's content. Only <p>
  # elements that are direct children are searched, which skips <p> inside
  # TOCs as well as disambiguation notes and similar texts kept in <div>s.
  def click_first_link
    href = nil
    @page.search('#bodyContent > p').each do |paragraph|
      href = first_followable_href(paragraph.to_html)
      break if href
    end
    raise "Oops! seems that \"#{page_title}\" has no links" if href.nil?

    # Exact string comparison: a pattern built from the href would treat
    # characters such as "?" or "(" as regexp syntax and would also match any
    # earlier link whose href merely contains this one.
    link = @page.links_with(:href => href).first
    if link.nil?
      raise "Oops! could not find the link \"#{href}\" in \"#{page_title}\""
    end
    @page = link.click
  end

  # Returns the entity-decoded href of the first <a> in +html+ that is outside
  # parentheses, outside <i>...</i> and does not point to an in-page anchor
  # ("#..."), or nil when there is none.
  #
  # Parentheses are counted in text only, never inside tags, so an href such
  # as "/wiki/Mercury_(planet)" stays intact.
  def first_followable_href(html)
    paren_depth = 0
    italic_depth = 0
    html.scan(HTML_TOKEN) do |token|
      case token
      when ITALIC_OPEN
        italic_depth += 1
      when ITALIC_CLOSE
        italic_depth -= 1 if italic_depth > 0
      when ANCHOR_HREF
        href = Regexp.last_match(1)
        next unless paren_depth.zero? && italic_depth.zero?
        next if href.empty? || href.start_with?('#')
        return href.gsub(HTML_ENTITY, HTML_ENTITIES)
      when /\A</
        next # any other tag neither opens nor closes a region we track
      else
        paren_depth = paren_depth_after(token, paren_depth)
      end
    end
    nil
  end

  # Returns the parenthesis nesting depth after reading +text+, starting from
  # +depth+. A stray ")" never drives the depth below zero.
  def paren_depth_after(text, depth)
    text.scan(/[()]/) do |paren|
      if paren == '('
        depth += 1
      elsif depth > 0
        depth -= 1
      end
    end
    depth
  end
end