Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- require 'rubygems'
- require 'hpricot'
- require 'open-uri'
- require 'mofo'
# Collects the targets of every <a rel="me"> link in a parsed document and
# returns them as absolute URLs.
#
# doc      - parsed document; must respond to #search with a selector and
#            yield elements whose 'href' attribute is readable via #[].
# base_url - String URL the document was loaded from; relative hrefs are
#            resolved against it.
#
# Returns an Array of absolute URL Strings. Links with a missing, blank or
# malformed href are left out.
def me_urls_in(doc, base_url)
  resolved = doc.search('a[@rel="me"]').map do |element|
    href = element['href'].to_s.strip

    # An anchor without a usable href has nothing to follow
    next if href.empty?

    begin
      # URI.join applies the standard resolution rules: an absolute URL
      # (any scheme, so https as well as http) is kept as it is, a leading
      # "/" replaces the path of the base, and anything else is resolved
      # relative to the directory of the base document.
      URI.join(base_url, href).to_s
    rescue URI::InvalidURIError
      # One broken link on a page should not stop the remaining links
      # from being collected.
      warn "Ignoring malformed rel=\"me\" href: #{href.inspect}"
      nil
    end
  end

  # Skipped links left nil placeholders behind
  resolved.compact
end
# Downloads the document at +url+ and returns its body as a String.
#
# The fetch goes through the parsed URI object instead of Kernel#open:
# Kernel#open stopped handling URLs in Ruby 3.0, and it treats a string
# starting with "|" as a command to run, which is unsafe for addresses
# taken from the command line or from crawled pages.
#
# Returns nil (after printing the reason) when the URL cannot be fetched.
def read_url(url)
  uri = URI.parse(url)

  # open-uri only adds #read to URI classes it knows how to fetch
  unless uri.respond_to?(:read)
    puts "Skipping #{url.inspect}: not a fetchable URL"
    return nil
  end

  uri.read
rescue StandardError => e
  puts "Skipping #{url.inspect}: #{e.message}"
  nil
end

# Crawls pages connected through rel="me" links, starting at +url+, and
# prints the names and URLs discovered.
#
# Every URL is visited at most once. On each page, names are read from the
# hCards found inside <address> elements and further pages are taken from
# me_urls_in. Progress is printed while crawling, followed by a summary of
# all names and all visited URLs. A page that cannot be fetched is reported
# and skipped, so one dead link does not end the crawl.
#
# url - String URL of the page to start from.
#
# Returns the Array of URLs that were processed, in visiting order.
def search(url)
  # Collection of processed and unprocessed URLs
  unprocessed_urls = [url]
  processed_urls = []

  # Gathered personal information so far
  full_names = []

  loop do
    # Take an unprocessed URL; stop if there's nothing more to process
    current_url = unprocessed_urls.shift
    break if current_url.nil?

    # Skip if it's actually already processed
    next if processed_urls.include?(current_url)

    # Mark it as processed (even if the fetch below fails, so that it is
    # never retried)
    processed_urls << current_url

    # Debug
    puts "=== Processing #{current_url.inspect}"

    # Fetch and parse document
    content = read_url(current_url)
    next if content.nil?

    doc = Hpricot(content)

    # Find new names
    addresses = doc.search('address')
    hcards = addresses.map { |address| hCard.find(:all => { :text => address.inner_html }) }.flatten
    # compact guards against a card that yields no fn value
    names = hcards.map { |hcard| hcard.fn }.compact
    full_names |= names
    puts 'Found new names:'
    names.each { |name| puts " - #{name}" }

    # Find unprocessed URLs
    puts 'Found new URLs:'
    me_urls_in(doc, current_url).sort.each do |new_url|
      # Debug
      puts " - #{new_url}"

      # Add it to the queue
      unprocessed_urls << new_url
    end
  end

  # Print all names and URLs
  puts '=== Done!'
  puts 'Names:'
  full_names.each { |name| puts " - #{name}" }
  puts 'URLs:'
  processed_urls.each { |visited_url| puts " - #{visited_url}" }
end
# Command-line entry point: start the crawl at the URL given as the first
# argument, or print a usage line when no argument was supplied.
start_url = ARGV[0]
if start_url
  search(start_url)
else
  puts 'usage: search_me [url]'
end
Add Comment
Please, Sign In to add comment