Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- http://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?trackingNumber=0656887000494793
- http://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?execution=eXs1
- sudo gem install mechanize
- require 'mechanize'
- agent = WWW::Mechanize.new
- page = agent.get "http://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?trackingNumber=0656887000494793"
- page.content # Get the resulting page as a string
- page.body # Get the body content of the resulting page as a string
- page.search(".somecss") # Search for specific elements by XPath/CSS using nokogiri
- require "open-uri"
- require "zlib"
- require "nokogiri"
- require "sanitize"
- require "htmlentities"
- require "readability"
# Downloads +url_address+, records response metadata on the receiver and
# stores the cleaned HTML document in +content+.
#
# Attributes written on the receiver: +errors+, +url_posted+, +url_parsed+,
# +url_host+, +status+, +content_type+, +content_encoding+, +charset+,
# +charset_guess+ and +content+.
#
# Fetch failures are not raised: the exception (or a message for an empty
# body) is appended to +errors+ and the method returns early. Failures while
# decoding or parsing the body are not rescued here.
#
# NOTE(review): SHINSO_HEADERS, CharGuess, Iconv and make_absolute_address are
# defined outside this view; SHINSO_HEADERS is presumably the request-header
# hash handed to open-uri — confirm.
#
# @param url_address [String] address of the page to fetch
# @return [Nokogiri::HTML::Document, nil] the parsed document, or nil when the
#   fetch failed (see +errors+)
def crawl(url_address)
  self.errors = []

  begin
    begin
      url_address = URI.parse(url_address)
    rescue URI::InvalidURIError
      # Normalise the escaping (decode, then re-encode) and retry once; a
      # second failure falls through to the outer rescue.
      # NOTE: URI.decode/URI.encode were removed in Ruby 3.0.
      url_address = URI.parse(URI.encode(URI.decode(url_address)))
    end
    url_address.normalize!

    stream = Timeout.timeout(8) { url_address.open(SHINSO_HEADERS) }
    unless stream.size > 0
      self.errors << "Server said status 200 OK but document file is zero bytes."
      return
    end
    url_crawled = URI.parse(stream.base_uri.to_s)
  rescue StandardError, Timeout::Error => exception
    # Timeout::Error is listed explicitly because it is not a StandardError on
    # Ruby 1.8. Process-level exceptions (SystemExit, Interrupt, ...) are no
    # longer swallowed.
    self.errors << exception
    return
  end

  # extract information before html parsing
  self.url_posted = url_address.to_s
  self.url_parsed = url_crawled.to_s
  self.url_host = url_crawled.host
  self.status = stream.status
  self.content_type = stream.content_type
  self.content_encoding = stream.content_encoding
  self.charset = stream.charset

  # open-uri exposes Content-Encoding as an array of downcased tokens.
  encodings = stream.content_encoding
  begin
    document =
      if encodings.include?('gzip') || encodings.include?('x-gzip')
        # The block form closes the reader (and the underlying stream).
        Zlib::GzipReader.wrap(stream) { |gz| gz.read }
      elsif encodings.include?('deflate')
        compressed = stream.read
        begin
          Zlib::Inflate.inflate(compressed)
        rescue Zlib::DataError
          # Some servers send raw DEFLATE data without the zlib header.
          Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(compressed)
        end
      else
        stream.read
      end
  ensure
    stream.close unless stream.closed?
  end

  # Transcode to UTF-8 only when the guessed charset is something else.
  self.charset_guess = CharGuess.guess(document)
  guessed = self.charset_guess
  if !guessed.blank? && !%w[utf-8 utf8].include?(guessed.downcase)
    # Iconv.iconv returns an array of converted chunks.
    document = Iconv.iconv("UTF-8", guessed, document).join
  end

  document = Nokogiri::HTML.parse(document, nil, "utf8")
  document.xpath('//script | //SCRIPT').remove

  # Rewrite every non-empty src attribute through make_absolute_address.
  document.xpath('//*[string-length(@src) > 0]').each do |node|
    node.set_attribute('src', make_absolute_address(node['src']))
  end

  # Strip HTML comments, including ones that span several lines, then
  # re-parse the cleaned markup.
  document = document.to_s.gsub(/<!--.*?-->/m, '')
  self.content = Nokogiri::HTML.parse(document, nil, "utf8")
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement