Advertisement
Guest User

Untitled

a guest
Sep 22nd, 2013
1,176
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.60 KB | None | 0 0
  1. http://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?trackingNumber=0656887000494793
  2.  
  3. http://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?execution=eXs1
  4.  
  5. sudo gem install mechanize
  6.  
  require 'mechanize'

  # Example: fetch a Canada Post tracking page with Mechanize.
  # Current mechanize releases expose the agent class as top-level `Mechanize`;
  # the `WWW::` namespace belonged to pre-1.0 releases.
  agent = Mechanize.new
  # The tracking number is a query parameter, so it must follow "?" (a space
  # there yields an invalid URL and the parameter is never sent).
  page = agent.get("http://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?trackingNumber=0656887000494793")

  page.content # Get the resulting page as a string
  page.body # Get the body content of the resulting page as a string
  page.search(".somecss") # Search for specific elements by XPath/CSS using nokogiri
  14.  
  require "open-uri"
  require "timeout"
  require "zlib"

  require "nokogiri"
  require "sanitize"
  require "htmlentities"
  require "readability"
  21.  
  # Fetches +url_address+, records response metadata on the receiver, and
  # stores a cleaned-up Nokogiri HTML document in +content+.
  #
  # On the receiver this sets: errors, url_posted, url_parsed, url_host,
  # status, content_type, content_encoding, charset, charset_guess, content.
  #
  # Relies on names defined outside this method: SHINSO_HEADERS (passed to
  # open-uri's +open+), CharGuess, Iconv and make_absolute_address.
  #
  # @param url_address [String] the address to fetch
  # @return [Nokogiri::HTML::Document, nil] the parsed document, or nil when
  #   the fetch failed or returned an empty body (see +errors+ for the reason)
  def crawl(url_address)
    self.errors = []

    begin
      begin
        url_address = URI.parse(url_address)
      rescue URI::InvalidURIError
        # Retry after normalising the percent-escaping of the raw string.
        # NOTE(review): URI.decode / URI.encode were removed in Ruby 3.0; on
        # such rubies this fallback raises and the failure lands in +errors+.
        url_address = URI.decode(url_address)
        url_address = URI.encode(url_address)
        url_address = URI.parse(url_address)
      end
      url_address.normalize!

      # Timeout.timeout replaces the deprecated bare Kernel#timeout.
      stream = Timeout.timeout(8) { url_address.open(SHINSO_HEADERS) }

      if stream.size > 0
        url_crawled = URI.parse(stream.base_uri.to_s)
      else
        self.errors << "Server said status 200 OK but document file is zero bytes."
        return
      end
    # StandardError instead of Exception so signals and SystemExit still
    # propagate. Timeout::Error is listed explicitly because it is not a
    # StandardError on every Ruby version.
    rescue StandardError, Timeout::Error => exception
      self.errors << exception
      return
    end

    # extract information before html parsing
    self.url_posted = url_address.to_s
    self.url_parsed = url_crawled.to_s
    self.url_host = url_crawled.host
    self.status = stream.status
    self.content_type = stream.content_type
    self.content_encoding = stream.content_encoding
    self.charset = stream.charset

    # Undo the transfer compression announced in Content-Encoding.
    # 'x-gzip' and 'compress' are not handled and fall through to a raw read.
    encodings = stream.content_encoding
    document =
      if encodings.include?('gzip')
        Zlib::GzipReader.new(stream).read
      elsif encodings.include?('deflate')
        # The body must be inflated (decompressed), not deflated again.
        Zlib::Inflate.inflate(stream.read)
      else
        stream.read
      end

    # Convert to UTF-8 only when the guessed charset is something else.
    self.charset_guess = CharGuess.guess(document)
    guessed = self.charset_guess
    if !guessed.blank? && !%w[utf-8 utf8].include?(guessed.downcase)
      # Iconv.iconv returns an Array of converted strings; join yields a String.
      document = Iconv.iconv("UTF-8", guessed, document).join
    end

    document = Nokogiri::HTML.parse(document, nil, "utf8")
    document.xpath('//script').remove
    document.xpath('//SCRIPT').remove

    # The predicate is true for elements whose src attribute is non-empty;
    # each such src is rewritten through make_absolute_address.
    document.xpath('//*[translate(@src, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")]').each do |item|
      item.set_attribute('src', make_absolute_address(item['src']))
    end

    # Strip HTML comments; /m lets "." span newlines so multi-line comments
    # are removed too.
    document = document.to_s.gsub(/<!--.*?-->/m, '')
    self.content = Nokogiri::HTML.parse(document, nil, "utf8")
  end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement