Guest User

Untitled

a guest
Jun 24th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.93 KB | None | 0 0
  1. # Would it be better to use the sanitize library for this?
  2. require 'cgi'
  3.  
  # Converts an HTML fragment to plain text with numbered link footnotes.
  #
  # Processing order:
  # 1. <script>/<style> elements and HTML comments are dropped first, so
  #    that nothing inside them is picked up as a link later on.
  # 2. Runs of whitespace are collapsed to a single space.
  # 3. An element whose text is identical to its own src/href URL is
  #    replaced by the bare URL.
  # 4. Every remaining tag carrying a src/href attribute is replaced by a
  #    "[n]" marker and its URL is remembered.
  # 5. A handful of tags get a plain-text rendering (<hr>, <li>,
  #    <blockquote>, <br>, closing headings, <p>); all other tags are
  #    removed and HTML entities are unescaped.
  # 6. The remembered URLs are appended as " [n] <url>" footnote lines.
  #
  # html - the HTML source; nil (or any non-String) is converted with #to_s.
  #
  # Returns a String. The body always ends with "\n"; footnote lines, when
  # present, follow it.
  def html2text html
    text = html.to_s.
      # Non-greedy so that visible text between two separate script/style
      # elements (or two comments) is not swallowed together with them.
      gsub(/<(script|style)\b[^>]*>.*?<\/\1\s*>/im, '').
      gsub(/<!--.*?-->/m, '').
      gsub(/\s+/, ' ').strip.
      gsub(/<([^\s]+)[^>]*(src|href)=\s*(["']?)([^>\s]*)\3[^>]*>\4<\/\1>/i, '\4')

    # Only a real quote character may delimit the URL; accepting any
    # character here would cut the ends off unquoted values such as
    # href=/docs/.
    links = []
    text = text.gsub(/<[^>]*(src|href)=\s*(["']?)([^>\s]*)\2[^>]*>\s*/i) do
      links << $3
      "[#{links.size}]"
    end

    # "[ \/]" after the tag name accepts attributes as well as the
    # self-closing forms <br/> and <hr/>, while still rejecting longer tag
    # names such as <pre> for <p>.
    body = text.
      gsub(/<hr(?:[ \/][^>]*)?>/i, "___\n").
      gsub(/<li(?:[ \/][^>]*)?>/i, "\n* ").
      gsub(/<blockquote(?:[ \/][^>]*)?>/i, '> ').
      gsub(/<br(?:[ \/][^>]*)?>/i, "\n").
      gsub(/<(?:\/h\d+|p)(?:[ \/][^>]*)?>/i, "\n\n").
      gsub(/<[^>]*>/, '')

    text = CGI.unescapeHTML(body).lstrip.gsub(/\n[ ]+/, "\n") + "\n"

    links.each_with_index do |url, index|
      text << "\n [#{index + 1}] <#{CGI.unescapeHTML(url)}>"
    end
    text
  end
Add Comment
Please, Sign In to add comment