LrdArc

Clean html tags | R | https://intip.in/FnaD

Mar 31st, 2017
307
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 0.99 KB | None | 0 0
  #' Remove <script> elements from HTML text
  #'
  #' @param htmlString Character vector containing HTML markup.
  #' @return `htmlString` with every `<script ...>...</script>` element
  #'   (opening tag, contents, and closing tag) removed.
  cleanScript <- function(htmlString) {
    # R has no PHP-style "#...#is" delimiters: the old pattern looked for a
    # literal "#" and a trailing "#is", so it never matched real markup.
    # Inline PCRE flags do the job instead: (?i) ignores case and (?s) lets
    # "." span newlines inside the script body.
    script_pattern <- "(?is)<script\\b[^>]*>.*?</script\\s*>"
    gsub(script_pattern, "", htmlString, perl = TRUE)
  }
  4.  
  #' Remove <style> elements from HTML text
  #'
  #' @param htmlString Character vector containing HTML markup.
  #' @return `htmlString` with every `<style ...>...</style>` element
  #'   (opening tag, contents, and closing tag) removed.
  cleanStyle <- function(htmlString) {
    # The PHP-style "#...#is" wrapper was matched literally by R, so the old
    # pattern was a no-op. (?i) makes the match case-insensitive and (?s)
    # lets "." cross newlines, which multi-line CSS blocks require.
    style_pattern <- "(?is)<style\\b[^>]*>.*?</style\\s*>"
    gsub(style_pattern, "", htmlString, perl = TRUE)
  }
  8.  
  #' Strip markup tags from HTML text
  #'
  #' Removes every `<...>` span; the text between tags is kept as-is.
  #'
  #' @param htmlString Character vector containing HTML markup.
  #' @return `htmlString` with all `<...>` spans deleted.
  cleanTags <- function(htmlString) {
    # "[^>]*" stops at the first ">" just like the lazy ".*?" did, but it
    # does not depend on the regex engine's newline handling for ".".
    gsub("<[^>]*>", "", htmlString)
  }
  12.  
  13.  
  #' Truncate markup after the first ">" and close it with </div>
  #'
  #' Everything from the first `>` through the end of the string is replaced
  #' by `></div>`. Strings that contain no `>` are returned unchanged.
  #'
  #' @param htmlString Character vector containing HTML markup.
  #' @return `htmlString` cut at its first `>` and terminated with `></div>`.
  cleanSekrip <- function(htmlString) {
    # The greedy ".*$" swallows the remainder of the string, so at most one
    # replacement happens per element.
    gsub(">.*$", "></div>", htmlString)
  }
  17.  
  18.  
  #' Remove scripts, styles, and all remaining tags from HTML text
  #'
  #' Applies three passes in order: drop `<script>` elements with their
  #' contents, drop `<style>` elements with their contents, then strip every
  #' remaining `<...>` tag.
  #'
  #' @param htmlString Character vector containing HTML markup.
  #' @return The text of `htmlString` with script/style blocks and tags removed.
  cleanKabeh <- function(htmlString) {
    # The PHP-style "#...#is" delimiters were matched literally by R, which
    # left script and style contents in the output. Inline PCRE flags give
    # the intended case-insensitive, newline-spanning match.
    htmlString <- gsub(
      "(?is)<script\\b[^>]*>.*?</script\\s*>", "", htmlString,
      perl = TRUE
    )
    htmlString <- gsub(
      "(?is)<style\\b[^>]*>.*?</style\\s*>", "", htmlString,
      perl = TRUE
    )
    # Strip whatever tags remain; "[^>]*" stops at the first ">".
    gsub("<[^>]*>", "", htmlString)
  }
  24.  
  #' Remove scripts (optionally namespaced), styles, and tags from HTML text
  #'
  #' This definition replaces the earlier `cleanKabeh` in this file. Its
  #' script pass also accepts a namespace prefix on the tag name, such as
  #' `<x:script>...</x:script>`.
  #'
  #' @param htmlString Character vector containing HTML markup.
  #' @return The text of `htmlString` with script/style blocks and tags removed.
  cleanKabeh <- function(htmlString) {
    # The script regex used to be an unquoted bare token, which is a parse
    # error in R. It is now a quoted PCRE pattern; "[^>]*" additionally allows
    # attributes on the opening tag.
    script_pattern <- paste0(
      "(?is)<(?:[\\w-]+:)?script\\b[^>]*>",
      ".*?",
      "</(?:[\\w-]+:)?script\\s*>"
    )
    # R matched the PHP-style "#...#is" delimiters literally, so the style
    # pass never fired; inline flags replace them.
    style_pattern <- "(?is)<style\\b[^>]*>.*?</style\\s*>"

    without_script <- gsub(script_pattern, "", htmlString, perl = TRUE)
    without_style <- gsub(style_pattern, "", without_script, perl = TRUE)
    # Finally drop every remaining tag, keeping the text between tags.
    gsub("<[^>]*>", "", without_style)
  }
  28.  
  29.  
  30.  
  31.  
  32.  
  # Parser-based alternative to the regex helpers: parse the HTML and drop
  # every <style> and <script> node from the resulting tree.
  # NOTE(review): `article` is not defined in this snippet; it is presumably a
  # character string of HTML (it is parsed with asText = TRUE) - confirm.
  library(XML)
  doc <- htmlParse(article, asText = TRUE)
  # One XPath union selects both element types. Previously the "//style"
  # result was overwritten by the "//script" query before removeNodes() ran,
  # so <style> nodes were never removed.
  styleNodes <- getNodeSet(doc, "//style | //script")
  if (length(styleNodes) > 0) {
    removeNodes(styleNodes)
  }
  # Print the cleaned document.
  doc
Add Comment
Please, Sign In to add comment