sobach

Creating tables for infogr.am

Nov 25th, 2014
245
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 3.21 KB | None | 0 0
  # -*- coding: utf-8 -*-
  # Stage 1: read 'ria_lenta.csv' and derive a per-day label for each row.
  # Later stages read the columns `timestamp`, `text` and `title.text`.
  #
  # NOTE(review): setwd() ties the script to one machine; it is kept because
  # every later read/write uses a path relative to this directory.
  setwd('~/Documents/vis_opportunities')
  # The search patterns below are Cyrillic, so force a UTF-8 locale.
  Sys.setlocale(category = "LC_ALL", locale = "UTF-8")
  # Keep text columns as character vectors on every R version (the default
  # was factors before R 4.0).
  df <- read.csv('ria_lenta.csv', stringsAsFactors = FALSE)
  head(df)
  nrow(df)

  # Parse the timestamp and store it as POSIXct: strptime() returns POSIXlt,
  # a list-based class that is awkward as a data-frame column.
  df$timestamp <- as.POSIXct(
    strptime(as.character(df$timestamp), format = '%Y-%m-%d %H:%M:%S')
  )

  # Day label such as "25 Nov". It is stored as a factor whose levels follow
  # the timestamps, so table(..., df$day) lists days chronologically instead
  # of alphabetically ("01 Dec" would otherwise sort before "25 Nov").
  day_label <- strftime(df$timestamp, format = '%d %b')
  day_levels <- unique(day_label[order(df$timestamp)])
  df$day <- factor(day_label, levels = day_levels[!is.na(day_levels)])
  11.  
  # Stage 2: add one logical column per politician to `df`. A row is TRUE
  # when the lower-cased body (`text`) or headline (`title.text`) contains
  # the name stem; stems are substrings, so inflected forms match too.
  body_lc <- tolower(df$text)
  title_lc <- tolower(df$title.text)

  # Column name -> lower-case stem to search for (columns are created in
  # this order: putin, obama, merkel, porosh).
  person_stems <- c(
    putin = 'путин',
    obama = 'обама',
    merkel = 'меркель',
    porosh = 'порошенко'
  )
  for (flag_col in names(person_stems)) {
    stem <- person_stems[[flag_col]]
    df[[flag_col]] <- grepl(stem, body_lc) | grepl(stem, title_lc)
  }

  # Percentage of rows mentioning at least one of the four. print() makes
  # the figure visible under source() as well as at the console.
  print(100 * sum(df$putin | df$obama | df$merkel | df$porosh) / nrow(df))
  33.  
  # Stage 3: per-day mention counts for each politician, written to
  # 'persons.csv' with one row per person and one column per day label.
  # Row label -> flag column in `df`.
  person_cols <- c(
    Obama = 'obama',
    Putin = 'putin',
    Merkel = 'merkel',
    Poroshenko = 'porosh'
  )
  person_counts <- lapply(person_cols, function(flag_col) {
    # Fixing both factor levels guarantees a 'TRUE' row even when nobody
    # is mentioned; drop = FALSE keeps the day names when there is one day.
    by_day <- table(factor(df[[flag_col]], levels = c(FALSE, TRUE)), df$day)
    by_day['TRUE', , drop = FALSE]
  })
  # Stack the 1 x n-days rows into one matrix instead of growing a data
  # frame with repeated rbind() calls.
  count_mat <- do.call(rbind, person_counts)
  rownames(count_mat) <- names(person_cols)
  sum.df <- as.data.frame(count_mat)
  write.csv(sum.df, 'persons.csv')
  42.  
  # Stage 4: term frequencies over "headline: body" strings, written to
  # 'wordcloud.csv' (columns `word`, `freq`; only terms with freq >= 100).
  library(tm)
  df$f.txt <- paste(df$title.text, df$text, sep = ": ")
  # VectorSource takes the strings directly. DataframeSource on a one-column
  # data frame fails on current tm, which requires `doc_id` and `text`
  # columns. VCorpus is named explicitly so the corpus type does not depend
  # on what `Corpus` aliases in the installed tm version.
  corp <- VCorpus(VectorSource(df$f.txt))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, content_transformer(tolower))
  # Stop words are removed after lower-casing so capitalised forms go too.
  corp <- tm_map(corp, removeWords, stopwords("ru"))
  tdm <- TermDocumentMatrix(corp)
  # NOTE(review): as.matrix() builds a dense terms x documents matrix; this
  # can be memory-hungry on a large corpus.
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  d <- d[d$freq >= 100, ]
  write.csv(d, 'wordcloud.csv', row.names = FALSE)
  55.  
  56.  
  # Stage 5: add one logical column per country to `df`. A row is TRUE when
  # the body (`text`) or headline (`title.text`) mentions the country.
  body_raw <- as.character(df$text)
  title_raw <- as.character(df$title.text)
  body_lc <- tolower(body_raw)
  title_lc <- tolower(title_raw)

  # Case-insensitive substring match on a word stem.
  mentions_stem <- function(stem) {
    grepl(stem, body_lc) | grepl(stem, title_lc)
  }

  df$rus <- mentions_stem('росси')
  # The acronym is matched case-sensitively on the original text: the
  # lower-cased form 'сша' also occurs inside ordinary words (for example
  # 'высшая'), which would inflate the USA counts.
  df$us <- grepl('США', body_raw) | grepl('США', title_raw)
  df$de <- mentions_stem('герман')
  df$ukraine <- mentions_stem('украин')

  # Percentage of rows mentioning at least one of the four. print() makes
  # the figure visible under source() as well as at the console.
  print(100 * sum(df$rus | df$us | df$de | df$ukraine) / nrow(df))
  78.  
  # Stage 6: per-day mention counts for each country, written to
  # 'countries.csv' with one row per country and one column per day label.
  # Row label -> flag column in `df`.
  country_cols <- c(
    USA = 'us',
    Russia = 'rus',
    Germany = 'de',
    Ukraine = 'ukraine'
  )
  country_counts <- lapply(country_cols, function(flag_col) {
    # Fixing both factor levels guarantees a 'TRUE' row even when a country
    # is never mentioned; drop = FALSE keeps the day names for a single day.
    by_day <- table(factor(df[[flag_col]], levels = c(FALSE, TRUE)), df$day)
    by_day['TRUE', , drop = FALSE]
  })
  # Stack the 1 x n-days rows into one matrix instead of growing a data
  # frame with repeated rbind() calls.
  count_mat <- do.call(rbind, country_counts)
  rownames(count_mat) <- names(country_cols)
  sum.df <- as.data.frame(count_mat)
  write.csv(sum.df, 'countries.csv')
Add Comment
Please, Sign In to add comment