Guest User

Untitled

a guest
Feb 19th, 2018
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.29 KB | None | 0 0
  1. library(tm)
  2. library(SnowballC)
  3. library(stringr)
  4. library(ggplot2)
  5.  
  6. ## RQ: What are the most commonly used words & the term frequencies in the 'Protestants-Christelijk' schools and 'Openbaar' schools?
  7.  
  # Load the school metadata table: semicolon-delimited, with text columns
  # kept as character vectors rather than converted to factors
  schools <- read.csv(
    file = "schools.csv",
    sep = ";",
    stringsAsFactors = FALSE
  )
  12.  
  ### PDF corpus ###

  # Load the stored school-guide corpus from disk
  gids <- readRDS(file = "/opt/duo/schoolgids2017v1_500.rds")
  # The first six characters of each document name serve as the guide's id
  gids_id <- substring(text = names(gids), first = 1, last = 6)
  17.  
  ### Document pre-processing
  # Transformer that replaces every match of `pattern` in a document with a
  # single space
  TurnSpace <- content_transformer(function(x, pattern) {
    gsub(pattern, " ", x)
  })
  # Symbols to blank out. Every symbol is matched literally: the single
  # characters sit inside a bracket expression (with "-" last so it is not read
  # as a range) and the ")" of the ":)" smiley is escaped. Written unescaped,
  # "$" is an end-of-string anchor and "*" / "?" are repetition operators, so
  # they would not match the characters themselves.
  gids <- tm_map(gids, TurnSpace, ":\\)|[/”@$:*&!?_‐•·#-]|…+")
  21.  
  # Normalise the text. Order matters: stop words are removed before stemming
  # because the stop-word list holds full word forms that may no longer match
  # once the text has been stemmed, and whitespace is collapsed last because
  # the removal steps can leave gaps behind.
  gids <- tm_map(gids, content_transformer(tolower))
  gids <- tm_map(gids, removePunctuation)
  gids <- tm_map(gids, removeNumbers)
  gids <- tm_map(gids, removeWords, tm::stopwords("dutch"))
  gids <- tm_map(gids, stemDocument, language = "dutch")
  gids <- tm_map(gids, stripWhitespace)
  28.  
  ## Build the document-term matrix from the cleaned corpus
  dtm <- DocumentTermMatrix(x = gids)
  # Drop terms above the 0.90 sparsity threshold, then convert the result to
  # an ordinary dense matrix
  ddtm <- as.matrix(
    removeSparseTerms(x = dtm, sparse = 0.90)
  )
  32.  
  33.  
  ## Match the gids to rows in our schools table
  # gids_row is NA for any guide whose id has no matching VESTIGINGSNUMMER,
  # which in turn makes its denomination NA
  gids_row <- match(gids_id, schools$VESTIGINGSNUMMER)
  gids_denominatie <- schools$DENOMINATIE[gids_row]

  ## Compute a dummy variable indicating Christian schools
  # 1 = Protestant or Roman Catholic, 0 = any other denomination, NA = unknown.
  # %in% never returns NA, so without the explicit NA step a guide with no
  # known denomination would be counted as non-Christian.
  christian <- as.integer(
    gids_denominatie %in% c("Protestants-Christelijk", "Rooms-Katholiek")
  )
  christian[is.na(gids_denominatie)] <- NA_integer_
  40.  
  # Flag documents in which none of the retained terms occur (row total of 0)
  empty <- rowSums(ddtm) == 0
  # Scale each document's term counts to a rate per 1000 counted terms by
  # dividing every row by its own total
  ndtm <- sweep(ddtm, 1, rowSums(ddtm), "/") * 1000
  44.  
  # Correlate the `christian` dummy variable with the normalised frequency of
  # every term, leaving out documents flagged as empty. drop = FALSE keeps the
  # term matrix two-dimensional even if only one document remains.
  cor_matrix <- cor(
    christian[!empty],
    ndtm[!empty, , drop = FALSE],
    use = "complete.obs"
  )

  # One row per term with its correlation; cor_matrix has a single row because
  # the first argument to cor() is a plain vector
  cor_df <- data.frame(
    term = colnames(cor_matrix),
    cor = cor_matrix[1, ],
    row.names = NULL,
    stringsAsFactors = FALSE
  )

  # Order by absolute correlation, strongest first
  cor_df <- cor_df[order(-abs(cor_df$cor)), ]
  53.  
  # Take the top 40 positive correlations
  # which() drops terms whose correlation is NA, and head() returns fewer rows
  # rather than padding with NA rows when fewer than 40 terms qualify
  pos_cor <- head(cor_df[which(cor_df$cor > 0), ], 40)
  ggplot(pos_cor, aes(reorder(term, cor), cor)) +
    geom_col() +
    coord_flip() +
    xlab("Terms") +
    ylab("Terms with Positive Correlation with CHRISTIAN")
  60.  
  # ... And the top 40 negative correlations
  # which() drops terms whose correlation is NA, and head() returns fewer rows
  # rather than padding with NA rows when fewer than 40 terms qualify
  neg_cor <- head(cor_df[which(cor_df$cor < 0), ], 40)
  ggplot(neg_cor, aes(reorder(term, cor), cor)) +
    geom_col() +
    coord_flip() +
    xlab("Terms") +
    ylab("Terms with Negative Correlation with CHRISTIAN")
Add Comment
Please, Sign In to add comment