Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(tm)
- library(SnowballC)
- library(stringr)
- library(ggplot2)
- ## RQ: What are the most commonly used words & the term frequencies in the 'Protestants-Christelijk' schools and 'Openbaar' schools?
- # Read in our school metadata
- schools <- read.csv("schools.csv",
- sep = ";",
- stringsAsFactors = FALSE)
- ### PDF corpus ###
- gids <- readRDS("/opt/duo/schoolgids2017v1_500.rds")
- gids_id <- substring(names(gids), 1, 6)
- ### Document pre-processing
- TurnSpace <- content_transformer(function(x, pattern) {return (gsub(pattern, " ", x))})
- gids <- tm_map(gids, TurnSpace, "/|â€|@|//|$|:|:)|*|&|!|?|_|-|â€|•|·|#|…+")
- gids <- tm_map(gids, content_transformer(tolower))
- gids <- tm_map(gids, stripWhitespace)
- gids <- tm_map(gids, removePunctuation)
- gids <- tm_map(gids, function(x) stemDocument(x, language = "dutch"))
- gids <- tm_map(gids, removeNumbers)
- gids <- tm_map(gids, removeWords, tm::stopwords("dutch"))
- ## Create a DocumentTermMatrix
- dtm <- DocumentTermMatrix(gids)
- ddtm <- as.matrix(removeSparseTerms(dtm, sparse = 0.90))
- ## Match the gids to rows in our schools table
- gids_row <- match(gids_id, schools$VESTIGINGSNUMMER)
- gids_denominatie <- schools$DENOMINATIE[gids_row]
- ## Compute a dummy variable indicating chrisitian schools
- christian <- as.integer(gids_denominatie %in% c("Protestants-Christelijk", "Rooms-Katholiek"))
- # Normalize by frequency per 1000 words
- ndtm <- ddtm / rowSums(ddtm) * 1000
- empty <- rowSums(ddtm) == 0
- # Compute a correlation matrix between our dummy variable and the term frequency
- cor_matrix <- cor(gids_score[!empty], ndtm[!empty, ], use="complete.obs")
- cor_df <- data.frame(term = colnames(cor_matrix), cor = cor_matrix[1,],
- row.names = NULL, stringsAsFactors = FALSE)
- # Order by absolute correlation
- cor_df <- cor_df[ order(-abs(cor_df$cor)), ]
- # Take top 30 positive correlations
- pos_cor <- cor_df[ cor_df$cor > 0, ][1:40, ]
- ggplot(pos_cor, aes(reorder(term, cor), cor)) +
- geom_col() +
- coord_flip() +
- xlab("Terms") + ylab("Terms with Positive Correlation with CHRISTIAN")
- # ... And top 30 negative correlations
- neg_cor <- cor_df[ cor_df$cor < 0, ][1:40, ]
- ggplot(neg_cor, aes(reorder(term, cor), cor)) +
- geom_col() +
- coord_flip() +
- xlab("Terms") + ylab("Terms with Negative Correlation with CORRELATION")
Add Comment
Please, Sign In to add comment