# Untitled
#
# Builds a stemmed, tf-idf weighted document-feature matrix from the speech
# texts in `dataraw_nc` and fits one spherical k-means partition for each
# number of clusters from 5 to 20.

pacman::p_load(quanteda, magrittr, skmeans)

# NOTE(review): hard-coded, machine-specific working directory; the script
# only runs unchanged on a machine where this path exists.
setwd("/Users/James/Documents/University/rfiles")
# load() restores saved objects into the workspace. `dataraw_nc` is assumed to
# be one of them because it is used directly below -- confirm.
load("data_nc.R")

# Keep only the speech text as a character vector, then drop the source object.
text <- as.character(dataraw_nc$speech)
rm(dataraw_nc)

# Delete right single quotation marks (curly apostrophes). The pattern is a
# literal character, so match it as a fixed string rather than as a regex.
text <- gsub("’", "", text, fixed = TRUE)

# Document-feature matrix: lower-cased, stemmed, with English stopwords plus
# "will" and "hon" removed (matched literally via valuetype = "fixed"), and
# with numbers, punctuation, separators and symbols stripped.
dtm <- dfm(
  text,
  tolower = TRUE,
  stem = TRUE,
  remove = c(stopwords("english"), "will", "hon"),
  valuetype = "fixed",
  verbose = TRUE,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_separators = TRUE,
  remove_symbols = TRUE
)

# The raw text is no longer needed once the matrix exists.
rm(text)

# Trim features using a minimum count of 6, then apply tf-idf weighting.
dtm <- dfm_trim(dtm, min_count = 6, verbose = TRUE) %>%
  dfm_weight(type = "tfidf")

# Drop documents whose row sum is zero after trimming and weighting.
# NOTE(review): presumably required because skmeans cannot handle all-zero
# rows -- confirm against the skmeans documentation.
dtm <- dtm[rowSums(dtm) > 0, ]

# Fit one skmeans solution per k in 5..20; element i of `clust_sk` holds the
# fit for k = i + 4.
# NOTE(review): no seed is set; if skmeans uses random starting prototypes the
# partitions will differ between runs -- confirm and consider set.seed().
clust_sk <- lapply(
  5:20,
  function(k) {
    skmeans(dtm, k, method = "pclust", control = list(verbose = TRUE))
  }
)