Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Packages used by this script ----
# (The leading "- " list markers from the paste were removed: with them each
# line applied a unary minus to the value returned by library().)
library(tidyverse)    # dplyr / tidyr / ggplot2 verbs and the %>% pipe
library(tidytext)     # tidy() for the LDA fit, unnest_tokens()
library(tm)           # Corpus(), VectorSource(), DocumentTermMatrix()
library(topicmodels)  # LDA()
# Inspect the review data. `datsub` is not created anywhere in this script,
# so it has to exist in the session before this line runs.
glimpse(datsub)
# Console output that was pasted with the script, kept as comments so the
# file still parses:
#> Observations: 14,108
#> Variables: 6
#> $ Product.Name <chr> "iphone 4s", "iphone 4s", "iphone 4s", "iphone 4s", "iphone 4s", "iphone 4...
#> $ Brand.Name <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""...
#> $ Price <dbl> 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115,...
#> $ Rating <int> 5, 1, 4, 5, 5, 3, 5, 5, 5, 1, 5, 5, 1, 5, 2, 5, 5, 4, 5, 1, 4, 1, 1, 1, 4,...
#> $ Reviews <chr> "new great price phone really quick great seller", "star product false adv...
#> $ Review.Votes <int> 2, 1, 0, 1, 2, 2, 2, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
# Wrap the review texts in a tm corpus (VectorSource: one document per
# element of the Reviews column).
corpus <- Corpus(VectorSource(datsub[["Reviews"]]))

# Document-term matrix built from that corpus.
dtm <- DocumentTermMatrix(corpus)

# Fit a 4-topic LDA model with the Gibbs sampler; the seed is fixed through
# `control` so the fit can be reproduced.
ap_lda <- LDA(dtm, k = 4, method = "Gibbs", control = list(seed = 1))
# Tidy the "beta" matrix of the fitted model into a data frame with the
# columns topic, term and beta.
ap_topics <- tidy(ap_lda, matrix = "beta")

# Within each topic keep the 5 terms with the largest beta, then sort by
# topic and descending beta. The result is still grouped by topic.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(n = 5, wt = beta) %>%
  arrange(topic, desc(beta))
# Horizontal bar chart of the top terms, one facet per topic.
#
# reorder_within() / scale_x_reordered() (tidytext) sort the bars inside each
# facet separately. A plain reorder(term, beta) builds a single factor shared
# by every facet, so a term that is a top term in more than one topic cannot
# be placed correctly in all of them.
ap_top_terms %>%
  ggplot(
    aes(reorder_within(term, beta, topic), beta, fill = factor(topic))
  ) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_x_reordered() +  # strips the per-facet suffix from the axis labels
  labs(x = "terms") +
  coord_flip()
# One row per token of every review. The token column is named `term`
# directly so it can be joined against the `term` column of the LDA output.
cleaned_reviews <- datsub %>%
  unnest_tokens(output = term, input = Reviews)
# The 10 highest-beta terms of topic 1, sorted by descending beta
# (top_n keeps ties, so more than 10 rows are possible).
top_terms1 <- ap_topics %>%
  filter(topic == 1) %>%
  arrange(desc(beta)) %>%
  top_n(n = 10, wt = beta)
# Product-by-term weight table for the topic-1 top terms.
#
# For every product:
#   n      = how often the term occurs in that product's reviews
#   sum(n) = occurrences of all topic-1 top terms for that product
#   weight = (n / sum(n)) * beta
# The result has one row per product and one column per term, and is
# returned ungrouped so later steps do not inherit the Product.Name grouping.
merged_dat1 <- top_terms1 %>%
  left_join(cleaned_reviews, by = "term") %>%
  count(Product.Name, term, beta) %>%
  group_by(Product.Name) %>%
  mutate(weight = n / sum(n) * beta) %>%
  ungroup() %>%
  select(Product.Name, term, weight) %>%
  spread(term, weight)
# Inspect the product-by-term weight table.
glimpse(merged_dat1)
merged_dat1
# Console output that was pasted with the script, kept as comments so the
# file still parses. It shows merged_dat1 as a tibble grouped by Product.Name:
#> # A tibble: 4 x 11
#> # Groups: Product.Name [4]
#> Product.Name camera fast feature good love low nice picture price quality
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 BLU Studio 5.0 0.00990 0.000747 0.000970 0.208 0.000201 0.000686 0.00428 0.00205 0.0115 0.00632
#> 2 iphone 4s 0.00121 0.000626 0.000247 0.327 0.000204 0.000554 0.00625 0.000549 0.00588 0.00303
#> 3 Motorola Moto E 0.0129 0.000573 0.00128 0.227 0.0000610 0.000981 0.00376 0.00162 0.0139 0.00451
#> 4 Samsung Galaxy II 0.00520 0.000432 0.00150 0.277 0.000182 0.000541 0.00651 0.00117 0.00881 0.00305
# Keep only the numeric columns of a product-by-term table, as a matrix.
# dist() must not see the character Product.Name column: as.matrix() would
# turn the whole table into character, the labels would be coerced to NA
# (with a warning), and dist() would then rescale every Euclidean distance
# to compensate for the "missing" column.
numeric_weights <- function(dat) {
  dat <- as.data.frame(dat)
  as.matrix(dat[vapply(dat, is.numeric, logical(1))])
}

# Pairwise Euclidean distances between products, as full symmetric matrices.
# (`upper` / `diag` were dropped: they only affect how a dist object prints.)
dist1 <- as.matrix(dist(numeric_weights(merged_dat1), method = "euclidean"))
# NOTE(review): merged_dat2 is not defined anywhere in this script.
# Presumably it is built like merged_dat1 for another topic, with the
# products in the same row order. Confirm before running.
dist2 <- as.matrix(dist(numeric_weights(merged_dat2), method = "euclidean"))

# Classical MDS down to a single dimension for each distance matrix.
mds1 <- cmdscale(dist1, k = 1, eig = TRUE, x.ret = TRUE)
mds2 <- cmdscale(dist2, k = 1, eig = TRUE, x.ret = TRUE)

# One-column coordinate matrices, one row per product.
dim1 <- mds1$points
dim2 <- mds2$points
# Scatter plot of the two one-dimensional MDS solutions, one point per
# product. The product label is carried inside the plotted data frame rather
# than pulled from merged_dat1 within aes().
# Rows of dim1 / dim2 are assumed to follow the row order of merged_dat1.
# The y-axis label previously read "dim 1" although the y aesthetic is dim2.
data.frame(dim1, dim2, product = merged_dat1$Product.Name) %>%
  ggplot(aes(x = dim1, y = dim2, color = product)) +
  geom_point() +
  labs(x = "dim 1", y = "dim 2", color = "Brand") +
  theme_minimal()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement