Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Installing dependencies
- # install.packages(c('ggplot2', 'recommenderlab', 'manipulate'))
- # Importing Libraries
- library(recommenderlab)
- library(ggplot2)
- library(manipulate)
- # Importing dataset
- # data() loads the MovieLense object (presumably shipped with recommenderlab,
- # which is attached above) that the rest of the script inspects.
- data(MovieLense)
- # Exploratory Data Analysis
- # Evaluating the object at top level prints its summary.
- MovieLense
- # 943 users * 1664 movies = 1,569,152 possible ratings, but this isn't a complete user-movie rating matrix: most cells are NA.
- # There are only 99,392 ratings, so roughly 6% of the matrix is filled (i.e. it is about 94% sparse).
- # Ratings given by one user (a row of the rating matrix)
- user1_ratings <- as(MovieLense[1, ], "data.frame")
- user1_ratings # Explain the result
- names(user1_ratings)
- nrow(user1_ratings)
- # Number of ratings given by the second user, for comparison
- nrow(as(MovieLense[2, ], "data.frame"))
- # Ratings received by one item (a column of the rating matrix)
- item1_ratings <- as(MovieLense[, 1], "data.frame")
- item1_ratings
- names(item1_ratings)
- nrow(item1_ratings)
- # Number of ratings received by the second item, for comparison
- nrow(as(MovieLense[, 2], "data.frame"))
- # How many movies did users rate, on average?
- # Count the ratings per user once and reuse the vector for both plots.
- movies_per_user <- rowCounts(MovieLense)
- # Axes are labelled explicitly so the plot is readable on its own:
- # x = number of movies rated, y = number of users falling in that range.
- hist(movies_per_user, breaks = 10,
- main = "Movies rated per user",
- xlab = "Number of movies rated", ylab = "Number of users")
- # Same histogram with a variable number of breaks, chosen via a slider.
- # NOTE(review): manipulate() is expected to work only inside RStudio - confirm before running elsewhere.
- manipulate(
- hist(movies_per_user, breaks = brk,
- main = "Movies rated per user",
- xlab = "Number of movies rated", ylab = "Number of users"),
- brk = slider(10, 400)
- )
- # Long-format view of the ratings with columns user, item and rating
- df <- as(MovieLense, "data.frame")
- # Per-user and per-movie mean ratings, computed once and reused below
- user_means <- tapply(df$rating, df$user, mean)
- item_means <- tapply(df$rating, df$item, mean)
- # Movie found in the first row of df. as.character() makes the lookups below
- # go by name whether df$item is a factor or a character column.
- first_movie <- as.character(df$item[1])
- # Average rating by a particular user, 942
- mean(df[df$user == 942, "rating"])
- # Average of all the users
- user_means
- user_means[["942"]]
- # Plotting the averages of all users
- barplot(user_means)
- barplot(sort(user_means, decreasing = TRUE))
- # Average rating for a particular movie: the one in the first row of df
- mean(df[df$item == first_movie, "rating"])
- # Average rating for all the movies
- item_means
- item_means[[first_movie]]
- # Plotting the averages of all the movies
- barplot(item_means)
- barplot(sort(item_means, decreasing = TRUE))
- # User 849 - high rating. User 3 - low rating. Let's verify that.
- user_means[["849"]]
- user_means[["3"]]
- # Histogram of Ratings
- # An explicit binwidth replaces the default of 30 bins (which also triggers a
- # stat_bin message). binwidth = 1 assumes ratings are whole numbers - TODO confirm.
- ratings_long <- as(MovieLense, "data.frame")
- g <- ggplot(ratings_long, aes(x = rating))
- g <- g + geom_histogram(binwidth = 1)
- g
- # Histogram of normalized ratings. Normalization method is Z-score.
- # The normalized data frame is built once and reused for the summary below.
- normalized_long <- as(normalize(MovieLense, method = "Z-score"), "data.frame")
- g <- ggplot(normalized_long, aes(x = rating))
- g <- g + geom_histogram(binwidth = 0.25) # try varying binwidth from 0.1 to 0.5
- g
- # Histogram of average ratings per movie (column means of the rating matrix).
- # Written with ggplot() because qplot() is deprecated in current ggplot2.
- movie_means <- data.frame(avg_rating = colMeans(MovieLense))
- ggplot(movie_means, aes(x = avg_rating)) + geom_histogram(binwidth = 0.1)
- # Z-score normalization centres and rescales the ratings; it does not by
- # itself turn them into a normal distribution. Inspect the result with summary:
- summary(normalized_long$rating)
- # Let's get recommending!
- # List the recommender algorithms registered for real-valued rating matrices
- recommenderRegistry$get_entries(dataType = "realRatingMatrix")
- # We have a few options
- # Let's check some algorithms against each other
- # Fix the RNG seed so the random 90/10 train/test split (and any randomized
- # algorithm) gives the same results on every run.
- set.seed(42)
- scheme <- evaluationScheme(MovieLense, method = "split", train = 0.9, k = 1, given = 10, goodRating = 4)
- scheme
- # Algorithms to compare; each entry names a registered method and its parameters.
- # NOTE(review): minRating may not be an accepted UBCF parameter in current
- # recommenderlab releases - confirm against the installed version.
- algorithms <- list(
- "random items" = list(name = "RANDOM", param = list(normalize = "Z-score")),
- "popular items" = list(name = "POPULAR", param = list(normalize = "Z-score")),
- "user-based CF" = list(name = "UBCF", param = list(normalize = "Z-score", method = "Cosine", nn = 50, minRating = 3)),
- "item-based CF" = list(name = "IBCF", param = list(normalize = "Z-score"))
- )
- # run algorithms, predict next n movies
- results <- evaluate(scheme, algorithms, n = c(1, 3, 5, 10, 15, 20))
- # Draw ROC curve; annotate every algorithm instead of hard-coding 1:4
- plot(results, annotate = seq_along(algorithms), legend = "topleft")
- # See precision / recall
- plot(results, "prec/rec", annotate = seq_along(algorithms))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement