# Evaluation Metrics Demo using recommenderlab package in R

# Installing dependencies
# install.packages(c('ggplot2', 'recommenderlab', 'manipulate'))

# Importing Libraries
library(recommenderlab)
library(ggplot2)
library(manipulate)

# Load the MovieLense ratings data set (made available by recommenderlab)
data(MovieLense)

# Exploratory Data Analysis
# Printing the object gives a one-line summary of the rating matrix.
MovieLense

# 943 users x 1664 movies (= 1,569,152 cells) is far from a complete
# user-movie rating matrix: most cells are NA.
# Only 99,392 ratings are present, i.e. a density of roughly 6%
# (about 94% of the user-movie pairs have no rating).

# Ratings given by one particular user (a single row of the matrix).
# The data.frame form is "long": one row per rating, with the columns
# user, item and rating.
user_1_ratings <- as(MovieLense[1, ], "data.frame")
user_1_ratings
names(user_1_ratings)
nrow(user_1_ratings) # number of movies rated by user 1

user_2_ratings <- as(MovieLense[2, ], "data.frame")
nrow(user_2_ratings) # number of movies rated by user 2

# Ratings received by one particular item (a single column of the matrix)
movie_1_ratings <- as(MovieLense[, 1], "data.frame")
movie_1_ratings
names(movie_1_ratings)
nrow(movie_1_ratings) # number of users who rated movie 1

movie_2_ratings <- as(MovieLense[, 2], "data.frame")
nrow(movie_2_ratings) # number of users who rated movie 2

# How many movies did each user rate?
# x-axis: number of movies rated; y-axis: number of users whose count of
# rated movies falls into that x-axis bin.
# rowCounts(MovieLense) is deliberately left inline: hist() derives its default
# title and x-axis label from the expression it receives.
hist(rowCounts(MovieLense), breaks = 10)

# Same histogram with the number of breaks driven by an interactive slider
# (the manipulate package is meant to be used from within RStudio).
manipulate(
  hist(rowCounts(MovieLense), breaks = brk),
  brk = slider(min = 10, max = 400)
)

# Long-format copy of all ratings: one row per (user, item, rating)
df <- as(MovieLense, "data.frame")

# Average rating by a particular user, 942
mean(df[df$user == "942", "rating"])

# Average rating of every user, computed once and reused below
user_means <- tapply(df$rating, df$user, mean)
user_means
user_means[["942"]] # should match the value computed directly above

# Plotting the averages of all users: in original order, then sorted
barplot(user_means)
barplot(sort(user_means, decreasing = TRUE))

# Average rating for a particular movie: the one in the first row of df.
# The label is extracted as a plain string so that the lookups below are
# by name; indexing with a factor would use its integer codes instead.
first_item <- as.character(df$item[1])
mean(df[df$item == first_item, "rating"])

# Average rating of every movie, computed once and reused below
item_means <- tapply(df$rating, df$item, mean)
item_means
item_means[[first_item]] # should match the value computed directly above

# Plotting the averages of all the movies: in original order, then sorted
barplot(item_means)
barplot(sort(item_means, decreasing = TRUE))

# User 849 - high rating. User 3 - low rating. Let's verify that.
user_means[["849"]]
user_means[["3"]]

# Histogram of the raw ratings
ratings_df <- as(MovieLense, "data.frame")
g <- ggplot(ratings_df, aes(x = rating)) +
  geom_histogram() # demo note: try passing binwidth = 1
g

# Histogram of normalized ratings. Normalization method is Z-score.
# The normalized data frame is built once and reused for the summary below.
normalized_df <- as(normalize(MovieLense, method = "Z-score"), "data.frame")
g <- ggplot(normalized_df, aes(x = rating)) +
  geom_histogram() # demo note: try binwidths from 0.1 to 0.5
g

# Histogram of the average rating per movie.
# Built with ggplot() + geom_histogram() because qplot() is deprecated in
# ggplot2; the x-axis label is set to match what qplot() displayed.
movie_means <- data.frame(avg_rating = colMeans(MovieLense))
ggplot(movie_means, aes(x = avg_rating)) +
  geom_histogram(binwidth = 0.1) +
  xlab("colMeans(MovieLense)")

# Z-score normalization centers and rescales the ratings; it does NOT turn
# them into a normal distribution. The summary lets us check that the
# normalized ratings are centered around 0.
# NOTE(review): normalization is presumably applied per user (row) - confirm.
summary(normalized_df$rating)

# Let's get recommending!
# List the recommender algorithms registered for real-valued rating matrices.
recommenderRegistry$get_entries(dataType = "realRatingMatrix")
# We have a few options

# Let's check some algorithms against each other.
# Evaluation scheme: one (k = 1) random 90/10 train/test split; for each test
# user 10 ratings are given to the recommender and the rest are held out;
# ratings of 4 or more count as "good" when scoring the top-N lists.
scheme <- evaluationScheme(
  MovieLense,
  method = "split",
  train = 0.9,
  k = 1,
  given = 10,
  goodRating = 4
)

scheme

# Algorithms to compare, all using Z-score normalization.
# `minRating` is intentionally not passed to UBCF: it was dropped from the
# algorithm's parameter list in later recommenderlab releases, where unknown
# parameters are rejected and the algorithm would be removed from the results.
algorithms <- list(
  "random items" = list(name = "RANDOM", param = list(normalize = "Z-score")),
  "popular items" = list(name = "POPULAR", param = list(normalize = "Z-score")),
  "user-based CF" = list(
    name = "UBCF",
    param = list(normalize = "Z-score", method = "Cosine", nn = 50)
  ),
  "item-based CF" = list(name = "IBCF", param = list(normalize = "Z-score"))
)

# Run every algorithm and evaluate top-N recommendation lists for several N
results <- evaluate(scheme, algorithms, n = c(1, 3, 5, 10, 15, 20))

# Draw ROC curve (annotate all four algorithms)
plot(results, annotate = 1:4, legend = "topleft")

# See precision / recall
plot(results, "prec/rec", annotate = 1:4)