Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###########################
- # ALDA: hw2.R
- # Instructor: Dr. Thomas Price
- # Mention your team details here
- #
- #
- #
- #
- ############################
- require(caret)
- require(rpart)
calculate_distance_matrix <- function(train_matrix, test_matrix, method_name) {
  # Compute a pairwise distance/similarity value between every test row and
  # every training row.
  #
  # NOTE: This helper was supplied with the assignment. Results for non-empty
  # inputs are the same as the supplied implementation; only the handling of
  # zero-row inputs and the formatting were changed.
  #
  # INPUT:
  #   train_matrix: matrix (or data frame) with one row per training sentence
  #                 and one column per feature.
  #   test_matrix:  matrix (or data frame) with one row per test sentence and
  #                 the same feature columns as train_matrix.
  #   method_name:  string naming the pairwise function to apply, e.g.
  #                 "calculate_euclidean" or "calculate_cosine". The function
  #                 is called as f(test_row, train_row).
  # OUTPUT:
  #   A double matrix with nrow(test_matrix) rows and nrow(train_matrix)
  #   columns; entry [i, j] is the value of method_name for test row i and
  #   training row j.
  n_test <- nrow(test_matrix)
  n_train <- nrow(train_matrix)
  distance_matrix <- matrix(0, nrow = n_test, ncol = n_train)

  # seq_len() (rather than seq(1, n)) keeps the loops from running when an
  # input has zero rows.
  for (i in seq_len(n_test)) {
    # unlist() flattens a one-row data frame into a plain vector.
    test_row <- unlist(test_matrix[i, ])
    for (j in seq_len(n_train)) {
      distance_matrix[i, j] <- do.call(
        method_name,
        list(test_row, unlist(train_matrix[j, ]))
      )
    }
  }
  distance_matrix
}
calculate_euclidean <- function(p, q) {
  # Euclidean (L2) distance between two numeric vectors.
  #
  # INPUT:
  #   p, q: numeric vectors of equal length, each representing one row
  #         (i.e., one sentence) of the dataset.
  # OUTPUT:
  #   A single double: sqrt(sum((p - q)^2)).
  #
  # The length check stops R from silently recycling the shorter vector.
  stopifnot(length(p) == length(q))
  sqrt(sum((p - q)^2))
}
calculate_cosine <- function(p, q) {
  # Cosine similarity between two numeric vectors.
  #
  # INPUT:
  #   p, q: numeric vectors of equal length, each representing one row
  #         (i.e., one sentence) of the dataset.
  # OUTPUT:
  #   A single double: (p . q) / (||p|| * ||q||). This is a similarity, so
  #   larger values mean the vectors are closer. If either vector is all
  #   zeros the result is NaN (0 / 0).
  #
  # sum(p * q) is used instead of p %*% q so the result is a plain scalar
  # rather than a 1 x 1 matrix; the length check preserves the error that
  # %*% raised on mismatched lengths.
  stopifnot(length(p) == length(q))
  sum(p * q) / (sqrt(sum(p^2)) * sqrt(sum(q^2)))
}
knn_classifier <- function(x_train, y_train, x_test, distance_method, k) {
  # K-nearest-neighbour classifier built on calculate_distance_matrix().
  #
  # For every test row, the k closest training rows are found and the most
  # frequent class among them is returned.
  #
  # INPUT:
  #   x_train: TF-IDF matrix (number_training_sentences x number_features)
  #   y_train: class labels, length number_training_sentences (factor; other
  #            vectors are converted with factor())
  #   x_test:  TF-IDF matrix (number_test_sentences x number_features)
  #   distance_method: "calculate_euclidean" or "calculate_cosine". The
  #            historical misspelling "calcualte_euclidean" is accepted as an
  #            alias for "calculate_euclidean".
  #   k:       number of neighbours to consult; capped at the number of
  #            training rows.
  # OUTPUT:
  #   A factor of predictions, one per row of x_test, with the same levels as
  #   y_train.
  #
  # NOTE 1: The data is not normalized before distances are computed.
  # NOTE 2: Cosine is a similarity, so the k LARGEST values are the nearest
  #         neighbours; Euclidean is a distance, so the k SMALLEST are.
  # NOTE 3: Vote ties go to the class with the lower numerical value (e.g. in
  #         5-NN with two votes each for classes 1 and 2, class 1 wins). When
  #         the labels are not numeric, the earlier factor level wins.
  # NOTE 4: No predefined knn packages are used; base R only.
  if (distance_method %in% c("calculate_euclidean", "calcualte_euclidean")) {
    method_name <- "calculate_euclidean"
    higher_is_closer <- FALSE
  } else if (identical(distance_method, "calculate_cosine")) {
    method_name <- "calculate_cosine"
    higher_is_closer <- TRUE
  } else {
    stop("Unknown distance_method: ", distance_method, call. = FALSE)
  }

  if (!is.factor(y_train)) {
    y_train <- factor(y_train)
  }
  n_train <- nrow(x_train)
  stopifnot(n_train >= 1, length(y_train) == n_train, k >= 1)
  k <- min(as.integer(k), n_train)

  # Rows index test sentences, columns index training sentences.
  dist_mat <- calculate_distance_matrix(x_train, x_test, method_name)

  class_levels <- levels(y_train)
  class_codes <- as.integer(y_train)

  # Tie-break priority: numeric value of the label when every level parses as
  # a number, otherwise the position of the level.
  numeric_levels <- suppressWarnings(as.numeric(class_levels))
  tie_priority <- if (anyNA(numeric_levels)) {
    seq_along(class_levels)
  } else {
    numeric_levels
  }

  predicted_codes <- vapply(
    seq_len(nrow(dist_mat)),
    function(i) {
      # order() returns distinct column indices even when several training
      # rows are at exactly the same distance.
      nearest <- order(dist_mat[i, ], decreasing = higher_is_closer)[seq_len(k)]
      votes <- tabulate(class_codes[nearest], nbins = length(class_levels))
      tied <- which(votes == max(votes))
      tied[which.min(tie_priority[tied])]
    },
    integer(1)
  )

  factor(class_levels[predicted_codes], levels = class_levels)
}
dtree <- function(x_train, y_train, x_test) {
  # Fit a CART decision tree (rpart, Gini splitting) on the training data and
  # predict class labels for the test data.
  #
  # INPUT:
  #   x_train: TF-IDF matrix (number_training_sentences x number_features)
  #   y_train: class labels, length number_training_sentences (factor)
  #   x_test:  TF-IDF matrix (number_test_sentences x number_features)
  # OUTPUT:
  #   A factor of predicted class labels, one per row of x_test.
  #
  # Allowed packages: rpart, R base, utils.
  set.seed(123)

  # rpart needs a data frame. The label is added as a column afterwards
  # because cbind() on a matrix would turn the factor into integer codes.
  # x_test goes through the same data.frame() conversion so its column names
  # line up with the ones the tree was trained on.
  train_df <- data.frame(x_train)
  test_df <- data.frame(x_test)
  train_df$.target <- as.factor(y_train)

  # The splitting criterion is passed via `parms`; rpart rejects a bare
  # `split` argument.
  fit <- rpart(
    .target ~ .,
    data = train_df,
    method = "class",
    parms = list(split = "gini")
  )

  # type = "class" returns the predicted label rather than class
  # probabilities.
  unname(predict(fit, newdata = test_df, type = "class"))
}
dtree_cv <- function(x_train, y_train, x_test, n_folds) {
  # Fit a decision tree whose parameters are tuned with n-fold cross
  # validation on the *training* data (caret::train with method "rpart"),
  # then predict class labels for the test data with the tuned model.
  #
  # INPUT:
  #   x_train: TF-IDF matrix (number_training_sentences x number_features)
  #   y_train: class labels, length number_training_sentences (factor)
  #   x_test:  TF-IDF matrix (number_test_sentences x number_features)
  #   n_folds: integer, number of folds for cross validation
  # OUTPUT:
  #   A factor of predicted class labels, one per row of x_test.
  #
  # Allowed packages: rpart, caret, R base, utils.
  set.seed(123)

  # Both inputs go through the same data.frame() conversion so the predictor
  # names seen at prediction time match the ones used for training.
  train_df <- data.frame(x_train)
  test_df <- data.frame(x_test)

  # The resampling method is the string "cv"; the result is stored under a
  # name that does not shadow the trainControl() function.
  cv_control <- trainControl(method = "cv", number = n_folds)

  # train() (not rpart()) understands method = "rpart" and trControl; extra
  # arguments such as `parms` are forwarded to rpart.
  fit <- train(
    x = train_df,
    y = as.factor(y_train),
    method = "rpart",
    trControl = cv_control,
    parms = list(split = "gini")
  )

  # type = "raw" yields predicted class labels for a caret classifier.
  predict(fit, newdata = test_df, type = "raw")
}
calculate_accuracy <- function(y_pred, y_true) {
  # Build a confusion matrix and the overall accuracy for a set of
  # predictions.
  #
  # INPUT:
  #   y_pred: predicted class labels (vector, each value of type factor)
  #   y_true: ground truth class labels (vector, each value of type factor)
  # OUTPUT:
  #   A list in the following order: [confusion matrix, overall accuracy].
  #   - The confusion matrix is of class "table", with Prediction on the rows
  #     (left) and Reference on the columns (top).
  #   - Overall accuracy is a double on a 0-1 scale: the share of predictions
  #     that equal the ground truth. It is NA when the inputs are empty.
  stopifnot(length(y_pred) == length(y_true))

  # Use one shared set of levels so the table is square and its diagonal
  # lines up even if a class never appears among the predictions.
  class_levels <- union(
    levels(as.factor(y_true)),
    levels(as.factor(y_pred))
  )
  confusion <- table(
    Prediction = factor(y_pred, levels = class_levels),
    Reference = factor(y_true, levels = class_levels)
  )

  n_total <- length(y_true)
  accuracy <- if (n_total > 0) {
    # Diagonal cells are the cases where prediction and reference agree.
    sum(diag(confusion)) / n_total
  } else {
    NA_real_
  }

  list(confusion, accuracy)
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement