Advertisement
Guest User

Untitled

a guest
Feb 19th, 2019
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.56 KB | None | 0 0
  1. ###########################
  2. # ALDA: hw2.R
  3. # Instructor: Dr. Thomas Price
  4. # Mention your team details here
  5. #
  6. #
  7. #
  8. #
  9. ############################
  10.  
  11. require(caret)
  12. require(rpart)
  13.  
  14.  
  calculate_distance_matrix <- function(train_matrix, test_matrix, method_name) {
    # Compute the pairwise distance/similarity between every test row and
    # every training row.
    #
    # INPUT:
    # train_matrix: matrix or data frame, one row per training sentence and
    #   one column per feature.
    # test_matrix: matrix or data frame with the same columns, one row per
    #   test sentence.
    # method_name: string naming a two-argument function (for example
    #   "calculate_euclidean" or "calculate_cosine"); it is invoked through
    #   do.call() once per (test row, train row) pair.
    #
    # OUTPUT:
    # A double matrix with nrow(test_matrix) rows and nrow(train_matrix)
    # columns; entry [i, j] is the value method_name returns for test row i
    # and training row j.

    n_test <- nrow(test_matrix)
    n_train <- nrow(train_matrix)
    distance_matrix <- matrix(0, nrow = n_test, ncol = n_train)

    # seq_len() gives an empty sequence when there are no rows, whereas
    # seq(1, 0) counts down and would index rows that do not exist.
    for (i in seq_len(n_test)) {
      test_row <- unlist(test_matrix[i, ])
      for (j in seq_len(n_train)) {
        distance_matrix[i, j] <- do.call(
          method_name,
          list(test_row, unlist(train_matrix[j, ]))
        )
      }
    }
    distance_matrix
  }
  41.  
  calculate_euclidean <- function(p, q) {
    # Euclidean distance between two numeric vectors.
    #
    # INPUT:
    # p, q: numeric vectors of equal length, each representing one row
    #   (i.e., one sentence) of the dataset.
    #
    # OUTPUT:
    # A single double: sqrt(sum((p - q)^2)).

    # Without this check R would silently recycle the shorter vector.
    if (length(p) != length(q)) {
      stop("p and q must have the same length", call. = FALSE)
    }
    sqrt(sum((p - q)^2))
  }
  48.  
  calculate_cosine <- function(p, q) {
    # Cosine similarity between two numeric vectors.
    #
    # INPUT:
    # p, q: numeric vectors of equal length, each representing one row
    #   (i.e., one sentence) of the dataset.
    #
    # OUTPUT:
    # A single double: the dot product of p and q divided by the product of
    # their Euclidean norms. This is a similarity (larger means closer),
    # not a distance. The result is NaN when either vector has zero norm.

    if (length(p) != length(q)) {
      stop("p and q must have the same length", call. = FALSE)
    }
    # sum(p * q) yields a plain scalar; p %*% q would yield a 1 x 1 matrix.
    sum(p * q) / (sqrt(sum(p^2)) * sqrt(sum(q^2)))
  }
  55.  
  knn_classifier <- function(x_train, y_train, x_test, distance_method, k) {
    # K-nearest-neighbour classifier built on calculate_distance_matrix().
    #
    # For every test row, the k closest training rows are found and the most
    # frequent class among them is returned.
    #
    # INPUT:
    # x_train: TF-IDF matrix (number_training_sentences x number_features)
    # y_train: class labels of the training rows (factor, or coercible to one)
    # x_test: TF-IDF matrix (number_test_sentences x number_features)
    # distance_method: "calculate_euclidean" or "calculate_cosine". The
    #   misspelling "calcualte_euclidean" that this file used previously is
    #   still accepted and treated as Euclidean.
    # k: number of neighbours that vote; capped at the number of training rows.
    #
    # OUTPUT:
    # A factor of length nrow(x_test) whose levels are the levels of y_train.
    #
    # NOTES:
    # - The data is not normalised before the distances are computed.
    # - Cosine is a similarity, so its nearest neighbours are the k LARGEST
    #   values; for Euclidean they are the k SMALLEST values.
    # - Vote ties go to the earliest level of y_train, which is the lowest
    #   class when the labels were created from numeric values.
    # - No predefined knn packages are used; only base R.

    y_train <- as.factor(y_train)
    stopifnot(
      length(y_train) >= 1L,
      length(y_train) == nrow(x_train),
      is.numeric(k), length(k) == 1L, k >= 1,
      is.character(distance_method), length(distance_method) == 1L
    )

    if (distance_method == "calcualte_euclidean") {
      distance_method <- "calculate_euclidean"
    }
    if (!distance_method %in% c("calculate_euclidean", "calculate_cosine")) {
      stop("unknown distance_method: ", distance_method, call. = FALSE)
    }
    nearest_is_largest <- distance_method == "calculate_cosine"
    k <- min(as.integer(k), length(y_train))

    dist_mat <- calculate_distance_matrix(x_train, x_test, distance_method)
    label_codes <- as.integer(y_train)
    n_classes <- nlevels(y_train)

    winners <- vapply(seq_len(nrow(dist_mat)), function(i) {
      # order() returns distinct positions even when several training rows
      # are equally far away; looking values up with match() would return
      # the same position for every tied value.
      neighbours <- order(dist_mat[i, ], decreasing = nearest_is_largest)
      votes <- tabulate(label_codes[neighbours[seq_len(k)]], nbins = n_classes)
      # which.max() returns the first maximum, i.e. the earliest level.
      which.max(votes)
    }, integer(1))

    factor(levels(y_train)[winners], levels = levels(y_train))
  }
  113.  
  dtree <- function(x_train, y_train, x_test) {
    # Fit a CART classification tree (Gini splits) with rpart on the training
    # data and predict a class for every row of x_test.
    #
    # INPUT:
    # x_train: TF-IDF matrix (number_training_sentences x number_features)
    # y_train: class labels of the training rows (factor, or coercible to one)
    # x_test: TF-IDF matrix (number_test_sentences x number_features), with
    #   the same columns as x_train
    #
    # OUTPUT:
    # A factor of predicted classes, one per row of x_test.
    #
    # Allowed packages: rpart, R base, utils
    set.seed(123)
    stopifnot(nrow(x_train) == length(y_train))

    # rpart needs a data frame, not a matrix. Both sets go through the same
    # name sanitising so training and test columns carry identical names.
    train_df <- data.frame(x_train, check.names = TRUE)
    test_df <- data.frame(x_test, check.names = TRUE)
    # Keep the response a factor; cbind() with the feature matrix would
    # reduce it to its integer codes.
    train_df$.class_label <- as.factor(y_train)

    # The splitting criterion belongs in `parms`; a bare `split =` argument
    # is rejected by rpart().
    fit <- rpart::rpart(
      .class_label ~ .,
      data = train_df,
      method = "class",
      parms = list(split = "gini")
    )

    # type = "class" returns labels; the default returns class probabilities.
    unname(stats::predict(fit, newdata = test_df, type = "class"))
  }
  140.  
  141.  
  dtree_cv <- function(x_train, y_train, x_test, n_folds) {
    # Fit an rpart decision tree whose parameters are tuned by n-fold
    # cross-validation on the training data (via caret), then predict a
    # class for every row of x_test with the tuned model.
    #
    # INPUT:
    # x_train: TF-IDF matrix (number_training_sentences x number_features)
    # y_train: class labels of the training rows (factor, or coercible to one)
    # x_test: TF-IDF matrix (number_test_sentences x number_features), with
    #   the same columns as x_train
    # n_folds: integer, number of folds for the cross-validation
    #
    # OUTPUT:
    # A factor of predicted classes, one per row of x_test.
    #
    # Allowed packages: rpart, caret, R base, utils
    set.seed(123)
    stopifnot(
      nrow(x_train) == length(y_train),
      is.numeric(n_folds), length(n_folds) == 1L, n_folds >= 2
    )

    # Same name sanitising on both sets so the columns line up at predict time.
    train_df <- data.frame(x_train, check.names = TRUE)
    test_df <- data.frame(x_test, check.names = TRUE)

    # The resampling method is the string "cv"; the control object gets its
    # own name so it does not shadow caret::trainControl.
    cv_control <- caret::trainControl(method = "cv", number = n_folds)

    # caret::train performs the cross-validated tuning; extra arguments such
    # as `parms` are forwarded to rpart().
    fit <- caret::train(
      x = train_df,
      y = as.factor(y_train),
      method = "rpart",
      trControl = cv_control,
      parms = list(split = "gini")
    )

    unname(stats::predict(fit, newdata = test_df, type = "raw"))
  }
  168.  
  169.  
  calculate_accuracy <- function(y_pred, y_true) {
    # Build the confusion matrix and the overall accuracy of a set of
    # predictions.
    #
    # INPUT:
    # y_pred: predicted class labels (vector, each value of type factor)
    # y_true: ground truth class labels (vector, each value of type factor)
    #
    # OUTPUT:
    # A list in the following order: [confusion matrix, overall accuracy].
    # The confusion matrix is of class "table" with Prediction on the rows
    # (left) and Reference on the columns (top). The overall accuracy is a
    # double on a scale of 0-1: the number of matching labels divided by
    # length(y_true).
    stopifnot(length(y_pred) == length(y_true))

    # Give both vectors the same level set so the table is square and its
    # diagonal holds exactly the correct predictions, even when a class was
    # never predicted or never occurs in the reference.
    class_levels <- union(levels(as.factor(y_true)), levels(as.factor(y_pred)))
    pred <- factor(as.character(y_pred), levels = class_levels)
    ref <- factor(as.character(y_true), levels = class_levels)

    confusion <- table(Prediction = pred, Reference = ref)
    accuracy <- sum(diag(confusion)) / length(ref)

    list(confusion, accuracy)
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement