# Data Preprocessing Template

rm(list = ls())

# Importing the dataset

dataset <- read.csv(file.path(getwd(), 'Data/Data.csv'))

# Taking care of missing data

any(is.na(dataset)) # the answer to "do we have missing data in the dataset?"

sum(is.na(dataset)) # total number of missing values

(x <- sapply(dataset, function(x) any(is.na(x)))) # which columns have missing data and which do not

x[x == TRUE] # display only the columns with missing data

dataset[!complete.cases(dataset), ] # which rows have missing data

new_dataset <- na.omit(dataset) # new dataset without missing data

# ave is similar to using split and lapply (see the short sketch below)
# gl generates factor levels

dataset$Age <- ifelse(is.na(dataset$Age),
                      mean(dataset$Age, na.rm = TRUE),
                      dataset$Age)

dataset$Salary <- ifelse(is.na(dataset$Salary),
                         mean(dataset$Salary, na.rm = TRUE),
                         dataset$Salary)

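# A short sketch of the two comments above (illustration only, not part of the template):
# ave() returns a group-wise (or overall) statistic aligned with every row, much like
# split() followed by lapply() and unsplit(), so the Age imputation could also be written as:
# dataset$Age <- ifelse(is.na(dataset$Age),
#                       ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
#                       dataset$Age)
# gl() generates a factor from level counts, e.g. a 3-level factor with each level repeated twice:
gl(3, 2, labels = c("France", "Spain", "Germany"))
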
# Encoding categorical data

dataset$Country <- factor(dataset$Country, levels = c("France", "Spain", "Germany"),
                          labels = 1:3)

dataset$Purchased <- factor(dataset$Purchased, levels = c("No", "Yes"),
                            labels = 0:1)

# The base R function model.matrix would generate dummy variables
head(model.matrix(Purchased ~ ., data = dataset), 10)
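
# Sketch: model.matrix() uses treatment contrasts by default, so the 3-level Country
# factor becomes two dummy columns plus an intercept. Dropping the intercept keeps one
# column per level (assuming Country is the only factor among the predictors):
head(model.matrix(Purchased ~ . - 1, data = dataset), 10)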

# Splitting the dataset into the Training set and Test set

# install.packages('caTools')

if (!require("caTools")) {
  install.packages("caTools") # install the caTools package if you don't have it
  library(caTools)            # then load it so sample.split() is available
}

set.seed(123) # this seed will help everyone get the same results

# split <- sample.split(dataset$DependentVariable, SplitRatio = 0.8)
split <- sample.split(dataset$Purchased, SplitRatio = 0.8)

training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature Scaling
# Because most machine learning algorithms use the Euclidean distance, we must put the features on the same scale;
# otherwise, the axis with the largest scale will dominate the smaller ones.
# However, not all algorithms need the variables to be rescaled, so I don't always have to do it manually.

training_set[ , 2:3] <- scale(training_set[ , 2:3])
test_set[ , 2:3] <- scale(test_set[ , 2:3])
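
# A common refinement (sketch, assuming columns 2:3 hold Age and Salary): scale the test
# set with the center and spread learned from the training set instead of its own.
# scale() stores them as attributes, so the same transformation can be replayed:
# train_scaled <- scale(training_set[ , 2:3])
# test_set[ , 2:3] <- scale(test_set[ , 2:3],
#                           center = attr(train_scaled, "scaled:center"),
#                           scale  = attr(train_scaled, "scaled:scale"))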