Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Data Preprocessing Template
#
# NOTE: this script intentionally does not call `rm(list = ls())`. That call
# deletes every object in the caller's workspace when the script is sourced,
# yet leaves attached packages and options in place, so it does not give a
# clean session anyway. Restart R to get a fresh session instead.

# Importing the dataset
# The file is looked up under "Data/" inside the current working directory.
dataset <- read.csv(file.path(getwd(), "Data/Data.csv"))
# Taking care of missing data

# Exploratory checks (results print when run at top level).
any(is.na(dataset))  # is there any missing value in the dataset?
sum(is.na(dataset))  # total number of missing values

# Per-column flag: TRUE when the column holds at least one missing value.
# vapply() guarantees a logical vector; sapply() would return an empty list
# for a data frame with no columns.
(x <- vapply(dataset, function(column) any(is.na(column)), logical(1)))
x[x]  # display only the columns with missing data

dataset[!complete.cases(dataset), ]  # rows that contain missing data
new_dataset <- na.omit(dataset)  # copy of the dataset without incomplete rows

# Side notes on related base R helpers:
# - ave() is similar to using split() followed by lapply()
# - gl() stands for "generate factor levels"

# Replace the missing entries of a vector with the mean of its observed
# entries. The vector is returned untouched when nothing is missing.
impute_mean <- function(values) {
  is_missing <- is.na(values)
  if (any(is_missing)) {
    values[is_missing] <- mean(values, na.rm = TRUE)
  }
  values
}

dataset$Age <- impute_mean(dataset$Age)
dataset$Salary <- impute_mean(dataset$Salary)
# Encoding categorical data
# Each text column is turned into a factor whose labels are numeric codes:
# Country -> 1 (France), 2 (Spain), 3 (Germany); Purchased -> 0 (No), 1 (Yes).
dataset$Country <- factor(
  dataset$Country,
  levels = c("France", "Spain", "Germany"),
  labels = seq_len(3)
)
dataset$Purchased <- factor(
  dataset$Purchased,
  levels = c("No", "Yes"),
  labels = c(0L, 1L)
)

# model.matrix() (shipped with base R) expands the factors into dummy
# variables; preview the first 10 rows of the resulting design matrix.
head(model.matrix(Purchased ~ ., data = dataset), n = 10)
# Splitting the dataset into the Training set and Test set

# Install caTools if it is missing, then attach it. The previous
# `if (!require(...)) install.packages(...)` form installed the package but
# never loaded it, so sample.split() was not found on the first run.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
library(caTools)

set.seed(123)  # fixed seed so everyone gets the same split

# General form:
#   split <- sample.split(dataset$DependentVariable, SplitRatio = 0.8)
split <- sample.split(dataset$Purchased, SplitRatio = 0.8)

# Rows flagged TRUE go to the training set, the remaining rows to the test set.
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# Feature Scaling
# Many machine learning algorithms rely on the Euclidean distance, so all
# features must be on the same scale; otherwise the feature with the largest
# range dominates the others. Not every algorithm needs rescaled variables,
# so this manual step is not always required.

scaled_cols <- 2:3  # presumably the Age and Salary columns -- TODO confirm

# Standardize the training set and keep the matrix so that its centering and
# scaling parameters can be reused.
train_scaled <- scale(training_set[, scaled_cols])
training_set[, scaled_cols] <- train_scaled

# Apply the training set's mean and standard deviation to the test set.
# Scaling the test set with its own statistics would put the two sets on
# different scales.
test_set[, scaled_cols] <- scale(
  test_set[, scaled_cols],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
Add Comment
Please, Sign In to add comment