# Data Preprocessing Template

rm(list = ls())

# Importing the dataset

dataset <- read.csv(file.path(getwd(), 'Data/Data.csv'))

# Taking care of missing data

any(is.na(dataset)) # the answer to "do we have missing data in the dataset?"

sum(is.na(dataset)) # total number of missing values

(x <- sapply(dataset, function(x) any(is.na(x)))) # which columns have missing data and which do not

x[x == TRUE] # display only the columns with missing data

dataset[!complete.cases(dataset), ] # which rows have missing data

new_dataset <- na.omit(dataset) # new dataset without missing data

# ave is similar to using split and lapply (see the short sketch below)
# gl generates factor levels

dataset$Age <- ifelse(is.na(dataset$Age),
                      mean(dataset$Age, na.rm = TRUE),
                      dataset$Age)

dataset$Salary <- ifelse(is.na(dataset$Salary),
                         mean(dataset$Salary, na.rm = TRUE),
                         dataset$Salary)

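# A short sketch of the two comments above (illustration only, not part of the template):
# ave() returns a group-wise (or overall) statistic aligned with every row, much like
# split() followed by lapply() and unsplit(), so the Age imputation could also be written as:
# dataset$Age <- ifelse(is.na(dataset$Age),
#                       ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
#                       dataset$Age)
# gl() generates a factor from level counts, e.g. a 3-level factor with each level repeated twice:
gl(3, 2, labels = c("France", "Spain", "Germany"))
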
# Encoding categorical data

dataset$Country <- factor(dataset$Country, levels = c("France", "Spain", "Germany"),
                          labels = 1:3)

dataset$Purchased <- factor(dataset$Purchased, levels = c("No", "Yes"),
                            labels = 0:1)

# The base R function model.matrix would generate dummy variables
head(model.matrix(Purchased ~ ., data = dataset), 10)
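
# Sketch: model.matrix() uses treatment contrasts by default, so the 3-level Country
# factor becomes two dummy columns plus an intercept. Dropping the intercept keeps one
# column per level (assuming Country is the only factor among the predictors):
head(model.matrix(Purchased ~ . - 1, data = dataset), 10)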

# Splitting the dataset into the Training set and Test set

# install.packages('caTools')

if (!require("caTools")) {
  install.packages("caTools") # install the caTools package if you don't have it
  library(caTools)            # then load it so sample.split() is available
}

set.seed(123) # this seed will help everyone get the same results

# split <- sample.split(dataset$DependentVariable, SplitRatio = 0.8)
split <- sample.split(dataset$Purchased, SplitRatio = 0.8)

training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature Scaling
# Because most machine learning algorithms use the Euclidean distance, we must put the features on the same scale;
# otherwise, the axis with the largest scale will dominate the smaller ones.
# However, not all algorithms need the variables to be rescaled, so I don't always have to do it manually.

training_set[ , 2:3] <- scale(training_set[ , 2:3])
test_set[ , 2:3] <- scale(test_set[ , 2:3])
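
# A common refinement (sketch, assuming columns 2:3 hold Age and Salary): scale the test
# set with the center and spread learned from the training set instead of its own.
# scale() stores them as attributes, so the same transformation can be replayed:
# train_scaled <- scale(training_set[ , 2:3])
# test_set[ , 2:3] <- scale(test_set[ , 2:3],
#                           center = attr(train_scaled, "scaled:center"),
#                           scale  = attr(train_scaled, "scaled:scale"))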