TUJHE_KYA_BHAI

DM_PRACTICALS

May 2nd, 2023
----------------------------------------PRACTICAL-1-------------------------------------------------------
# create the data
age <- c(21, 2, 18, 221, 34)
agegroup <- c("adult", "child", "adult", "elderly", "child")
height <- c(6.0, 3, 5.7, 5, -7)
status <- c("single", "married", "married", "widowed", "married")
yearsmarried <- c(-1, 0, 20, 2, 3)

# combine the data into a data frame
people <- data.frame(age, agegroup, height, status, yearsmarried)

# write the data frame to a text file
write.table(people, "people.txt", sep="\t", row.names=FALSE)

# read the data from the file
people <- read.table("people.txt", header=TRUE, sep="\t")

# create the ruleset E using the validate package
library(validate)

E <- validator(
  age > 0 & age <= 150,                           # age should be in the range 0-150
  age > yearsmarried,                             # age should be greater than yearsmarried
  status %in% c("married", "single", "widowed"),  # status should be married, single, or widowed
  ifelse(age < 18, agegroup == "child",           # agegroup should be child, adult, or elderly
         ifelse(age <= 65, agegroup == "adult",
                agegroup == "elderly"))
)

# apply the ruleset E to the data
violations <- confront(people, E)

# summarize the results
summary(violations)

# visualize the results
library(ggplot2)

# build a (row, rule) table of failed checks from the confrontation
vals <- values(violations)            # logical matrix: records x rules
fails <- which(!vals, arr.ind = TRUE)
fails_df <- data.frame(row = fails[, "row"],
                       rule = colnames(vals)[fails[, "col"]])

ggplot(fails_df, aes(x=rule, y=row)) +
  geom_point(size=3, color="red") +
  ggtitle("Violations of Ruleset E") +
  ylab("Row") +
  xlab("Rule")
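# (Sketch) The validate package can also return the offending records
# directly, instead of reading them off the plot: violating() keeps only
# the rows of people that fail at least one rule in E.
bad_people <- violating(people, E)
print(bad_people)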
-----------------------------------------PRACTICAL-2-------------------------------------
# load the dataset; map the string codes "NA", "N/A" and "?" to NA on read
dirty_iris <- read.csv("dirty_iris.csv", header=TRUE,
                       na.strings = c("NA", "N/A", "?"))

# calculate the number and percentage of complete observations
complete_obs <- complete.cases(dirty_iris)
num_complete <- sum(complete_obs)
perc_complete <- mean(complete_obs) * 100
cat("Number of complete observations:", num_complete, "\n")
cat("Percentage of complete observations:", perc_complete, "%\n")

# replace the remaining special values (NaN, Inf, -Inf) with NA
is.special <- function(x) is.numeric(x) & !is.finite(x)
for (col in names(dirty_iris)) {
  dirty_iris[[col]][is.special(dirty_iris[[col]])] <- NA
}

# the ruleset, written to a text file so editfile() can read it
writeLines(c(
  'Species %in% c("setosa", "versicolor", "virginica")',
  'Sepal.Length > 0',
  'Sepal.Length <= 30',
  'Sepal.Length > Petal.Length',
  'Petal.Length >= 2 * Petal.Width'
), "iris_rules.txt")

# read the rules using the editrules package
library(editrules)

rules <- editfile("iris_rules.txt")
print(rules)

# apply the rules to the dataset and count the number of violations
violations <- violatedEdits(rules, dirty_iris)
num_violations <- sum(violations, na.rm = TRUE)
cat("Number of violations:", num_violations, "\n")

# summarize the violations
summary(violations)

# plot the violations
plot(violations)

# create a boxplot of sepal length
boxplot(dirty_iris$Sepal.Length)

# calculate the outliers using boxplot.stats
sepallength_stats <- boxplot.stats(dirty_iris$Sepal.Length)
outliers <- sepallength_stats$out
cat("Number of outliers in sepal length:", length(outliers), "\n")
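# (Sketch) What boxplot.stats is doing: values more than 1.5 * IQR beyond
# the quartiles are flagged as outliers. This by-hand version may differ
# slightly at the margins, since boxplot.stats uses hinges rather than
# quantile().
x <- dirty_iris$Sepal.Length[!is.na(dirty_iris$Sepal.Length)]
q <- quantile(x, c(0.25, 0.75))
fence_low  <- q[1] - 1.5 * (q[2] - q[1])
fence_high <- q[2] + 1.5 * (q[2] - q[1])
cat("Outliers by the 1.5*IQR rule:", sum(x < fence_low | x > fence_high), "\n")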
-----------------------------------------PRACTICAL-3--------------------------------------
# load the wine dataset
wine <- read.csv("wine.csv", header = TRUE)

# check if all attributes are standardized (assumes all columns are numeric)
mean_wine <- apply(wine, 2, mean)
sd_wine <- apply(wine, 2, sd)
if (all(abs(mean_wine) < 1e-10) && all(abs(sd_wine - 1) < 1e-10)) {
  cat("All attributes are standardized.\n")
} else {
  # standardize the attributes
  wine_std <- scale(wine)
  cat("Attributes have been standardized.\n")
}

# load the Iris dataset
iris <- read.csv("iris.csv", header = TRUE)

# check if all attributes are standardized
mean_iris <- apply(iris[, 1:4], 2, mean)
sd_iris <- apply(iris[, 1:4], 2, sd)
if (all(abs(mean_iris) < 1e-10) && all(abs(sd_iris - 1) < 1e-10)) {
  cat("All attributes are standardized.\n")
} else {
  # standardize the attributes; keep the species label as a column
  # (data.frame avoids the coercion to numeric that cbind would cause)
  iris_std <- data.frame(scale(iris[, 1:4]), Species = iris[, 5])
  colnames(iris_std) <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")
  cat("Attributes have been standardized.\n")
}
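# (Sketch) Quick check that standardization worked: each scaled column
# should now have mean ~ 0 and sd ~ 1 (assumes iris_std from above).
round(sapply(iris_std[, 1:4], mean), 10)
sapply(iris_std[, 1:4], sd)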
---------------------------------------PRACTICAL-4-------------------------------------
# generate example data
transactions <- list(
  c("beer", "chips", "nuts", "salsa"),
  c("beer", "chips", "nuts"),
  c("beer", "chips"),
  c("beer", "salsa"),
  c("beer", "nuts"),
  c("chips", "nuts", "salsa"),
  c("chips", "nuts"),
  c("chips", "salsa"),
  c("nuts", "salsa")
)

# load the arules package
library(arules)

# convert the transaction data to a transactions object
trans <- as(transactions, "transactions")

# run the Apriori algorithm for frequent itemsets (supp = 0.5)
frequentItemsets <- apriori(trans, parameter = list(supp = 0.5, target = "frequent itemsets"))

# view the frequent itemsets
inspect(frequentItemsets)

# derive association rules from the frequent itemsets (conf = 0.75)
associationRules <- ruleInduction(frequentItemsets, trans, confidence = 0.75)

# view the association rules
inspect(associationRules)

# run the Apriori algorithm again with supp = 0.6
frequentItemsets <- apriori(trans, parameter = list(supp = 0.6, target = "frequent itemsets"))

# view the frequent itemsets
inspect(frequentItemsets)

# derive association rules with conf = 0.6
associationRules <- ruleInduction(frequentItemsets, trans, confidence = 0.6)

# view the association rules
inspect(associationRules)
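# (Sketch) What supp and conf mean, computed by hand for one candidate
# rule {beer} => {chips} over the 9 transactions above.
n <- length(transactions)
has_beer <- sapply(transactions, function(t) "beer" %in% t)
has_both <- sapply(transactions, function(t) all(c("beer", "chips") %in% t))
cat("support:", sum(has_both) / n, "\n")                  # P(beer and chips)
cat("confidence:", sum(has_both) / sum(has_beer), "\n")   # P(chips | beer)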
-------------------------------------------------------------PRACTICAL-5-------------------------
NAIVE BAYES----------------------------
library(caTools)
library(e1071)
set.seed(123)

# Load iris dataset
data(iris)

# Split dataset into training and testing sets
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)

# Train Naive Bayes classifier
model = naiveBayes(Species ~ ., data = train)

# Make predictions on testing set
predictions = predict(model, test)

# Calculate confusion matrix
table(predictions, test$Species)

# Calculate accuracy
mean(predictions == test$Species)
# Hold-out with a 75/25 split
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)

model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)

mean(predictions == test$Species)

# Hold-out with a 66.6/33.3 split (2/3 of the data for training)
split = sample.split(iris$Species, SplitRatio = 0.666)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)

model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)

mean(predictions == test$Species)

# Another 75/25 hold-out: a different random split gives a different accuracy
split = sample.split(iris$Species, SplitRatio = 0.75)
train = subset(iris, split == TRUE)
test = subset(iris, split == FALSE)

model = naiveBayes(Species ~ ., data = train)
predictions = predict(model, test)

mean(predictions == test$Species)
# Random subsampling: repeat the 75/25 hold-out 100 times and average
accuracy = numeric(100)

for (i in 1:100) {
  split = sample.split(iris$Species, SplitRatio = 0.75)
  train = subset(iris, split == TRUE)
  test = subset(iris, split == FALSE)

  model = naiveBayes(Species ~ ., data = train)
  predictions = predict(model, test)

  accuracy[i] = mean(predictions == test$Species)
}

mean(accuracy)
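# (Sketch) The spread of the 100 hold-out accuracies shows how much a
# single random split can vary.
sd(accuracy)
hist(accuracy, main = "Hold-out accuracy over 100 splits")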
# Cross-validation: e1071 has no built-in CV helper for naiveBayes, so
# run 10-fold cross-validation manually
folds = sample(rep(1:10, length.out = nrow(iris)))
cv_accuracy = sapply(1:10, function(i) {
  model = naiveBayes(Species ~ ., data = iris[folds != i, ])
  mean(predict(model, iris[folds == i, ]) == iris$Species[folds == i])
})

mean(cv_accuracy)

# Scale data (keep the class label so the formula interface works)
train_scaled = data.frame(scale(train[, 1:4]), Species = train$Species)
test_scaled = data.frame(scale(test[, 1:4]), Species = test$Species)

# Train Naive Bayes classifier on scaled data
model = naiveBayes(Species ~ ., data = train_scaled)
predictions = predict(model, test_scaled)
mean(predictions == test_scaled$Species)
KNN---------------------------------------
# Load the Iris dataset
data(iris)

# Split the dataset into training and testing sets
set.seed(123)
train_index <- sample(1:nrow(iris), 0.75 * nrow(iris))
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]

# Scale the data; the test set is scaled with the training set's
# centering and scaling so both are on the same scale
train_data_scaled <- scale(train_data[, 1:4])
test_data_scaled <- scale(test_data[, 1:4],
                          center = attr(train_data_scaled, "scaled:center"),
                          scale = attr(train_data_scaled, "scaled:scale"))

# Train the k-NN model using the training set
library(class)
k <- 3
knn_model <- knn(train_data_scaled, test_data_scaled, train_data$Species, k)

# Evaluate the model on the testing set
table(knn_model, test_data$Species)
accuracy <- sum(knn_model == test_data$Species) / length(test_data$Species)
print(paste("Accuracy:", round(accuracy, 4)))
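# (Sketch) Try a few values of k to see how accuracy changes; assumes the
# scaled train/test sets from above.
for (k in c(1, 3, 5, 7, 9)) {
  pred <- knn(train_data_scaled, test_data_scaled, train_data$Species, k)
  cat("k =", k, "accuracy =", round(mean(pred == test_data$Species), 4), "\n")
}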
DECISION TREE----------------------------------
library(caret)
library(rpart)
data(iris)

# Situation 5.1 a) Training set = 75%, Test set = 25%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.75, list = FALSE)
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]

# Situation 5.1 b) Training set = 66.6% (2/3 of total), Test set = 33.3%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.666, list = FALSE)
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]

# Situation 5.2 i) hold-out method
set.seed(123)
trainIndex <- sample(nrow(iris), 0.75 * nrow(iris))
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]

# Situation 5.2 ii) Random subsampling
set.seed(123)
subsamples <- split(iris, sample(1:5, nrow(iris), replace = TRUE))
train <- do.call(rbind, subsamples[-1])
test <- subsamples[[1]]

# Situation 5.2 iii) Cross-validation
trainControl <- trainControl(method = "cv", number = 10)
model <- train(Species ~ ., data = iris, method = "rpart", trControl = trainControl)

# Scale the current train/test sets
train[, 1:4] <- scale(train[, 1:4])
test[, 1:4] <- scale(test[, 1:4])

# Build decision tree classifiers with different control settings
model1 <- rpart(Species ~ ., data = train, method = "class")
model2 <- rpart(Species ~ ., data = train, method = "class",
                control = rpart.control(cp = 0.01))      # cp = 0.01 is the rpart default
model3 <- rpart(Species ~ ., data = train, method = "class",
                control = rpart.control(minsplit = 20))  # minsplit = 20 is the rpart default
model4 <- rpart(Species ~ ., data = train, method = "class",
                control = rpart.control(maxdepth = 2))   # a deliberately shallow tree

# Make predictions and calculate accuracy for each model
pred1 <- predict(model1, newdata = test, type = "class")
confusionMatrix(pred1, test$Species)$overall[1]
pred2 <- predict(model2, newdata = test, type = "class")
confusionMatrix(pred2, test$Species)$overall[1]
pred3 <- predict(model3, newdata = test, type = "class")
confusionMatrix(pred3, test$Species)$overall[1]
pred4 <- predict(model4, newdata = test, type = "class")
confusionMatrix(pred4, test$Species)$overall[1]
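# (Sketch) Optionally visualize a fitted tree; assumes the rpart.plot
# package is installed.
library(rpart.plot)
rpart.plot(model1)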
------------------------------------------------------PRACTICAL-6---------------------------------------
# Load the iris dataset
data(iris)

# Select only the numeric variables for clustering
iris_numeric <- iris[, 1:4]

# Scale the variables to have mean = 0 and standard deviation = 1
iris_scaled <- scale(iris_numeric)

# Simple K-means clustering
set.seed(123)
kmeans_result <- kmeans(iris_scaled, centers = 3, nstart = 20)

# DBSCAN clustering
library(dbscan)
dbscan_result <- dbscan(iris_scaled, eps = 0.4, minPts = 5)

# Hierarchical clustering
hclust_result <- hclust(dist(iris_scaled), method = "complete")
hclust_groups <- cutree(hclust_result, k = 3)

# Compare the performance of the clustering algorithms
library(cluster)
library(factoextra)

# Silhouette analysis for K-means
fviz_nbclust(iris_scaled, kmeans, method = "silhouette")

# Elbow method for K-means
fviz_nbclust(iris_scaled, kmeans, method = "wss")

# Plot DBSCAN results
fviz_cluster(dbscan_result, data = iris_scaled)

# Dendrogram for hierarchical clustering
fviz_dend(hclust_result, k = 3, cex = 0.5)

# Silhouette analysis for hierarchical clustering
sil <- silhouette(hclust_groups, dist(iris_scaled))
summary(sil)

# Adjust the parameters and repeat the analysis to compare the performance of the algorithms
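# (Sketch) One simple comparison: cross-tabulate each clustering's labels
# against the known species (for DBSCAN, cluster 0 is noise).
table(kmeans = kmeans_result$cluster, species = iris$Species)
table(dbscan = dbscan_result$cluster, species = iris$Species)
table(hclust = hclust_groups, species = iris$Species)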