----------------------------------------PRACTICAL-1-------------------------------------------------------
# create the data
age <- c(21, 2, 18, 221, 34)
agegroup <- c("adult", "child", "adult", "elderly", "child")
height <- c(6.0, 3, 5.7, 5, -7)
status <- c("single", "married", "married", "widowed", "married")
yearsmarried <- c(-1, 0, 20, 2, 3)

# combine the data into a data frame
people <- data.frame(age, agegroup, height, status, yearsmarried)

# write the data frame to a text file
write.table(people, "people.txt", sep = "\t", row.names = FALSE)

# read the data from the file
people <- read.table("people.txt", header = TRUE, sep = "\t")

# create the ruleset E with the validate package
library(validate)
E <- validator(
  age > 0 & age <= 150,
  age > yearsmarried,
  status %in% c("married", "single", "widowed"),
  if (age < 18) agegroup == "child",
  if (age >= 18 & age <= 65) agegroup == "adult",
  if (age > 65) agegroup == "elderly"
)

# apply the ruleset E to the data
violations <- confront(people, E)

# summarize the results
summary(violations)

# visualize the results (validate supplies a plot method for confrontations)
plot(violations)

-----------------------------------------PRACTICAL-2-------------------------------------
# load the dataset
dirty_iris <- read.csv("dirty_iris.csv", header = TRUE)

# calculate the number and percentage of complete observations
complete_obs <- complete.cases(dirty_iris)
num_complete <- sum(complete_obs)
perc_complete <- mean(complete_obs) * 100
cat("Number of complete observations:", num_complete, "\n")
cat("Percentage of complete observations:", perc_complete, "%\n")

# replace all special values (NaN, Inf, -Inf, and NA-like strings) with NA
is.special <- function(x) {
  if (is.numeric(x)) !is.finite(x) else x %in% c("NA", "N/A", "?")
}
for (col in colnames(dirty_iris)) {
  dirty_iris[[col]][is.special(dirty_iris[[col]])] <- NA
}

# write the rules to a file so they can be read back with editfile()
writeLines(c(
  'Species %in% c("setosa", "versicolor", "virginica")',
  'Sepal.Length > 0',
  'Sepal.Length <= 30',
  'Sepal.Length > Petal.Length',
  'Petal.Length >= 2 * Petal.Width'
), "iris_rules.txt")

# read the rules using the editrules package
library(editrules)
rules <- editfile("iris_rules.txt")
print(rules)

# apply the rules to the dataset and count the number of violations
violations <- violatedEdits(rules, dirty_iris)
num_violations <- sum(violations, na.rm = TRUE)
cat("Number of violations:", num_violations, "\n")

# summarize the violations
summary(violations)

# plot the violations
plot(violations)

# create a boxplot of sepal length
boxplot(dirty_iris$Sepal.Length)

# calculate the outliers using boxplot.stats
sepallength_stats <- boxplot.stats(dirty_iris$Sepal.Length)
outliers <- sepallength_stats$out
cat("Number of outliers in sepal length:", length(outliers), "\n")
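# A minimal hand check of the fence boxplot.stats applies: values beyond
# 1.5 * IQR from the quartiles are flagged. This sketch uses quantile(),
# which can differ slightly from the Tukey hinges boxplot.stats uses, so
# the counts may not match exactly on every dataset.
q <- quantile(dirty_iris$Sepal.Length, c(0.25, 0.75), na.rm = TRUE)
fence_low <- q[1] - 1.5 * (q[2] - q[1])
fence_high <- q[2] + 1.5 * (q[2] - q[1])
manual_outliers <- dirty_iris$Sepal.Length[
  !is.na(dirty_iris$Sepal.Length) &
    (dirty_iris$Sepal.Length < fence_low | dirty_iris$Sepal.Length > fence_high)
]
cat("Outliers found manually:", length(manual_outliers), "\n")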
cat("All attributes are standardized.\n") } else { # standardize the attributes iris_std <- scale(iris[,1:4]) iris_std <- cbind(iris_std, iris[,5]) colnames(iris_std) <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species") cat("Attributes have been standardized.\n") } ---------------------------------------PRACTICAL-4------------------------------------- # generate example data transactions <- list( c("beer", "chips", "nuts", "salsa"), c("beer", "chips", "nuts"), c("beer", "chips"), c("beer", "salsa"), c("beer", "nuts"), c("chips", "nuts", "salsa"), c("chips", "nuts"), c("chips", "salsa"), c("nuts", "salsa") ) # load the arules package library(arules) # convert the transaction data to a transaction object trans <- as(transactions, "transactions") # run the Apriori algorithm frequentItemsets <- apriori(trans, parameter = list(supp = 0.5, conf = 0.75)) # view the frequent itemsets inspect(frequentItemsets) # extract the association rules associationRules <- as(frequentItemsets, "rules") # view the association rules inspect(associationRules) # run the Apriori algorithm frequentItemsets <- apriori(trans, parameter = list(supp = 0.6, conf = 0.6)) # view the frequent itemsets inspect(frequentItemsets) # extract the association rules associationRules <- as(frequentItemsets, "rules") # view the association rules inspect(associationRules) -------------------------------------------------------------PRACTICAL-5------------------------- NAIVE BAYES---------------------------- library(caTools) library(e1071) set.seed(123) # Load iris dataset data(iris) # Split dataset into training and testing sets split = sample.split(iris$Species, SplitRatio = 0.75) train = subset(iris, split == TRUE) test = subset(iris, split == FALSE) # Train Naive Bayes classifier model = naiveBayes(Species ~ ., data = train) # Make predictions on testing set predictions = predict(model, test) # Calculate confusion matrix table(predictions, test$Species) # Calculate accuracy mean(predictions == test$Species) split = sample.split(iris$Species, SplitRatio = 0.75) train = subset(iris, split == TRUE) test = subset(iris, split == FALSE) model = naiveBayes(Species ~ ., data = train) predictions = predict(model, test) mean(predictions == test$Species) split = sample.split(iris$Species, SplitRatio = 0.666) train = subset(iris, split == TRUE) test = subset(iris, split == FALSE) model = naiveBayes(Species ~ ., data = train) predictions = predict(model, test) mean(predictions == test$Species) split = sample.split(iris$Species, SplitRatio = 0.75) train = subset(iris, split == TRUE) test = subset(iris, split == FALSE) model = naiveBayes(Species ~ ., data = train) predictions = predict(model, test) mean(predictions == test$Species) accuracy = numeric(100) for(i in 1:100) { split = sample.split(iris$Species, SplitRatio = 0.75) train = subset(iris, split == TRUE) test = subset(iris, split == FALSE) model = naiveBayes(Species ~ ., data = train) predictions = predict(model, test) accuracy[i] = mean(predictions == test$Species) } mean(accuracy) model = naiveBayes(Species ~ ., data = iris) accuracy = cv.accuracy(model, iris, FUN = function(x, y) mean(predict(x, y) == y$Species)) mean(accuracy) # Scale data train_scaled = scale(train[,1:4]) test_scaled = scale(test[,1:4]) # Train Naive Bayes classifier on scaled data model = naiveBayes(Species ~ ., data = train_scaled) predictions = predict(model, KNN--------------------------------------- # Load the Iris dataset data(iris) # Split the dataset into training and testing sets 
KNN---------------------------------------
# Load the Iris dataset
data(iris)

# Split the dataset into training and testing sets
set.seed(123)
train_index <- sample(1:nrow(iris), 0.75 * nrow(iris))
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]

# Scale the data; the test set reuses the training set's centering and
# spread so both sets are on the same scale
train_data_scaled <- scale(train_data[, 1:4])
test_data_scaled <- scale(test_data[, 1:4],
                          center = attr(train_data_scaled, "scaled:center"),
                          scale = attr(train_data_scaled, "scaled:scale"))

# Train the k-NN model using the training set
library(class)
k <- 3
knn_model <- knn(train_data_scaled, test_data_scaled, train_data$Species, k)

# Evaluate the model on the testing set
table(knn_model, test_data$Species)
accuracy <- sum(knn_model == test_data$Species) / length(test_data$Species)
print(paste("Accuracy:", round(accuracy, 4)))

DECISION TREE----------------------------------
library(caret)
library(rpart)
data(iris)

# Situation 5.1 a) Training set = 75%, Test set = 25%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.75, list = FALSE)
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]

# Situation 5.1 b) Training set = 66.6% (2/3 of the data), Test set = 33.3%
set.seed(123)
trainIndex <- createDataPartition(iris$Species, p = 0.666, list = FALSE)
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]

# Situation 5.2 i) Hold-out method
set.seed(123)
trainIndex <- sample(nrow(iris), 0.75 * nrow(iris))
train <- iris[trainIndex, ]
test <- iris[-trainIndex, ]

# Situation 5.2 ii) Random subsampling
set.seed(123)
subsamples <- split(iris, sample(1:5, nrow(iris), replace = TRUE))
train <- do.call(rbind, subsamples[-1])
test <- subsamples[[1]]

# Situation 5.2 iii) Cross-validation
trainControl <- trainControl(method = "cv", number = 10)
model <- train(Species ~ ., data = iris, method = "rpart", trControl = trainControl)

# Scale the numeric attributes of the current train/test split
train[, 1:4] <- scale(train[, 1:4])
test[, 1:4] <- scale(test[, 1:4])

# Build decision tree classifiers with different control settings
model1 <- rpart(Species ~ ., data = train, method = "class")
model2 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(cp = 0.01))
model3 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(minsplit = 20))
model4 <- rpart(Species ~ ., data = train, method = "class", control = rpart.control(maxdepth = 2))

# Make predictions and calculate accuracy for each model
pred1 <- predict(model1, newdata = test, type = "class")
confusionMatrix(pred1, test$Species)$overall[1]
pred2 <- predict(model2, newdata = test, type = "class")
confusionMatrix(pred2, test$Species)$overall[1]
pred3 <- predict(model3, newdata = test, type = "class")
confusionMatrix(pred3, test$Species)$overall[1]
pred4 <- predict(model4, newdata = test, type = "class")
confusionMatrix(pred4, test$Species)$overall[1]

------------------------------------------------------PRACTICAL-6---------------------------------------
# Load the iris dataset
data(iris)

# Select only the numeric variables for clustering
iris_numeric <- iris[, 1:4]

# Scale the variables to have mean = 0 and standard deviation = 1
iris_scaled <- scale(iris_numeric)

# Simple k-means clustering
set.seed(123)
kmeans_result <- kmeans(iris_scaled, centers = 3, nstart = 20)

# DBSCAN clustering
library(dbscan)
dbscan_result <- dbscan(iris_scaled, eps = 0.4, minPts = 5)

# Hierarchical clustering
hclust_result <- hclust(dist(iris_scaled), method = "complete")
hclust_groups <- cutree(hclust_result, k = 3)

# Compare the performance of the clustering algorithms
library(cluster)
library(factoextra)

# Silhouette analysis for k-means
fviz_nbclust(iris_scaled, kmeans, method = "silhouette")

# Elbow method for k-means
fviz_nbclust(iris_scaled, kmeans, method = "wss")

# Plot DBSCAN results
fviz_cluster(dbscan_result, data = iris_scaled)

# Dendrogram for hierarchical clustering
fviz_dend(hclust_result, k = 3, cex = 0.5)

# Silhouette analysis for hierarchical clustering
plot(silhouette(hclust_groups, dist(iris_scaled)))

# Adjust the parameters (centers, eps, minPts, linkage method) and repeat
# the analysis to compare the performance of the algorithms
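# A possible comparison sketch (assumes the objects above are still in scope):
# the mean silhouette width puts all three clusterings on one internal scale,
# and a cross-tabulation against the known species gives a rough external
# check. DBSCAN noise points (cluster 0) are dropped before its silhouette.
d <- dist(iris_scaled)
mean(silhouette(kmeans_result$cluster, d)[, 3])
mean(silhouette(hclust_groups, d)[, 3])
keep <- dbscan_result$cluster != 0
if (length(unique(dbscan_result$cluster[keep])) > 1) {
  mean(silhouette(dbscan_result$cluster[keep], dist(iris_scaled[keep, ]))[, 3])
}
table(kmeans = kmeans_result$cluster, species = iris$Species)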