# Code:
import pandas as pd

df = pd.read_csv('data/data.csv')

# Remove Duplicates
df = df.drop_duplicates()

# Data Type Conversion
# Parse dates *before* dropping missing rows: values that cannot be parsed are
# coerced to NaT and must still be removed by the dropna() below.
df["Join_Date"] = pd.to_datetime(df["Join_Date"], errors="coerce")

# Handle Missing Values
df["Age"] = df["Age"].fillna(df["Age"].median())
df = df.dropna(subset=["Salary", "Join_Date"])

# Simple Outlier Detection
df.loc[df["Age"] > 100, "Age"] = df["Age"].median()

print("Cleaned data:\n", df)

# NOTE(review): the fragment below is C, not Python, and is cut off
# mid-statement in the source (everything after "for(int i=0; i" is missing).
# It is preserved verbatim as a comment so that nothing is lost.
#
#   printf("Bubble: %d\n", bubble_sort(arr2, n));
#   memcpy(arr2, arr, n*sizeof(int));
#   printf("Selection: %d\n", selection_sort(arr2, n));
#   memcpy(arr2, arr, n*sizeof(int));
#   printf("Insertion: %d\n", insertion_sort(arr2, n));
#   printf("\nSorted array: \n");
#   for(int i=0; i

# NOTE(review): the imports, the `generate_candidates` helper and the head of
# `apriori` are missing from the source; only the tail of `apriori` survives.
# It is preserved verbatim as a comment rather than reconstructed by guesswork.
#
#     = min_support}
#     freq_itemsets = {itemset: counts[itemset] for itemset in l_k}
#     k = 2
#     while l_k:
#         c_k = generate_candidates(l_k, k)
#         if not c_k:
#             break
#         candidate_counts = defaultdict(int)
#         for t in map(set, transactions_list):
#             for candidate in c_k:
#                 if candidate.issubset(t):
#                     candidate_counts[candidate] += 1
#         l_k = {c for c in c_k if candidate_counts[c] >= min_support}
#         freq_itemsets.update({itemset: candidate_counts[itemset] for itemset in l_k})
#         k += 1
#     return freq_itemsets


def generate_rules(freq_itemsets, min_confidence):
    """Derive association rules from a mapping of frequent itemsets.

    Args:
        freq_itemsets: mapping ``frozenset -> support count``.  Every
            non-empty proper subset of each key is looked up in the same
            mapping, so it must be present there as well.
        min_confidence: minimum confidence a rule needs to be kept.

    Returns:
        List of ``(antecedent, consequent, confidence)`` tuples.
    """
    # The file-level import was part of the text lost from the source, so the
    # name is brought into scope here.
    from itertools import combinations

    rules = []
    for itemset, count in freq_itemsets.items():
        if len(itemset) < 2:
            continue
        for size in range(1, len(itemset)):
            for antecedent in map(frozenset, combinations(itemset, size)):
                confidence = count / freq_itemsets[antecedent]
                if confidence >= min_confidence:
                    rules.append((antecedent, itemset - antecedent, confidence))
    return rules


transactions = [
    ['Sausage', 'Peanut', 'Beer'],
    ['Peanut', 'Beer', 'Apple'],
    ['Apple', 'Milk'],
    ['Sausage', 'Peanut', 'Apple'],
    ['Sausage', 'Peanut', 'Beer', 'Milk'],
    ['Sausage', 'Peanut', 'Beer', 'Apple'],
]
min_sup, min_conf = 3, 0.7

# NOTE(review): `apriori` is not defined in the visible source (see the note
# above); it has to be restored before this driver can run.
freq_itemsets = apriori(transactions, min_sup)
rules = generate_rules(freq_itemsets, min_conf)

print("Association Rules:")
for ant, cons, conf in rules:
    print(f"{sorted(ant)} => {sorted(cons)} (Conf: {conf:.2f})")

# --------------------------------------------------
from collections import Counter
import pandas as pd


class TreeNode:
    """One node of an FP-tree."""

    def __init__(self, name, count, parent):
        self.name = name          # item stored in this node ("Null" for the root)
        self.count = count        # number of inserted paths passing through the node
        self.parent = parent      # parent TreeNode, or None for the root
        self.children = {}        # item -> child TreeNode
        self.node_link = None     # next node holding the same item, if any


def build_tree(data, min_sup):
    """Build an FP-tree and its header table.

    Args:
        data: iterable of transactions (each an iterable of hashable items).
        min_sup: minimum number of occurrences for an item to be kept.

    Returns:
        ``(root, header)`` where ``header`` maps each kept item to
        ``[count, first_node]``, or ``(None, None)`` when no item reaches
        ``min_sup``.
    """
    counts = Counter(item for trans in data for item in trans)
    header = {k: [v, None] for k, v in counts.items() if v >= min_sup}
    if not header:
        return None, None

    # One global order (count descending, ties by first appearance) shared by
    # all transactions.  Sorting each transaction on the count alone lets tied
    # items keep their per-transaction order, which can put the same two items
    # in opposite orders on different paths.
    ordered = sorted(header, key=lambda k: header[k][0], reverse=True)
    rank = {item: pos for pos, item in enumerate(ordered)}

    root = TreeNode("Null", 1, None)
    for trans in data:
        items = sorted((i for i in trans if i in header), key=rank.__getitem__)
        current = root
        for item in items:
            if item not in current.children:
                new_node = TreeNode(item, 0, current)
                current.children[item] = new_node
                # Update node link: append the new node to the item's chain.
                if header[item][1] is None:
                    header[item][1] = new_node
                else:
                    ptr = header[item][1]
                    while ptr.node_link:
                        ptr = ptr.node_link
                    ptr.node_link = new_node
            current = current.children[item]
            current.count += 1
    return root, header


def mine_tree(header, min_sup, prefix, found):
    """Recursively collect itemsets from an FP-tree header table.

    Args:
        header: header table as returned by ``build_tree`` (may be ``None``).
        min_sup: minimum support used when building conditional trees.
        prefix: set of items already fixed on this recursion path.
        found: list that receives ``(itemset, count)`` tuples; mutated in place.
    """
    if not header:
        return

    for item, (count, node_ptr) in sorted(header.items(), key=lambda x: x[1][0]):
        new_set = prefix | {item}
        found.append((new_set, count))

        # Find prefix paths: walk every node of this item via the node links
        # and climb to the root, collecting the ancestors' items.
        paths = []
        curr_node = node_ptr
        while curr_node is not None:
            path = []
            parent = curr_node.parent
            # The root is the only node without a parent; testing that instead
            # of the name keeps an item literally called "Null" working.
            while parent is not None and parent.parent is not None:
                path.append(parent.name)
                parent = parent.parent
            if path:
                paths.extend([path] * curr_node.count)
            curr_node = curr_node.node_link

        cond_tree, cond_header = build_tree(paths, min_sup)
        if cond_header:
            mine_tree(cond_header, min_sup, new_set, found)


# Execution
transactions = [
    ['Sausage', 'Peanut', 'Beer'],
    ['Peanut', 'Beer', 'Apple'],
    ['Apple', 'Milk'],
    ['Sausage', 'Peanut', 'Apple'],
    ['Sausage', 'Peanut', 'Beer', 'Milk'],
    ['Sausage', 'Peanut', 'Beer', 'Apple'],
]

# Use one threshold for both building and mining; with a lower threshold for
# the top-level tree, items below the mining threshold were reported too.
min_sup = 3
tree, header = build_tree(transactions, min_sup)
patterns = []
mine_tree(header, min_sup, set(), patterns)

df = pd.DataFrame(patterns, columns=["Itemset", "Frequency"])
print(df.sort_values("Frequency", ascending=False).reset_index(drop=True))

# --------------------------------------------------
import pandas as pd
import numpy as np


def get_entropy(s):
    """Return the base-2 entropy of the value distribution of Series ``s``."""
    p = s.value_counts(normalize=True)
    return -(p * np.log2(p)).sum()


def build_tree(df, target, feats):
    """Build a decision tree by recursively splitting on the best feature.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: name of the target column.
        feats: list of feature column names still available for splitting.

    Returns:
        A leaf value (a target label) or a nested dict of the form
        ``{feature: {feature_value: subtree}}``.
    """
    vals = df[target].unique()
    if len(vals) == 1:
        return vals[0]
    if not feats:
        return df[target].mode()[0]

    # Find best feature using Information Gain
    base_ent = get_entropy(df[target])
    total = len(df)
    gains = {}
    for f in feats:
        weighted = sum(
            len(sub) / total * get_entropy(sub[target])
            for _, sub in df.groupby(f)
        )
        gains[f] = base_ent - weighted

    best = max(gains, key=gains.get)
    remaining = [i for i in feats if i != best]

    # Recursive tree building
    branches = {}
    for v, sub in df.groupby(best):
        branches[v] = build_tree(sub, target, remaining)
    return {best: branches}


def predict(tree, query):
    """Walk ``tree`` using the feature values in ``query``.

    Returns the leaf reached, or ``"Unknown"`` when the query's value for a
    split feature has no branch in the tree.
    """
    if not isinstance(tree, dict):
        return tree
    root = next(iter(tree))
    val = query.get(root)
    if val not in tree[root]:
        return "Unknown"
    return predict(tree[root][val], query)


df = pd.read_csv('data/class.csv')
target_col = 'Play'
features = [c for c in df.columns if c != target_col]

tree = build_tree(df, target_col, features)
print("Tree Structure:", tree)

query = {'Outlook': 'Rain', 'Temp': 'Cool', 'Humidity': 'Normal', 'Wind': 'Weak'}
print("Prediction:", predict(tree, query))

# --------------------------------------------------
import pandas as pd


def train_nb(df, target):
    """Fit a categorical Naive Bayes model with add-one smoothing.

    Args:
        df: DataFrame with categorical feature columns and the target column.
        target: name of the target column.

    Returns:
        ``{'prior': {class: probability},
           'lk': {class: {feature: {value: probability}}}}``.
    """
    classes = df[target].unique()
    features = [c for c in df.columns if c != target]
    model = {
        'prior': df[target].value_counts(normalize=True).to_dict(),
        'lk': {},
    }

    for cls in classes:
        df_c = df[df[target] == cls]
        likelihoods = {}
        for f in features:
            # Count over every level seen anywhere in the data, so a level
            # absent from this class gets the smoothed 1 / (n + levels)
            # instead of falling through to the lookup default.
            levels = df[f].dropna().unique()
            counts = df_c[f].value_counts().reindex(levels, fill_value=0)
            likelihoods[f] = ((counts + 1) / (len(df_c) + len(levels))).to_dict()
        model['lk'][cls] = likelihoods
    return model


def predict_nb(model, query):
    """Return the class with the highest prior-times-likelihood for ``query``.

    Args:
        model: model dict as returned by ``train_nb``.
        query: mapping ``feature -> value``.
    """
    probs = {}
    for cls, prior in model['prior'].items():
        p = prior
        for f, val in query.items():
            # Get smoothed probability or default to small value if unseen
            p *= model['lk'][cls][f].get(val, 1 / 100)
        probs[cls] = p
    return max(probs, key=probs.get)


df = pd.read_csv('data/class.csv')
model = train_nb(df, 'Play')

query = {'Outlook': 'Rain', 'Temp': 'Mild', 'Humidity': 'High', 'Wind': 'Weak'}
print(f"Query: {query}")
print(f"Predicted: {predict_nb(model, query)}")

# --------------------------------------------------
import numpy as np
import pandas as pd


def svm_fit(X, y, lr=0.001, lambda_param=0.01, n_iters=1000):
    """Train a linear SVM with per-sample gradient steps.

    Args:
        X: 2-D array, one row per sample.
        y: 1-D array of labels; the smallest label is mapped to -1 and every
            other label to +1.
        lr: learning rate.
        lambda_param: regularisation strength.
        n_iters: number of passes over the data.

    Returns:
        ``(w, b, unique_classes)``; the decision value is ``X @ w - b``.
    """
    n_features = X.shape[1]
    unique_classes = np.unique(y)
    y_transformed = np.where(y == unique_classes[0], -1, 1)

    w = np.zeros(n_features)
    b = 0

    # Gradient Descent
    for _ in range(n_iters):
        for x_i, y_i in zip(X, y_transformed):
            if y_i * (np.dot(x_i, w) - b) >= 1:
                # Margin satisfied: only the regularisation term contributes.
                w -= lr * (2 * lambda_param * w)
            else:
                w -= lr * (2 * lambda_param * w - x_i * y_i)
                b -= lr * y_i
    return w, b, unique_classes


def svm_predict(X, w, b, classes):
    """Return, for each row of ``X``, the class on its side of the hyperplane."""
    approx = np.dot(X, w) - b
    indices = np.where(approx >= 0, 1, 0)
    return classes[indices]


data = {
    'Hours': [1, 2, 3, 7, 8, 9, 2, 8],
    'Attendance': [30, 40, 50, 80, 90, 95, 20, 70],
    'Result': [0, 0, 0, 1, 1, 1, 0, 1],
}
df = pd.DataFrame(data)
X = df[['Hours', 'Attendance']].values
y = df['Result'].values

weights, bias, classes = svm_fit(X, y, lr=0.001, n_iters=5000)

test_data = np.array([[2, 35], [6, 85]])
predictions = svm_predict(test_data, weights, bias, classes)

print("Predictions: ")
for i, pred in enumerate(predictions):
    status = "Pass" if pred == 1 else "Fail"
    print(
        f"Student {i+1} (Hours: {test_data[i][0]}, "
        f"Attendance: {test_data[i][1]}%): {status}"
    )

# --------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

# Generate linear data
X = np.random.rand(200) * 5
W_act = np.random.randint(1, 5)
B_act = np.random.randint(5)
Y = W_act * X + B_act + np.random.randn(200) * 2

# Linear Regression
x_mean = np.mean(X)
y_mean = np.mean(Y)
W = np.sum((X - x_mean) * (Y - y_mean)) / np.sum((X - x_mean) ** 2)
B = y_mean - W * x_mean

# Visualization
plt.figure(figsize=(10, 5))
plt.scatter(X, Y, label="Data Points", alpha=0.6)
X_line = np.linspace(0, 5, 10)
Z_line = W * X_line + B
plt.plot(X_line, Z_line, color='orange', linewidth=3)
plt.title("Simple Linear Regression")
plt.legend()
plt.show()

# --------------------------------------------------
import numpy as np
import pandas as pd

# Dummy data
data = {
    "YearsExperience": [1.1, 1.3, 1.5, 2.0, 2.2, 2.9, 3.0, 3.2, 3.2, 3.7],
    "Certifications": [1, 2, 1, 3, 2, 4, 3, 2, 5, 4],
    "Salary": [39.34, 46.20, 37.73, 43.52, 39.89, 56.64, 60.15, 54.44, 56.44, 57.18],
}
df = pd.DataFrame(data)
X = df[["YearsExperience", "Certifications"]].values
y = df["Salary"].values

train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Prepend a column of ones so the first weight acts as the intercept.
X_b = np.c_[np.ones((len(X_train), 1)), X_train]
# Least-squares solve instead of explicitly inverting X_b.T @ X_b, which is
# less accurate and fails outright when that matrix is singular.
weights, *_ = np.linalg.lstsq(X_b, y_train, rcond=None)
intercept = weights[0]
coefficients = weights[1:]


def predict(input_data, intercept, coefficients):
    """Return ``input_data @ coefficients + intercept``."""
    return input_data.dot(coefficients) + intercept


predictions = predict(X_test, intercept, coefficients)
mse = np.mean((predictions - y_test) ** 2)

print(f"Actual values: {y_test}")
print(f"Predicted values: {predictions}")
print(f"Mean Squared Error: {mse:.4f}")

# --------------------------------------------------
import numpy as np
from cluster_tools import generate_data, plot_clusters


def kmeans(X, k, max_iters=100):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's iteration.

    Args:
        X: 2-D array, one row per point.
        k: number of clusters.
        max_iters: maximum number of update steps.

    Returns:
        ``(centroids, labels)``; ``labels[i]`` is the index of the centroid
        nearest to ``X[i]``.
    """
    def nearest(centres):
        distances = np.linalg.norm(X[:, np.newaxis] - centres, axis=2)
        return np.argmin(distances, axis=1)

    # Initialize centroids randomly (fixed seed, so runs are repeatable)
    rng = np.random.default_rng(0)
    initial_indices = rng.choice(X.shape[0], k, replace=False)
    centroids = X[initial_indices]

    for i in range(max_iters):
        labels = nearest(centroids)
        # An empty cluster keeps its previous centroid.
        new_centroids = np.array([
            X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])
        if np.allclose(centroids, new_centroids):
            print(f"Algorithm converged at iteration {i}")
            break
        centroids = new_centroids

    # Assign once more so the labels match the centroids actually returned,
    # also when the loop ran out of iterations (or never ran).
    labels = nearest(centroids)
    return centroids, labels


K = 4
data = generate_data(n_samples=400, centers=K)
final_centroids, final_labels = kmeans(data, k=K)
plot_clusters(data, final_labels, final_centroids, k=K)

# --------------------------------------------------
import numpy as np
from cluster_tools import generate_data, plot_clusters


def k_medioids(X, k, max_iters=100):
    """Cluster the rows of ``X`` around ``k`` medoids (actual data points).

    Args:
        X: 2-D array, one row per point.
        k: number of clusters.
        max_iters: maximum number of update steps.

    Returns:
        ``(medoids, labels)``; ``labels[i]`` is the index of the medoid
        nearest to ``X[i]``.
    """
    def nearest(centres):
        distances = np.linalg.norm(X[:, np.newaxis] - centres, axis=2)
        return np.argmin(distances, axis=1)

    rng = np.random.default_rng(42)
    medoid_indices = rng.choice(X.shape[0], k, replace=False)
    medoids = X[medoid_indices]  # fancy indexing copies, so X is not modified below

    for i in range(max_iters):
        labels = nearest(medoids)
        # Snapshot of this iteration's starting medoids.  Comparing against
        # X[medoid_indices] would always compare against the *initial* medoids,
        # because the index array is never updated.
        previous = medoids.copy()

        for j in range(k):
            cluster_points = X[labels == j]
            if len(cluster_points) > 0:
                dist_matrix = np.linalg.norm(
                    cluster_points[:, np.newaxis] - cluster_points, axis=2
                )
                # New medoid: the member with the smallest total distance to
                # all other members of its cluster.
                best_point_idx = np.argmin(dist_matrix.sum(axis=1))
                medoids[j] = cluster_points[best_point_idx]

        if np.allclose(medoids, previous):
            print(f"K-Medoids converged at iteration {i}")
            break

    # Assign once more so the labels match the medoids actually returned.
    labels = nearest(medoids)
    return medoids, labels


K = 4
data = generate_data(n_samples=400, centers=K)
final_centroids, final_labels = k_medioids(data, k=K)
plot_clusters(data, final_labels, final_centroids, k=K)

# --------------------------------------------------
import numpy as np
from cluster_tools import generate_data, plot_clusters


def agglomerative(X, k):
    """Merge points bottom-up until ``k`` clusters remain.

    The distance between two clusters is the smallest pairwise distance
    between their members.

    Args:
        X: 2-D array, one row per point.
        k: number of clusters to stop at.

    Returns:
        ``(centroids, labels)`` where each centroid is the mean of its
        cluster's points.
    """
    n_samples = X.shape[0]
    clusters = [[i] for i in range(n_samples)]

    dist_matrix = np.linalg.norm(X[:, np.newaxis] - X, axis=2)
    np.fill_diagonal(dist_matrix, np.inf)

    while len(clusters) > k:
        min_dist = np.inf
        to_merge = (0, 0)
        for i, members_i in enumerate(clusters):
            for j in range(i + 1, len(clusters)):
                current_min = np.min(dist_matrix[np.ix_(members_i, clusters[j])])
                if current_min < min_dist:
                    min_dist = current_min
                    to_merge = (i, j)

        idx_i, idx_j = to_merge
        # idx_j > idx_i, so removing it does not shift idx_i.
        clusters[idx_i].extend(clusters.pop(idx_j))

    labels = np.zeros(n_samples, dtype=int)
    centroids = np.zeros((k, X.shape[1]))
    for cluster_id, point_indices in enumerate(clusters):
        labels[point_indices] = cluster_id
        centroids[cluster_id] = np.mean(X[point_indices], axis=0)
    return centroids, labels


K = 4
data = generate_data(n_samples=150, centers=K)
final_centroids, final_labels = agglomerative(data, k=K)
plot_clusters(data, final_labels, final_centroids, k=K)

# --------------------------------------------------
import numpy as np
from cluster_tools import generate_data, plot_clusters


def dbscan(X, eps=0.5, min_samples=5):
    """Density-based clustering of the rows of ``X``.

    Args:
        X: 2-D array, one row per point.
        eps: neighbourhood radius.
        min_samples: minimum neighbourhood size (the point itself included)
            for a point to start or extend a cluster.

    Returns:
        ``(centroids, labels)``; ``labels`` holds -1 for points that belong
        to no cluster, and ``centroids`` the mean of each cluster's points.
    """
    n_samples = X.shape[0]
    labels = np.full(n_samples, -1)
    cluster_id = 0

    dist_matrix = np.linalg.norm(X[:, np.newaxis] - X, axis=2)
    within_eps = dist_matrix <= eps

    for i in range(n_samples):
        if labels[i] != -1:
            continue
        neighbors = np.flatnonzero(within_eps[i])
        if len(neighbors) < min_samples:
            continue

        labels[i] = cluster_id
        # The seed list grows while it is being walked, hence the index loop.
        seeds = list(neighbors)
        idx = 0
        while idx < len(seeds):
            current_point = seeds[idx]
            if labels[current_point] == -1:
                labels[current_point] = cluster_id

            new_neighbors = np.flatnonzero(within_eps[current_point])
            if len(new_neighbors) >= min_samples:
                for neighbor in new_neighbors:
                    if labels[neighbor] == -1:
                        labels[neighbor] = cluster_id
                        seeds.append(neighbor)
            idx += 1
        cluster_id += 1

    unique_labels = [l for l in np.unique(labels) if l != -1]
    num_clusters = len(unique_labels)
    centroids = np.zeros((num_clusters, X.shape[1]))
    for idx, l in enumerate(unique_labels):
        centroids[idx] = np.mean(X[labels == l], axis=0)
    return centroids, labels


data = generate_data(n_samples=200, centers=5)
final_centroids, final_labels = dbscan(data, eps=1, min_samples=5)
plot_clusters(data, final_labels, final_centroids, k=len(final_centroids))

# --------------------------------------------------
import numpy as np


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``."""
    return 1 / (1 + np.exp(-x))


def d_sigmoid(x):
    """Derivative of ``sigmoid`` evaluated at ``x``."""
    y = sigmoid(x)
    return y * (1 - y)


def forward(w01, w12, b1, b2, x):
    """Return the scalar output of the two-layer network for input ``x``."""
    h = sigmoid(w01.dot(x) + b1)
    return sigmoid(w12.dot(h) + b2)[0]


def train(w01, w12, b1, b2, x, d):
    """Apply one backpropagation step for input ``x`` and target ``d``.

    Reads the module-level learning rate ``alpha``.  The array arguments are
    updated in place and returned together with the new ``b2``.
    """
    # Forward pass, keeping the pre-activations for the derivatives.
    u1 = w01.dot(x) + b1
    h = sigmoid(u1)
    u2 = (w12.dot(h) + b2)[0]
    y = sigmoid(u2)

    # Backward pass; delta1 uses w12 from before its update.
    e = d - y
    delta2 = e * d_sigmoid(u2)
    delta1 = (delta2 * w12.flatten()) * d_sigmoid(u1)

    w12 += alpha * delta2 * h
    b2 += alpha * delta2
    w01 += alpha * np.outer(delta1, x)
    b1 += alpha * delta1
    return w01, w12, b1, b2


alpha = 0.1
N_ITER = 1_000

w01 = np.random.rand(2, 2) * 2 - 1
w12 = np.random.rand(1, 2) * 2 - 1
b1 = np.zeros(2)
b2 = 0

for _ in range(N_ITER):
    x = np.random.randint(0, 2, (2,))
    t = int(x[0] ^ x[1])
    w01, w12, b1, b2 = train(w01, w12, b1, b2, x, t)

print("Classification:")
for x in [(0, 0), (0, 1), (1, 0), (1, 1)]:
    y = forward(w01, w12, b1, b2, np.array(x))
    print(f"{x} -> {round(y)}")

# --------------------------------------------------