Advertisement
Guest User

Iris Dataset PCA in Julia

a guest
Mar 16th, 2019
171
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Julia 2.04 KB | None | 0 0
  1. using DataFrames;
  2. using CSV;
  3. using LinearAlgebra;
  4. using Statistics;
  5. using StatsPlots;
  6.  
  # Load the iris data set. The file is read with `header=false`, i.e. its first
  # row is treated as data, and column names are assigned explicitly afterwards.
  filename = "/home/user/path/to/iris.data"
  # `CSV.File` + `DataFrame` replaces the sink-less `CSV.read(filename; ...)`
  # call, which current CSV.jl releases no longer accept.
  iris = DataFrame(CSV.File(filename; header=false))
  # Remove rows that contain missing values, without tightening the column
  # element types (`disallowmissing=false`).
  iris = dropmissing(iris, disallowmissing=false)
  header = [:sepal_length, :sepal_width, :petal_length, :petal_width, :species]
  # `rename!` with a vector of symbols supersedes the removed `names!`.
  rename!(iris, header)
  13.  
  """
      pca(X)

  Express the standardized rows of `X` in the eigenvector basis of the
  correlation matrix of `X`.

  Every column of `X` is centered on its mean and divided by its sample
  standard deviation. The correlation matrix of the standardized data is then
  eigendecomposed and the standardized data is multiplied by the eigenvector
  matrix.

  # Arguments
  - `X`: numeric matrix with one observation per row and one feature per column.

  # Returns
  A matrix of the same size as `X`. Columns are ordered by *increasing*
  eigenvalue, so the last column is the first principal component, the
  second-to-last column the second one, and so on.

  # Throws
  - `ArgumentError` if `X` has fewer than two rows, or if a column has zero or
    non-finite standard deviation (standardization would produce NaN/Inf).
  """
  function pca(X)
      n = size(X, 1)
      n >= 2 || throw(ArgumentError("pca needs at least two observations, got $n"))

      sigma = std(X; dims=1)
      all(s -> isfinite(s) && s > 0, sigma) ||
          throw(ArgumentError("every column of X must have a finite, non-zero variance"))

      # Standardize: zero mean and unit sample standard deviation per column
      X_s = (X .- mean(X; dims=1)) ./ sigma

      # Correlation matrix of the standardized data. Wrapping it in `Symmetric`
      # selects the symmetric eigensolver, which returns real eigenvalues in
      # increasing order together with their matching eigenvectors.
      gram = Symmetric(transpose(X_s) * X_s / (n - 1))

      # One decomposition is enough: only the eigenvectors are needed here.
      return X_s * eigen(gram).vectors
  end
  31.  
  # Convert the four numeric iris flower features into matrix form and perform
  # Principal Component Analysis with the `pca` function defined above.
  features = header[1:4]
  # `Matrix(df)` replaces the deprecated `convert(Matrix, df)`.
  X = Matrix(iris[:, features])
  irispca = pca(X)
  # Name the score columns x1, x2, ... explicitly so the plotting code can refer
  # to them by name; a bare `DataFrame(matrix)` is rejected by current
  # DataFrames.jl releases.
  irispca = DataFrame(irispca, [Symbol("x", i) for i in 1:size(irispca, 2)])
  # Property access replaces the removed single-argument `df[:col]` indexing.
  irispca.species = iris.species
  40.  
  # Plot datapoints on the plane formed by the first and second principal
  # components. `pca` orders its output columns by increasing eigenvalue, so the
  # first and second principal components are the last two columns, :x4 and :x3.
  plt1 = @df irispca scatter(:x4, :x3, group=:species,
      title = "Iris Dataset PCA",
      xlabel = "First principal component (z-score)",
      ylabel = "Second principal component (z-score)",
      m = (0.7, [:cross :hex :star7], 5),
      bg = RGB(0.2, 0.2, 0.2),
  )
  48.  
  # Scatter the raw measurements: sepal length against sepal width, one marker
  # shape per species.
  # NOTE: unlike the PCA plot, the data shown here is not standardized.
  plt2 = @df iris scatter(
      :sepal_length,
      :sepal_width,
      group = :species,
      title = "Iris Dataset",
      xlabel = "Sepal length (cm)",
      ylabel = "Sepal width (cm)",
      marker = (0.7, [:cross :hex :star7], 5),
      background_color = RGB(0.2, 0.2, 0.2),
  )

  # Show both scatter plots in a single figure
  plot(plt1, plt2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement