Advertisement
Guest User

Iris Dataset PCA in Julia

a guest
Mar 16th, 2019
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Julia 2.09 KB | None | 0 0
  1. using Plots;
  2. using Base;
  3. using LinearAlgebra;
  4. using Statistics;
  5. using DataFrames;
  6. using CSV;
  7. using StatsPlots;
  8.  
  # Load iris data.
  # The file is read without a header row, so the column names are supplied
  # explicitly and attached while parsing.
  header = [:sepal_length, :sepal_width, :petal_length, :petal_width, :species]
  filename = "/home/nchashch/BORG/SEM6/ITASOU/dataset/iris.data"
  # `DataFrame(CSV.File(...))` is accepted by both older and current CSV.jl
  # releases (current `CSV.read` requires an explicit sink argument), and
  # passing `header` here replaces the removed `names!` call.
  iris = DataFrame(CSV.File(filename; header=header))
  # Discard rows that contain missing fields; `disallowmissing=false` leaves
  # the column element types as they were.
  iris = dropmissing(iris, disallowmissing=false)
  15.  
  """
      pca(X)

  Express the standardized rows of `X` in the eigenvector basis of the
  correlation matrix of `X`.

  # Arguments
  - `X`: numeric matrix with one observation per row and one feature per column.

  # Returns
  A matrix with the same size as `X`. Columns are ordered by *increasing*
  eigenvalue, so the last column holds the first principal component, the
  second-to-last column the second principal component, and so on.
  """
  function pca(X)
      # Standardize the data by subtracting the column mean
      # and dividing by the column standard deviation
      X_s = (X .- mean(X, dims=1)) ./ std(X, dims=1)
      # Number of datapoints (rows) in the dataset
      n = size(X_s, 1)
      # Correlation matrix of the original features
      gram = transpose(X_s) * X_s / (n - 1)
      # NOTE: wrapping in `Symmetric` selects the symmetric eigensolver, which
      # returns real eigenvalues sorted in increasing order together with the
      # matching eigenvectors. Only the eigenvectors are needed here.
      vecs = eigen(Symmetric(gram)).vectors
      # Standardized data in the principal components (eigen) basis
      return X_s * vecs
  end
  33.  
  # Convert the four iris flower features into matrix form and run them
  # through the pca function defined above.
  features = header[1:4]
  X = Matrix(iris[:, features])
  # Name the projected columns explicitly: these are the names the plotting
  # code refers to (:x4, :x3), and passing them avoids relying on
  # auto-generated column names, which newer DataFrames.jl releases no longer
  # produce implicitly.
  irispca = DataFrame(pca(X), [:x1, :x2, :x3, :x4])
  # Attach the species labels so the projected points can be grouped by class.
  # Property access replaces the removed single-argument `df[:col]` indexing.
  irispca.species = iris.species
  42.  
  # Plot datapoints on the plane formed by the first and second principal components.
  # The columns produced by pca are ordered by increasing eigenvalue, so :x4 is
  # the first principal component and :x3 the second.
  plt1 = @df irispca scatter(:x4, :x3, group=:species,
                  title = "Iris Dataset PCA",
                  xlabel = "First principal component (z-score)",
                  ylabel = "Second principal component (z-score)",
                  m=(0.7, [:cross :hex :star7], 5),
                  bg=RGB(.2,.2,.2),
                  );
  50.  
  # Scatter the raw sepal measurements (length against width), one marker
  # shape per species.
  # NOTE: unlike the PCA plot, these values are not standardized.
  plt2 = @df iris scatter(
      :sepal_length,
      :sepal_width,
      title = "Iris Dataset",
      ylabel = "Sepal width (cm)",
      xlabel = "Sepal length (cm)",
      group = :species,
      background_color = RGB(0.2, 0.2, 0.2),
      marker = (0.7, [:cross :hex :star7], 5),
  );

  # Show the PCA projection and the raw-feature scatter side by side
  plot(plt1, plt2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement