Guest User

Untitled

a guest
Nov 22nd, 2017
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.90 KB | None | 0 0
  1. from pyspark.sql import SparkSession
  2. from pyspark.ml.clustering import KMeans
  3. from pyspark.ml.feature import VectorAssembler
  4.  
  5. from pyspark.sql.types import IntegerType
  6. import matplotlib.pyplot as plt
  7.  
  8. spark = SparkSession.builder.appName("kmeans").getOrCreate()
  9. dataset = spark. \
  10. read.\
  11. format("org.apache.spark.csv"). \
  12. option("inferSchema", True). \
  13. csv("datasets/zoo.csv", header=False)
  14.  
  15. feat_cols = dataset.drop("_c0").columns
  16. vectorAss = VectorAssembler(inputCols=feat_cols, outputCol="features")
  17. vdf = vectorAss.transform(dataset)
  18. vdf = vdf.select("_c0", "features")
  19.  
  20. costs = []
  21. for k in xrange(2, 10):
  22. kmeans = KMeans(k=k)
  23. kmm = kmeans.fit(vdf)
  24. c = kmm.computeCost(vdf)
  25. costs.append(c)
  26.  
  27. plt.plot(xrange(2,10), costs)
  28. plt.show()
  29.  
  30. kmeans = KMeans(k=5) # Escolhi 5 centroids
  31. kmm = kmeans.fit(vdf)
  32.  
  33. pred = kmm.transform(vdf)
  34. pred.show(truncate=True)
Add Comment
Please, Sign In to add comment