daily pastebin goal
23%
SHARE
TWEET

Untitled

a guest Nov 22nd, 2017 64 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from pyspark.sql import SparkSession
  2. from pyspark.ml.clustering import KMeans
  3. from pyspark.ml.feature import VectorAssembler
  4.  
  5. from pyspark.sql.types import IntegerType
  6. import matplotlib.pyplot as plt
  7.  
  8. spark = SparkSession.builder.appName("kmeans").getOrCreate()
  9. dataset = spark. \
  10.             read.\
  11.             format("org.apache.spark.csv"). \
  12.             option("inferSchema", True). \
  13.             csv("datasets/zoo.csv", header=False)
  14.  
  15. feat_cols = dataset.drop("_c0").columns
  16. vectorAss = VectorAssembler(inputCols=feat_cols, outputCol="features")
  17. vdf = vectorAss.transform(dataset)
  18. vdf = vdf.select("_c0", "features")
  19.  
  20. costs = []
  21. for k in xrange(2, 10):
  22.     kmeans = KMeans(k=k)
  23.     kmm = kmeans.fit(vdf)
  24.     c = kmm.computeCost(vdf)
  25.     costs.append(c)
  26.  
  27. plt.plot(xrange(2,10), costs)
  28. plt.show()
  29.  
  30. kmeans = KMeans(k=5)   # Escolhi 5 centroids
  31. kmm = kmeans.fit(vdf)
  32.  
  33. pred = kmm.transform(vdf)
  34. pred.show(truncate=True)
RAW Paste Data
Top