Not a member of Pastebin yet? Sign up — it unlocks many cool features!
"""K-means clustering of the zoo dataset with PySpark.

Loads datasets/zoo.csv (first column `_c0` is the animal name / label,
remaining columns are numeric features), runs an elbow-curve sweep over
k = 2..9 using the within-set sum of squared errors (WSSSE), plots the
curve, then fits a final model with k = 5 and shows the predictions.
"""
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt

spark = SparkSession.builder.appName("kmeans").getOrCreate()

# The builtin csv reader already handles this; the extra
# .format("org.apache.spark.csv") indirection was redundant.
dataset = spark.read.csv("datasets/zoo.csv", header=False, inferSchema=True)

# All columns except the identifier column `_c0` are features.
feat_cols = dataset.drop("_c0").columns
assembler = VectorAssembler(inputCols=feat_cols, outputCol="features")
vdf = assembler.transform(dataset).select("_c0", "features")

# Elbow method: record the training cost (WSSSE) for each k.
# NOTE: KMeansModel.computeCost was deprecated in Spark 2.4 and removed
# in 3.0; summary.trainingCost is the same metric and works on both.
ks = range(2, 10)
costs = []
for k in ks:
    model = KMeans(k=k).fit(vdf)
    costs.append(model.summary.trainingCost)

plt.plot(ks, costs)
plt.show()

# k = 5 chosen from the elbow curve above.
kmm = KMeans(k=5).fit(vdf)
pred = kmm.transform(vdf)
pred.show(truncate=True)

spark.stop()  # release the Spark session's resources
Add Comment
Please sign in to add a comment.