Advertisement
Guest User

Untitled

a guest
Sep 30th, 2016
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.76 KB | None | 0 0
import argparse
from timeit import default_timer

try:
    # Kept for backward compatibility only: time.clock was removed in
    # Python 3.8, and on Unix it reported CPU time rather than elapsed time.
    from time import clock
except ImportError:
    from time import perf_counter as clock

from pyspark.sql import SparkSession
from pyspark.mllib.clustering import KMeans
from numpy import fromstring

  9. parser = argparse.ArgumentParser()
  10. parser.add_argument('--master', help='Spark master URL (default: "local[*]")', default="local[*]")
  11.  
  12. parser.add_argument('--infile', help='where to find input data')
  13. parser.add_argument('--partitions', help='number of partitions to operate on (default=64)', type=int, default=64)
  14. parser.add_argument('--iterations', help='number of iterations in each training run (default=32)', type=int, default=32)
  15. parser.add_argument('--runs', help='number of training runs (default=10)', type=int, default=10)
  16. parser.add_argument('--clusters', help='number of cluster centers to find (default=128)', type=int, default=128)
  17. parser.add_argument('--config', metavar="KEY=VAL", help="add KEY=VAL to Spark's configuration", action='append', default=[], dest='config')
  18.  
  19.  
  20. if __name__ == "__main__":
  21. args = parser.parse_args()
  22. print(args)
  23. protospark = SparkSession.builder.appName("k-means-app").master(args.master)
  24. spark = reduce(lambda x, y: x.config(*y.split("=")), args.config, protospark).getOrCreate()
  25. runs = args.runs
  26. iterations = args.iterations
  27. partitions = args.partitions
  28. clusters = args.clusters
  29.  
  30. sc = spark.sparkContext
  31. rdd = sc.textFile(args.infile).map(lambda line: fromstring(line, sep=",")).repartition(partitions)
  32.  
  33. logger = sc._jvm.org.apache.log4j
  34. logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
  35.  
  36. start_time = clock()
  37. for run in (range(runs)):
  38. KMeans.train(rdd, clusters, iterations)
  39. end_time = clock()
  40.  
  41. sc.stop()
  42.  
  43. print("completed %d run%s in %f seconds" % (runs, (runs > 1 and "s" or ""), end_time - start_time))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement