Untitled

a guest | Jan 21st, 2015 | Python
from pyspark.mllib.regression import LinearRegressionWithSGD, LabeledPoint
from pyspark import SparkContext

sc = SparkContext(appName="mllib_simple_accuracy")

# minPartitions doesn't guarantee exactly that many partitions,
# only that you won't end up with fewer than that many.
raw_data = sc.textFile("data_yequalsx.csv", minPartitions=10)

# Parse each row as floats; the last column is the label, the rest are features.
data = raw_data.map(lambda line: [float(x) for x in line.split(",")]) \
               .map(lambda entry: LabeledPoint(entry[-1], entry[:-1])) \
               .zipWithIndex()
test_samples = data.count() // 10  # hold out 10% of the rows as a test set

# Tuple-unpacking lambdas are Python 2 only, so index into the (point, index) pair.
training_data = data.filter(lambda pair: pair[1] >= test_samples).map(lambda pair: pair[0])
test_data = data.filter(lambda pair: pair[1] < test_samples).map(lambda pair: pair[0])

model = LinearRegressionWithSGD.train(training_data, step=0.01, iterations=100,
                                      regType="l2", regParam=0.0001, intercept=True)
print(model.weights)    # public attributes; the private _coeff/_intercept names don't exist
print(model.intercept)

mse = test_data.map(lambda lp: (lp.label - model.predict(lp.features)) ** 2) \
               .reduce(lambda x, y: x + y) / test_samples
print("Mean Squared Error: %s" % str(mse))

sc.stop()
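
The script assumes data_yequalsx.csv already exists, with feature columns first and the label in the last column. As a minimal sketch (the row count, noise level, and generator below are assumptions, not part of the original paste), a compatible single-feature y = x dataset could be produced like this:

import random

# Sketch only: one feature column, label last, matching the parser above.
with open("data_yequalsx.csv", "w") as f:
    for _ in range(1000):               # arbitrary row count
        x = random.uniform(0.0, 100.0)
        y = x + random.gauss(0.0, 1.0)  # label tracks the feature: y = x plus noise
        f.write("%f,%f\n" % (x, y))

On data like this, model.weights should land near 1 and model.intercept near 0 once SGD converges, and raw_data.getNumPartitions() reports how many partitions the textFile call actually created.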