from pyspark import SparkContext
from pyspark.mllib.regression import LinearRegressionWithSGD, LabeledPoint

sc = SparkContext(appName="mllib_simple_accuracy")

# minPartitions doesn't guarantee that many partitions, only that you won't get fewer.
raw_data = sc.textFile("data_yequalsx.csv", minPartitions=10)

# Parse CSV rows into LabeledPoints (last column is the label); index each point so the first 10% can be held out for testing.
data = raw_data.map(lambda line: [float(x) for x in line.split(",")]) \
               .map(lambda entry: LabeledPoint(entry[-1], entry[:-1])) \
               .zipWithIndex()

test_samples = data.count() // 10
training_data = data.filter(lambda pair: pair[1] >= test_samples).map(lambda pair: pair[0])
test_data = data.filter(lambda pair: pair[1] < test_samples).map(lambda pair: pair[0])

model = LinearRegressionWithSGD.train(training_data, step=0.01, iterations=100,
                                      regType="l2", regParam=0.0001, intercept=True)
print(model.weights)
print(model.intercept)

# Mean squared error over the held-out points.
mse = test_data.map(lambda lp: (lp.label - model.predict(lp.features)) ** 2) \
               .reduce(lambda x, y: x + y) / test_samples
print("Mean Squared Error: %s" % str(mse))

sc.stop()
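
For completeness, a minimal sketch of how a compatible input file might be generated. data_yequalsx.csv is not included in the paste, so its exact contents are an assumption: the filename only suggests a roughly y = x relationship, with the target in the last CSV column.

# Hypothetical generator for a toy data_yequalsx.csv; format and noise level
# are assumptions, since the real file is not shown in the paste.
import random

with open("data_yequalsx.csv", "w") as f:
    for _ in range(1000):
        x = random.uniform(0.0, 10.0)
        y = x + random.gauss(0.0, 0.1)  # filename suggests y = x, plus a little noise
        f.write("%f,%f\n" % (x, y))

With a local Spark install, the script above can then be run with spark-submit, e.g. spark-submit mllib_simple_accuracy.py (the script filename here is assumed from the appName).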