# Untitled

#Model Tuning
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Wrap the classifier in a single-stage pipeline; this pipeline is the
# estimator that CrossValidator refits for every grid point and fold.
pipeLine2 = Pipeline()
pipeLine2.setStages([classifier])

# The grid has to reference a param owned by a stage of `pipeLine2`.
# It previously used `lr.regParam`, but `lr` is not part of this pipeline
# (and is only created further down in this file), so that grid could not
# change how `classifier` is trained. `stepSize` is the classifier param
# inspected just below, so tune that instead.
# NOTE(review): for a multilayer perceptron, stepSize is only applied when
# solver='gd' -- confirm how `classifier` is configured.
paramGrid2 = (
    ParamGridBuilder()
    .addGrid(classifier.stepSize, [0.1, 0.01])
    .build()
)
# Returns the description and current/default value of the tuned param
# (only shown when run interactively).
classifier.explainParam('stepSize')

# 2-fold cross-validation over the grid, scored with the shared `evaluator`.
cv = CrossValidator(
    estimator=pipeLine2,
    estimatorParamMaps=paramGrid2,
    evaluator=evaluator,
    numFolds=2,
)
cvm = cv.fit(train)

# Score the model selected by cross-validation on `test`.
predictions = cvm.transform(test)
evaluator.evaluate(predictions)


# Compare models: inspect the winner of the cross-validation run.
# `cvm.bestModel` is the fitted form of `pipeLine2`, whose only stage is the
# classifier, so stage 0 is the fitted classifier model itself.
# The bare expressions below have no effect when run as a script; they only
# display a value in an interactive session (notebook / REPL).
bestModel = cvm.bestModel.stages[0]
# NOTE(review): `weights` / `layers` presumably expose the learned network
# weights and the layer sizes -- confirm against the classifier type, which
# is defined outside this section.
bestModel.weights.array

bestModel.layers

# Evaluator metric averaged over the folds, one entry per param map in the
# grid passed to the CrossValidator.
cvm.avgMetrics

#Try other classifiers
from pyspark.ml.classification import (LogisticRegression,
                                       DecisionTreeClassifier,
                                       RandomForestClassifier
                                      )

# Baseline comparison: fit three stock classifiers on `train`, apply each to
# `test`, and score the resulting predictions with the shared `evaluator`.

# Logistic regression, default settings.
lr = LogisticRegression()
lrm = lr.fit(train)
lrm_results = lrm.transform(test)
evaluation_lrm = evaluator.evaluate(lrm_results)

# Decision tree, default settings.
dt = DecisionTreeClassifier()
dtm = dt.fit(train)
dtm_results = dtm.transform(test)
evaluation_dtm = evaluator.evaluate(dtm_results)

# Random forest; the only non-default setting is maxBins=10.
rf = RandomForestClassifier(maxBins=10)
rfm = rf.fit(train)
rfm_results = rfm.transform(test)
evaluation_rfm = evaluator.evaluate(rfm_results)

# Report the three scores side by side.
print('evaluation of logistic regression model = %g' % evaluation_lrm)
print('evaluation of decision tree model = %g' % evaluation_dtm)
print('evaluation of random forest model = %g' % evaluation_rfm)