# coding: utf-8

# In[1]:
from pyspark.sql import SparkSession

# In[2]:
spark = SparkSession.builder.appName("nlpProject").getOrCreate()

# In[3]:
from pyspark.ml.feature import Tokenizer, IDF

# In[7]:
# SMSSpamCollection is tab-separated with no header row: a ham/spam label, then the message text
data = spark.read.csv("SMSSpamCollection", inferSchema=True, sep="\t")
# In[8]:
data.printSchema()

# In[9]:
# rename the default columns to meaningful names
data = data.withColumnRenamed("_c0", "class").withColumnRenamed("_c1", "text")
# In[10]:
data.columns

# In[11]:
data.show(5)

# In[12]:
from pyspark.sql.functions import length

# In[13]:
# add the message length as an extra numeric feature
data = data.withColumn("length", length(data["text"]))

# In[14]:
# average message length per class; spam messages tend to be noticeably longer than ham,
# which is why length is kept as a feature alongside TF-IDF below
data.groupBy("class").mean().show()
# # All the NLP preprocessing

# In[15]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

# In[16]:
# split each message into word tokens
tokenizer = Tokenizer(inputCol="text", outputCol="tokenText")

# In[17]:
# remove common English stop words
stopRemove = StopWordsRemover(inputCol="tokenText", outputCol="stopToken")

# In[18]:
# term-frequency counts per message
countVec = CountVectorizer(inputCol="stopToken", outputCol="cVec")

# In[19]:
# down-weight terms that appear across many messages (TF-IDF)
idf = IDF(inputCol="cVec", outputCol="tfIdf")

# In[20]:
# encode the string labels as numeric: the more frequent class ("ham") becomes 0, "spam" becomes 1
hamSpamNumeric = StringIndexer(inputCol="class", outputCol="label")

# In[21]:
from pyspark.ml.feature import VectorAssembler

# In[22]:
# combine the TF-IDF vector and the message length into a single features column
assembler = VectorAssembler(inputCols=["tfIdf", "length"], outputCol="features")
# In[23]:
from pyspark.ml.classification import NaiveBayes

# In[24]:
nb = NaiveBayes(labelCol="label", featuresCol="features")

# In[25]:
from pyspark.ml import Pipeline

# In[26]:
# chain all the preprocessing stages so they can be fit and applied in one step
dataPrepPipe = Pipeline(stages=[hamSpamNumeric, tokenizer, stopRemove, countVec, idf, assembler])

# In[27]:
cleaner = dataPrepPipe.fit(data)

# In[28]:
cleanData = cleaner.transform(data)

# In[29]:
# keep only the columns the classifier needs
cleanData = cleanData.select(["label", "features"])
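
# In[ ]:
# Optional sketch, not part of the original flow: the classifier itself could be appended
# as a final Pipeline stage so preprocessing and model fitting happen together.
# The names fullPipe/fullModel are hypothetical; the two-step approach below is kept as-is.
# fullPipe = Pipeline(stages=[hamSpamNumeric, tokenizer, stopRemove, countVec, idf, assembler, nb])
# fullModel = fullPipe.fit(data)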
# ### ML Session

# In[30]:
# 70/30 train/test split
train, test = cleanData.randomSplit([0.7, 0.3])

# In[31]:
spamDetector = nb.fit(train)

# In[32]:
result = spamDetector.transform(test)

# In[44]:
# peek at a few test messages predicted as spam (prediction 1.0)
result.filter(result["prediction"] != 0).show(5)
# In[34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# available metricName values include: f1 | weightedPrecision | weightedRecall | accuracy

# In[35]:
accEval = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label", predictionCol="prediction")

# In[36]:
accEval.evaluate(result)

# In[37]:
precEval = MulticlassClassificationEvaluator(metricName="weightedPrecision", labelCol="label", predictionCol="prediction")

# In[38]:
precEval.evaluate(result)

# In[39]:
recallEval = MulticlassClassificationEvaluator(metricName="weightedRecall", labelCol="label", predictionCol="prediction")

# In[40]:
recallEval.evaluate(result)
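
# In[ ]:
# Added summary sketch (an addition, using only the evaluators defined above):
# print the three metrics together in one readable block.
for name, evaluator in [("accuracy", accEval),
                        ("weightedPrecision", precEval),
                        ("weightedRecall", recallEval)]:
    print(f"{name}: {evaluator.evaluate(result):.4f}")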
# In[ ]: