# coding: utf-8

# In[1]:

from pyspark.sql import SparkSession


# In[2]:

spark = SparkSession.builder.appName("nlpProject").getOrCreate()


# In[3]:

from pyspark.ml.feature import Tokenizer, RegexTokenizer, HashingTF, IDF


# In[7]:

# load the tab-separated SMS spam dataset; it has no header row, so Spark names the columns _c0 and _c1
data = spark.read.csv("SMSSpamCollection", inferSchema=True, sep="\t")


# In[8]:

data.printSchema()


# In[9]:

# rename the default columns: _c0 holds the ham/spam label, _c1 the message text
data = data.withColumnRenamed("_c0", "class").withColumnRenamed("_c1", "text")


# In[10]:

data.columns


# In[11]:

data.show(5)


# In[12]:

from pyspark.sql.functions import length


# In[13]:

# add the message length as an extra numeric feature
data = data.withColumn("length", length(data["text"]))


# In[14]:

# show the average message length for ham and spam
data.groupBy("class").mean().show()


# # All the NLP preprocessing

# In[15]:

from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer


# In[16]:

# split each message into word tokens
tokenizer = Tokenizer(inputCol="text", outputCol="tokenText")


# In[17]:

# drop common English stop words from the token list
stopRemove = StopWordsRemover(inputCol="tokenText", outputCol="stopToken")


# In[18]:

# turn the remaining tokens into term-frequency count vectors
countVec = CountVectorizer(inputCol="stopToken", outputCol="cVec")


# In[19]:

# reweight the counts with inverse document frequency (TF-IDF)
idf = IDF(inputCol="cVec", outputCol="tfIdf")


# In[20]:

# encode the ham/spam class as a numeric label
hamSpamNumeric = StringIndexer(inputCol="class", outputCol="label")


# In[21]:

from pyspark.ml.feature import VectorAssembler


# In[22]:

# combine the TF-IDF vector and the message length into one feature vector
assembler = VectorAssembler(inputCols=["tfIdf", "length"], outputCol="features")


# In[23]:

from pyspark.ml.classification import NaiveBayes


# In[24]:

nb = NaiveBayes(labelCol="label", featuresCol="features")


# In[25]:

from pyspark.ml import Pipeline


# In[26]:

# chain all the preprocessing stages into a single pipeline
dataPrepPipe = Pipeline(stages=[hamSpamNumeric, tokenizer, stopRemove, countVec, idf, assembler])


# In[27]:

cleaner = dataPrepPipe.fit(data)


# In[28]:

cleanData = cleaner.transform(data)


# In[29]:

# keep only what the classifier needs: the label and the assembled features
cleanData = cleanData.select(["label", "features"])


# ### ML Session

# In[30]:

# 70/30 train/test split
train, test = cleanData.randomSplit([0.7, 0.3])


# In[31]:

spamDetector = nb.fit(train)


# In[32]:

result = spamDetector.transform(test)


# In[44]:

# peek at a few messages the model did not predict as the majority (ham) class
result.filter(result["prediction"] != 0).show(5)
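

# Not in the original paste: a quick, optional sanity check of how the predictions
# line up with the true labels on the test set; it only uses columns that are
# already present in `result`.

# In[ ]:

# count each (label, prediction) pair as a rough confusion-matrix view
result.groupBy("label", "prediction").count().show()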


# In[34]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# available metrics: weightedPrecision | weightedRecall | accuracy


# In[35]:

accEval = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label", predictionCol="prediction")


# In[36]:

accEval.evaluate(result)


# In[37]:

precEval = MulticlassClassificationEvaluator(metricName="weightedPrecision", labelCol="label", predictionCol="prediction")


# In[38]:

precEval.evaluate(result)


# In[39]:

recallEval = MulticlassClassificationEvaluator(metricName="weightedRecall", labelCol="label", predictionCol="prediction")


# In[40]:

recallEval.evaluate(result)


# In[ ]:
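
# Not part of the original paste: a minimal sketch of how one might score new,
# unseen messages with the fitted objects above (`cleaner` and `spamDetector`).
# The example strings and the `newMessages` name are made up for illustration.

from pyspark.sql.functions import length, lit

newMessages = spark.createDataFrame(
    [("WINNER!! You have been selected for a free prize, call now",),
     ("are we still meeting for lunch today?",)],
    ["text"],
)

# the fitted pipeline expects the same input columns as training:
# a length column for the VectorAssembler stage, and a class column
# for the StringIndexer stage (a placeholder value works here, since
# the classifier itself only reads the assembled features)
newMessages = newMessages.withColumn("length", length(newMessages["text"]))
newMessages = newMessages.withColumn("class", lit("ham"))

# run the same preprocessing, then classify;
# with this dataset the StringIndexer typically maps ham -> 0.0 and spam -> 1.0
newFeatures = cleaner.transform(newMessages)
spamDetector.transform(newFeatures).select("text", "prediction").show(truncate=False)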