Advertisement
Dundre32

Untitled

Apr 26th, 2020
790
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Scala 1.14 KB | None | 0 0
  1. import org.json4s._
  2. import org.json4s.jackson.JsonMethods._
  3. import org.apache.spark.SparkContext
  4. //import org.apache.spark.api.java.JavaRDD;\
  5. import org.apache.spark.sql.Row;
  6. import org.apache.spark.sql.SparkSession;
  7. import org.apache.spark.sql.SQLContext
  8. import org.apache.spark.{SparkContext, SparkConf}
  9. //import org.apache.spark.api.java.JavaRDD;\
  10.  
  11. import org.apache.spark.sql.Row;
  12. import org.apache.spark.sql.SparkSession;
  13.  
  14. //sc.stop()
  // Spark setup: request 8g of memory per executor, then start the context.
  val conf = {
    val settings = new SparkConf()
    settings.set("spark.executor.memory", "8g")
  }
  val sc = new SparkContext(conf)

  // Input file; every line is later handed to a JSON parser on its own.
  val path = "reviews_devset.json"

  // Read the file as lines of text, asking for at least 100 partitions, and
  // mark the resulting RDD for caching.
  val rdd = {
    val lines = sc.textFile(path, minPartitions = 100)
    lines.cache()
  }
  21. //var ee = spark.read.json("reviews_devset.json", multiLine=True)
  22.  
  // Parses each input line as JSON and pairs the record's "category" with its
  // normalised "reviewText"; the result is collected to the driver as an array
  // of (category, text) tuples.
  val rdMapped = rdd.map { row =>
    val json_row = parse(row)

    // NOTE(review): `compact` presumably renders the value as JSON text, so a
    // string category would keep its surrounding double quotes — confirm that
    // quoted keys are what downstream code expects.
    val category = compact(json_row \ "category")

    // Lower-case the review, then replace punctuation/delimiter characters with
    // a space so a later split on " " yields bare words. The replaceAll has to
    // chain on the review string itself: closing the tuple before it leaves the
    // call dangling and an unmatched `)` behind.
    val reviewText = compact(json_row \ "reviewText")
      .toLowerCase()
      .replaceAll("[\\*{}\\[\\]()>#\\+:\\^&@<\\?;,\"!\\$=\\|\\.]", " ")

    (category, reviewText)
  }.collect() // NOTE(review): pulls every record onto the driver — confirm the data set fits in driver memory.
  30.  
  31.  
  32.  
  // Expands every (category, text) pair into one (category, word) pair per
  // word of the text. Words come from splitting on a single space; tokens of
  // length 0 or 1 (including the empty strings left by consecutive spaces)
  // are dropped.
  val groupedRDD = rdMapped.flatMap { pair =>
    val (category, text) = pair
    val tokens = text.split(" ")
    tokens.collect { case token if token.length() > 1 => (category, token) }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement