Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import org.json4s._
- import org.json4s.jackson.JsonMethods._
- import org.apache.spark.SparkContext
- //import org.apache.spark.api.java.JavaRDD;\
- import org.apache.spark.sql.Row;
- import org.apache.spark.sql.SparkSession;
- import org.apache.spark.sql.SQLContext
- import org.apache.spark.{SparkContext, SparkConf}
- //import org.apache.spark.api.java.JavaRDD;\
- import org.apache.spark.sql.Row;
- import org.apache.spark.sql.SparkSession;
// Stop any previously-running context first (uncomment when re-running in a shell).
//sc.stop()

// Executor memory only; master URL / app name are assumed to come from
// spark-submit or the enclosing shell — TODO confirm.
val conf = new SparkConf().set("spark.executor.memory", "8g")
val sc = new SparkContext(conf)

// Input: one JSON review object per line (textFile splits on newlines),
// read with 100 partitions and cached for reuse.
val path = "reviews_devset.json"
val rdd = sc.textFile(path, 100).cache()

// Parse each line and extract a (category, cleanedReviewText) pair.
// The review text is lower-cased and all listed punctuation/special
// characters are replaced with spaces so it can be split into tokens.
// FIX: in the original, .replaceAll was applied after the tuple was already
// closed (with an extra ')'), which did not compile; it now applies to the
// review text as intended.
val rdMapped = rdd.map { row =>
  val jsonRow = parse(row)
  val category = compact(jsonRow \ "category")
  val cleanedText = compact(jsonRow \ "reviewText")
    .toLowerCase()
    .replaceAll("[\\*{}\\[\\]()>#\\+:\\^&@<\\?;,\"!\\$=\\|\\.]", " ")
  (category, cleanedText)
}

// Explode each review into (category, token) pairs, keeping only tokens
// longer than one character.
// FIX: the original called .collect() before flatMap, pulling the entire
// dataset onto the driver and doing the work on a local Array (despite the
// name "groupedRDD"). Kept as lazy RDD transformations so the pipeline
// stays distributed; call .collect()/.take(n) on groupedRDD when results
// are actually needed.
val groupedRDD = rdMapped.flatMap { case (category, text) =>
  text.split(" ").filter(word => word.length > 1).map(token => (category, token))
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement