import org.apache.hadoop.conf.Configuration
import org.apache.mahout.math.cf.SimilarityAnalysis
import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.bson.BSONObject
import com.mongodb.hadoop.MongoInputFormat
object SparkExample extends App {
  val mongoConfig = new Configuration()
  mongoConfig.set("mongo.input.uri", "mongodb://my_mongo_ip:27017/db.collection")

  val sparkConf = new SparkConf()
  // IndexedDatasetSpark.apply below takes its SparkContext as an implicit
  // parameter, so the context must be declared implicit here
  implicit val sc = new SparkContext("local", "SparkExample", sparkConf)

  // RDD of (document id, BSON document) pairs read through the Mongo Hadoop connector
  val documents = sc.newAPIHadoopRDD(
    mongoConfig,
    classOf[MongoInputFormat],
    classOf[Object],
    classOf[BSONObject]
  )
  // product_attribute_value arrives as a stringified array like [ "Blue Color" , "Wood Material"]:
  // strip the brackets/quotes, split on the " , " delimiter, and normalize each value
  val new_doc: RDD[(String, String)] = documents.map(
    doc1 => (
      doc1._2.get("product_id").toString,
      doc1._2.get("product_attribute_value").toString
        .replace("[ \"", "").replace("\"]", "").split("\" , \"")
        .map(value => value.toLowerCase.replace(" ", "-")).mkString(" ")
    )
  )
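
  // Illustration only (not in the original paste): the normalization above on a
  // literal that mirrors the connector's stringified array form
  val sampleRaw = "[ \"Blue Color\" , \"Wood Material\"]"
  val sampleTokens = sampleRaw
    .replace("[ \"", "").replace("\"]", "").split("\" , \"")
    .map(value => value.toLowerCase.replace(" ", "-")).mkString(" ")
  println(sampleTokens) // prints: blue-color wood-material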
  // This line previously errored: IndexedDatasetSpark.apply needs an implicit SparkContext (fixed above)
  val myIDs = IndexedDatasetSpark(new_doc)
  val simIDS = SimilarityAnalysis.rowSimilarityIDS(myIDs)
}
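
For a quick look at the result, rowSimilarityIDS returns another IndexedDataset; a minimal sanity-check sketch that could go just before the closing brace (matrix/nrow/ncol are Mahout accessors, the printout itself is illustrative):

  // dimensions of the row-similarity matrix (rows and columns are both product_ids)
  println(s"similarity matrix: ${simIDS.matrix.nrow} x ${simIDS.matrix.ncol}")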