Advertisement
Not a member of Pastebin yet?
Sign Up —
it unlocks many cool features!
- // SD1 and SD2
- val text = sc.textFile("/1.txt")
- val hBV = text.map(a=>a.toDouble)
- val hbv1 p pairHBV.map(a=>(a._2, a.2)
- val pairHBV = hBV.zipWithIndex
- import org.apache.spark.rdd.PairRDDFunctions
- val tmp1 = pairHBV.map(a => (a._2, a._1))
- val tmp2 = pairHBV.map(a => (a._2 + 1, a._1))
- val hbvBase = tmp1.join(tmp2)
- val hbvBase1 = hbvBase.map(a => a._2)
- val h = hbvBase1.map(a=>(1, a._2-a._1, (a._2- a._1) * (a._2-a._1))).reduce((a, b) => (a._1+b._1,a._2+b._2,a._3+b._3))
- (h._2-h._3)*(h._2-h._3) / h._1
- // mapPartition example 0
- val parallel = sc.parallelize(1 to 9, 3)
- parallel.mapPartitions( x => List(x.next).iterator).collect
- // res383: Array[Int] = Array(1, 4, 7)
- // compare to the same, but with default parallelize
- val parallel = sc.parallelize(1 to 9)
- parallel.mapPartitions( x => List(x.next).iterator).collect
- // res384: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8)
- // mapPartition example 1
- val a = sc.parallelize(1 to 9, 3)
- def myfunc[T](iter: Iterator[T]) : Iterator[(T, T)] = {
- var res = List[(T, T)]()
- var pre = iter.next
- while (iter.hasNext)
- {
- val cur = iter.next;
- res .::= (pre, cur)
- pre = cur;
- }
- res.iterator
- }
- a.mapPartitions(myfunc).collect
- //res0: Array[(Int, Int)] = Array((2,3), (1,2), (5,6), (4,5), (8,9), (7,8))
- // mapPartition example 2
- val z = sc.parallelize(List("a","b","c","d","e","f"),2)
- //lets first print out the contents of the RDD with partition labels
- def myfunc(index: Int, iter: Iterator[(String)]) : Iterator[String] = {
- iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
- }
- z.mapPartitionsWithIndex(myfunc).collect
Advertisement
Add Comment
Please sign in to add a comment
Advertisement