Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- scala> import scala.xml._
- scala> val strxml = <employees>
- | <employee><id>1</id><name>chris</name></employee>
- | <employee><id>2</id><name>adam</name></employee>
- | <employee><id>3</id><name>karl</name></employee>
- | </employees>
- strxml: scala.xml.Elem =
- <employees>
- <employee><id>1</id><name>chris</name></employee>
- <employee><id>2</id><name>adam</name></employee>
- <employee><id>3</id><name>karl</name></employee>
- </employees>
- scala> val t = strxml.flatMap(line => line \ "employee")
- t: scala.xml.NodeSeq = NodeSeq(<employee><id>1</id><name>chris</name></employee>, <employee><id>2</id><name>adam</name></employee>, <employee><id>3</id><name>karl</name></employee>)
- scala> t.map(l => (l \ "id").text + "@" + (l \ "name").text).foreach(println)
- 1@chris
- 2@adam
- 3@karl
- scala> val filexml = sc.wholeTextFiles("file:///home/cloudera/test*")
- filexml: org.apache.spark.rdd.RDD[(String, String)] = file:///home/cloudera/test* MapPartitionsRDD[66] at wholeTextFiles at <console>:30
- scala> val lines = filexml.map(line => XML.loadString(line._2))
- lines: org.apache.spark.rdd.RDD[scala.xml.Elem] = MapPartitionsRDD[89] at map at <console>:32
- scala> val ft = lines.map(l => l \ "employee")
- ft: org.apache.spark.rdd.RDD[scala.xml.NodeSeq] = MapPartitionsRDD[99] at map at <console>:34
- scala> ft.map(l => (l \ "id").text + "@" + (l \ "name").text).foreach(println)
- Exception in task 0.0 in stage 63.0 (TID 63)
- org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 1; Content is not allowed in prolog
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement