Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Round-trip: read demo.xml with databricks spark-xml (empty strings -> null),
// then write the DataFrame back out as XML.
val sparkSession = SparkSession.builder.master("local[*]").getOrCreate()
val xmldf = sparkSession.read.format(SEAConstant.STR_IMPORT_SPARK_DATA_BRICK_XML)
  .option(SEAConstant.STR_ROW_TAG, "Employee")
  .option("nullValue", "")
  .load("demo.xml")
val columnNames = xmldf.columns.toSeq
val sdf = xmldf.select(columnNames.map(c => xmldf.col(c)): _*)
// FIX: tell the writer how attribute columns and the element-text column are
// marked. Without "attributePrefix"/"valueTag" this write dies with the
// NullPointerException in StaxXmlGenerator.scala:131 (see stack trace below)
// when rows contain null-valued attribute fields such as _id/_masterRef.
sdf.write.format("com.databricks.spark.xml")
  .option("rootTag", "Company")
  .option("rowTag", "Employee")
  .option("attributePrefix", "_Att")
  .option("valueTag", "_VALUE")
  .save("Rel")
- <?xml version="1.0"?>
- <Company>
- <Employee id="id47" masterRef="#id53" revision="" nomenclature="">
- <ApplicationRef version="J.0" application="Teamcenter"></ApplicationRef>
- <UserData id="id52">
- <UserValue valueRef="#id4" value="" title="_CONFIG_CONTEXT"></UserValue></UserData></Employee>
- <Employee id="id47" masterRef="#id53" revision="" nomenclature="">
- <ApplicationRef version="B.0" application="Teamcenter"></ApplicationRef>
- <UserData id="id63">
- <UserValue valueRef="#id5" value="" title="_CONFIG_CONTEXT"></UserValue></UserData></Employee>
- </Company>
- 19/06/25 14:45:14 ERROR Utils: Aborting task
- java.lang.NullPointerException
- at com.databricks.spark.xml.parsers.StaxXmlGenerator$$anonfun$apply$4.apply(StaxXmlGenerator.scala:131)
- at com.databricks.spark.xml.parsers.StaxXmlGenerator$$anonfun$apply$4.apply(StaxXmlGenerator.scala:129)
- at scala.collection.immutable.List.foreach(List.scala:383)
- at com.databricks.spark.xml.parsers.StaxXmlGenerator$.apply(StaxXmlGenerator.scala:129)
- at com.databricks.spark.xml.util.XmlFile$$anonfun$1$$anon$1.next(XmlFile.scala:108)
- at com.databricks.spark.xml.util.XmlFile$$anonfun$1$$anon$1.next(XmlFile.scala:96)
- at scala.collection.Iterator$$anon$11.next(Iterator.scala:363)
- at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:125)
- at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:123)
- at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1414)
- at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:135)
- at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
- at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
- at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
- at org.apache.spark.scheduler.Task.run(Task.scala:109)
- at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
- at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
- at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
- at java.lang.Thread.run(Thread.java:748)
- 19/06/25 14:45:14 ERROR SparkHadoopWriter: Task attempt_20190625144513_0012_m_000000_0 aborted.
- 19/06/25 14:45:14 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
- org.apache.spark.SparkException: Task failed while writing rows
- at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:151)
- at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
- at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
- at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
- at org.apache.spark.scheduler.Task.run(Task.scala:109)
- at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
- at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
- at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
- at java.lang.Thread.run(Thread.java:748)
- <dependency>
- <groupId>com.databricks</groupId>
- <artifactId>spark-xml_2.11</artifactId>
- <version>0.4.1</version>
- </dependency>
package com.examples

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.log4j.Level
import org.apache.spark.sql.{SQLContext, SparkSession}

/**
 * Demonstrates an XML round trip with databricks spark-xml:
 * writes a sample document to xmltest.xml, loads it with rowTag "Employee"
 * (empty attribute strings mapped to null), saves the resulting DataFrame
 * back out as XML, then reads the saved copy and displays it.
 *
 * Created by Ram Ghadiyaram
 */
object SparkXmlTest {
  // Quiet Spark's own logging so the demo output is readable.
  org.apache.log4j.Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local")
      .appName(this.getClass.getName)
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    // NOTE(review): the original also built an unused, deprecated
    // `new SQLContext(sc)` here; dropped because nothing referenced it.

    // Sample input. Only <Employee> elements become rows (rowTag below);
    // the <Measures> elements are skipped by the reader.
    val str =
      """
        |<?xml version="1.0"?>
        |<Company>
        | <Employee id="1">
        | <Email>tp@xyz.com</Email>
        | <UserData id="id32" type="AttributesInContext">
        | <UserValue value="7in" title="Height"></UserValue>
        | <UserValue value="23lb" title="Weight"></UserValue>
        |</UserData>
        | </Employee>
        | <Measures id="1">
        | <Email>newdata@rty.com</Email>
        | <UserData id="id32" type="SitesInContext">
        |</UserData>
        | </Measures>
        | <Employee id="2">
        | <Email>tp@xyz.com</Email>
        | <UserData id="id33" type="AttributesInContext">
        | <UserValue value="7in" title="Height"></UserValue>
        | <UserValue value="34lb" title="Weight"></UserValue>
        |</UserData>
        | </Employee>
        | <Measures id="2">
        | <Email>nextrow@rty.com</Email>
        | <UserData id="id35" type="SitesInContext">
        |</UserData>
        | </Measures>
        | <Employee id="3">
        | <Email>tp@xyz.com</Email>
        | <UserData id="id34" type="AttributesInContext">
        | <UserValue value="7in" title="Height"></UserValue>
        | <UserValue value="" title="Weight"></UserValue>
        |</UserData>
        | </Employee>
        |</Company>
      """.stripMargin

    println("save to file ")
    val f = new File("xmltest.xml")
    FileUtils.writeStringToFile(f, str)

    val xmldf = spark.read.format("com.databricks.spark.xml")
      .option("rootTag", "Company")
      .option("rowTag", "Employee")
      .option("nullValue", "")
      .load(f.getAbsolutePath)

    // Re-select every column explicitly (reproduces the projection used in
    // the question's code before writing back out).
    val columnNames = xmldf.columns.toSeq
    val sdf = xmldf.select(columnNames.map(c => xmldf.col(c)): _*)

    // FIX: save under the project directory ("./src/...") instead of the
    // filesystem root ("/src/...") used originally — the absolute path is
    // inconsistent with the later versions in this paste and normally fails
    // with a permission error outside a container.
    sdf.write.format("com.databricks.spark.xml")
      .option("rootTag", "Company")
      .option("rowTag", "Employee")
      .mode("overwrite")
      .save("./src/main/resources/Rel1")

    println("read back from saved file ....")
    val readbackdf = spark.read.format("com.databricks.spark.xml")
      .option("rootTag", "Company")
      .option("rowTag", "Employee")
      .option("nullValue", "")
      .load("./src/main/resources/Rel1")
    readbackdf.show(false)
  }
}
- save to file
- read back from saved file ....
- +----------+------------------------------------------------------------------------------+---+
- |Email |UserData |_id|
- +----------+------------------------------------------------------------------------------+---+
- |tp@xyz.com|[WrappedArray([null,Height,7in], [null,Weight,23lb]),id32,AttributesInContext]|1 |
- |tp@xyz.com|[WrappedArray([null,Height,7in], [null,Weight,34lb]),id33,AttributesInContext]|2 |
- |tp@xyz.com|[WrappedArray([null,Height,7in], [null,Weight,null]),id34,AttributesInContext]|3 |
- +----------+------------------------------------------------------------------------------+---+
- .option("attributePrefix", "_Att")
- .option("valueTag", "_VALUE")
package com.examples

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{SQLContext, SparkSession}

/**
 * Working version of the round trip for the question's real data:
 * Employee rows whose id/masterRef/revision/nomenclature are XML
 * *attributes*. The write succeeds here because "attributePrefix" and
 * "valueTag" are supplied to the writer, which is the fix for the
 * NullPointerException in StaxXmlGenerator shown earlier in this paste.
 *
 * Created by Ram Ghadiyaram
 */
object SparkXmlTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local")
      .appName(this.getClass.getName)
      .getOrCreate()
    // NOTE(review): removed the unused `sc`/`new SQLContext(sc)` locals
    // (deprecated API, never referenced) and the ~35 lines of commented-out
    // sample XML from the first experiment — see the earlier version above.

    // Two Employee rows; empty attribute values (revision, nomenclature,
    // value) are turned into null by the "nullValue" read option below.
    val str =
      """
        |<Company>
        | <Employee id="id47" masterRef="#id53" revision="" nomenclature="">
        |<ApplicationRef version="J.0" application="Teamcenter"></ApplicationRef>
        |<UserData id="id52">
        |<UserValue valueRef="#id4" value="" title="_CONFIG_CONTEXT"></UserValue></UserData></Employee>
        |<Employee id="id47" masterRef="#id53" revision="" nomenclature="">
        |<ApplicationRef version="B.0" application="Teamcenter"></ApplicationRef>
        |<UserData id="id63">
        |<UserValue valueRef="#id5" value="" title="_CONFIG_CONTEXT"></UserValue></UserData></Employee>
        |</Company>
      """.stripMargin

    println("save to file ")
    val f = new File("xmltest.xml")
    FileUtils.writeStringToFile(f, str)

    val xmldf = spark.read.format("com.databricks.spark.xml")
      .option("rootTag", "Company")
      .option("rowTag", "Employee")
      .option("nullValue", "")
      .load(f.getAbsolutePath)

    val columnNames = xmldf.columns.toSeq
    val sdf = xmldf.select(columnNames.map(c => xmldf.col(c)): _*)

    // The two writer options below are the actual fix: they tell the
    // generator which struct fields are attributes ("_Att...") and which
    // field carries element text ("_VALUE"), so null attribute values no
    // longer crash the write.
    sdf.write.format("com.databricks.spark.xml")
      .option("rootTag", "Company")
      .option("rowTag", "Employee")
      .option("attributePrefix", "_Att")
      .option("valueTag", "_VALUE")
      .mode("overwrite")
      .save("./src/main/resources/Rel1")

    println("read back from saved file ....")
    val readbackdf = spark.read.format("com.databricks.spark.xml")
      .option("rootTag", "Company")
      .option("rowTag", "Employee")
      .option("nullValue", "")
      .load("./src/main/resources/Rel1")
    readbackdf.show(false)
  }
}
package com.examples

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{SQLContext, SparkSession}

/**
 * XML round trip with databricks spark-xml: writes the sample document to
 * xmltest.xml, reads it back row-by-Employee, saves the DataFrame as XML
 * under ./src/main/resources/Rel1, then reloads and prints that output.
 *
 * Created by Ram Ghadiyaram
 */
object SparkXmlTest {

  def main(args: Array[String]): Unit = {
    val session =
      SparkSession.builder
        .master("local")
        .appName(this.getClass.getName)
        .getOrCreate()
    // Kept for parity with the original example (unused otherwise).
    val context = session.sparkContext
    val legacySql = new SQLContext(context)

    // Sample payload: two Employee elements whose ids/refs live in
    // XML attributes, with empty strings for some attribute values.
    val sampleXml =
      """
        |<Company>
        | <Employee id="id47" masterRef="#id53" revision="" nomenclature="">
        |<ApplicationRef version="J.0" application="Teamcenter"></ApplicationRef>
        |<UserData id="id52">
        |<UserValue valueRef="#id4" value="" title="_CONFIG_CONTEXT"></UserValue></UserData></Employee>
        |<Employee id="id47" masterRef="#id53" revision="" nomenclature="">
        |<ApplicationRef version="B.0" application="Teamcenter"></ApplicationRef>
        |<UserData id="id63">
        |<UserValue valueRef="#id5" value="" title="_CONFIG_CONTEXT"></UserValue></UserData></Employee>
        |</Company>
      """.stripMargin

    println("save to file ")
    val xmlFile = new File("xmltest.xml")
    FileUtils.writeStringToFile(xmlFile, sampleXml)

    // Read: every Employee element becomes one row; "" becomes null.
    val reader =
      session.read
        .format("com.databricks.spark.xml")
        .option("rootTag", "Company")
        .option("rowTag", "Employee")
        .option("nullValue", "")
    val employees = reader.load(xmlFile.getAbsolutePath)

    // Explicit projection of all columns before the write-back.
    val projected = employees.select(employees.columns.toSeq.map(employees.col): _*)

    // Write: attributePrefix/valueTag mark attribute vs. text fields so the
    // XML generator can serialize rows that contain null attribute values.
    projected.write
      .format("com.databricks.spark.xml")
      .option("rootTag", "Company")
      .option("rowTag", "Employee")
      .option("attributePrefix", "_Att")
      .option("valueTag", "_VALUE")
      .mode("overwrite")
      .save("./src/main/resources/Rel1")

    println("read back from saved file ....")
    val roundTripped =
      session.read
        .format("com.databricks.spark.xml")
        .option("rootTag", "Company")
        .option("rowTag", "Employee")
        .option("nullValue", "")
        .load("./src/main/resources/Rel1")
    roundTripped.show(false)
  }
}
- save to file
- read back from saved file ....
- +-----------------+-------------------------------+----+----------+
- |ApplicationRef |UserData |_id |_masterRef|
- +-----------------+-------------------------------+----+----------+
- |[Teamcenter, J.0]|[[_CONFIG_CONTEXT, #id4], id52]|id47|#id53 |
- |[Teamcenter, B.0]|[[_CONFIG_CONTEXT, #id5], id63]|id47|#id53 |
- +-----------------+-------------------------------+----+----------+
// Same fix applied directly to the original DataFrame: declaring
// attributePrefix and valueTag lets the XML writer serialize rows whose
// attribute fields are null.
xmldf.write
  .format("com.databricks.spark.xml")
  .option("rootTag", "Company")
  .option("rowTag", "Employee")
  .option("attributePrefix", "_Att")
  .option("valueTag", "_VALUE")
  .save("Rel")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement