Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Import the `orders` table from MySQL into HDFS as a plain TEXT file.
# Fields are tab-separated, records newline-terminated; single mapper (-m 1)
# so the output is one file and no split-by column is required.
# NOTE(review): --password on the command line is visible in `ps` and shell
# history; prefer -P (prompt) or --password-file in anything beyond a sandbox.
sqoop-import --connect jdbc:mysql://quickstart.cloudera:3306/retail_db \
  --username retail_dba \
  --password cloudera \
  --table orders \
  --fields-terminated-by "\t" \
  --lines-terminated-by "\n" \
  --as-textfile \
  --target-dir /user/cloudera/teja_arun/file-formats/orders-text \
  -m 1
# Import the same `orders` table as an Avro data file (--as-avrodatafile).
# Avro stores the schema with the data, so no field/line terminators apply.
# Single mapper keeps the output to one .avro file.
# NOTE(review): plain-text --password; prefer -P or --password-file outside a sandbox.
sqoop-import --connect jdbc:mysql://quickstart.cloudera:3306/retail_db \
  --username retail_dba \
  --password cloudera \
  --table orders \
  --target-dir /user/cloudera/teja_arun/file-formats/orders-avros \
  --as-avrodatafile \
  -m 1
# Attempted Parquet import of `orders` (--as-parquetfile).
# NOT WORKING on this quickstart VM: Sqoop's Parquet support goes through the
# Kite SDK, and a Kite SDK issue breaks the import here (the Parquet data used
# later is instead produced by the Spark script below).
# NOTE(review): plain-text --password; prefer -P or --password-file outside a sandbox.
sqoop-import --connect jdbc:mysql://quickstart.cloudera:3306/retail_db \
  --username retail_dba \
  --password cloudera \
  --table orders \
  --target-dir /user/cloudera/teja_arun/file-formats/orders-parquet \
  --as-parquetfile \
  -m 1
# PySpark (Spark 1.x shell: `sqlContext` is predefined) walkthrough converting
# the Sqoop-imported Avro data between file formats and compression codecs.
# All paths are HDFS directories under /user/cloudera/teja_arun/file-formats.

# Load the Avro import produced by Sqoop, using the Databricks spark-avro package.
avroData = sqlContext.read.load("/user/cloudera/teja_arun/file-formats/orders-avros", "com.databricks.spark.avro")

# Write the same data as snappy-compressed Parquet (codec set via SQLConf).
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")
avroData.write.parquet("/user/cloudera/teja_arun/file-formats/orders-parquet-snappy")

# Tab-delimited text, gzip-compressed. In Spark 1.x, .map on a DataFrame
# implicitly operates on the underlying RDD of Rows.
avroData.map(lambda rec: (str(rec[0])+"\t"+str(rec[1])+"\t"+str(rec[2])+"\t"+str(rec[3]))).saveAsTextFile("/user/cloudera/teja_arun/file-formats/order-text-gzip", "org.apache.hadoop.io.compress.GzipCodec")

# Sequence files require (key, value) pairs: key = first column, value = the
# tab-delimited record.
avroData.map(lambda rec: (rec[0], (str(rec[0])+"\t"+str(rec[1])+"\t"+str(rec[2])+"\t"+str(rec[3])))).saveAsSequenceFile("/user/cloudera/teja_arun/file-formats/order-sequence")

# Tab-delimited text again, this time snappy-compressed.
avroData.map(lambda rec: (str(rec[0])+"\t"+str(rec[1])+"\t"+str(rec[2])+"\t"+str(rec[3]))).saveAsTextFile("/user/cloudera/teja_arun/file-formats/order-text-snappy", "org.apache.hadoop.io.compress.SnappyCodec")

# Round-trip: read the snappy Parquet back, rewrite it uncompressed.
parquetData = sqlContext.read.parquet("/user/cloudera/teja_arun/file-formats/orders-parquet-snappy")
sqlContext.setConf("spark.sql.parquet.compression.codec", "uncompressed")
parquetData.write.parquet("/user/cloudera/teja_arun/file-formats/orders-parquet-uncompress")

# Write snappy-compressed Avro (codec set via spark-avro's SQLConf key).
sqlContext.setConf("spark.sql.avro.compression.codec", "snappy")
parquetData.write.save("/user/cloudera/teja_arun/file-formats/orders-avro-snappy", "com.databricks.spark.avro")

# Read the snappy Avro back and emit JSON, first uncompressed then gzipped.
avroSnappyData = sqlContext.read.load("/user/cloudera/teja_arun/file-formats/orders-avro-snappy", "com.databricks.spark.avro")
avroSnappyData.toJSON().saveAsTextFile("/user/cloudera/teja_arun/file-formats/orders-json-nocompress")
avroSnappyData.toJSON().saveAsTextFile("/user/cloudera/teja_arun/file-formats/orders-json-gzip", "org.apache.hadoop.io.compress.GzipCodec")

# Finally, read the uncompressed JSON and write comma-delimited text, gzipped.
jsonData = sqlContext.read.json("/user/cloudera/teja_arun/file-formats/orders-json-nocompress")
jsonData.map(lambda rec: (str(rec[0])+","+str(rec[1])+","+str(rec[2])+","+str(rec[3]))).saveAsTextFile("/user/cloudera/teja_arun/file-formats/orders-csv-gzip", "org.apache.hadoop.io.compress.GzipCodec")
Add Comment
Please, Sign In to add comment