Guest User

Untitled

a guest
Apr 2nd, 2018
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.33 KB | None | 0 0
  1. sqoop-import --connect jdbc:mysql://quickstart.cloudera:3306/retail_db \
  2. --username retail_dba \
  3. --password cloudera \
  4. --table products \
  5. --fields-terminated-by "|" \
  6. --lines-terminated-by "\n" \
  7. --target-dir /user/cloudera/teja_arun/ep02/products \
  8. --as-textfile \
  9. -m 1
  10.  
  11.  
  12. from pyspark import Row,HiveContext,SparkContext,SparkConf
  13.  
  14. conf=SparkConf().setAppName("ep02").setMaster("yarn-client")
  15. sc=SparkContext(conf=conf)
  16. sqlContext=HiveContext(sc)
  17.  
  18.  
  19. productsRDD=sc.textFile("/user/cloudera/teja_arun/ep02/products")
  20.  
  21. productsDF=productsRDD.map(lambda rec: Row(product_id=int(rec.split("|")[0]),product_category_id=int(rec.split("|")[1]),product_name=rec.split("|")[2],product_desc=rec.split("|")[3],product_price=float(rec.split("|")[4]),product_image=rec.split("|")[5])).toDF()
  22.  
  23. productsDF.registerTempTable("products")
  24.  
  25. sqlContext.setConf("spark.sql.shuffle.partitions","4")
  26.  
  27. sqlResult=sqlContext.sql("select product_category_id,max(product_price) max_price,min(product_price) min_price, avg(product_price) avg_price, count(product_id) total_products from products where product_price < 100 group by product_category_id order by product_category_id")
  28.  
  29.  
  30.  
  31.  
  32. sqlContext.setConf("spark.sql.avro.compression.codec","org.apache.hadoop.io.compress.SnappyCodec")
  33. sqlResult.write.save("/user/cloudera/teja_arun/ep02/solutions/products_price","com.databricks.spark.avro")
Add Comment
Please, Sign In to add comment