Guest User

Untitled

a guest
Sep 8th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.22 KB | None | 0 0
  # Import the MySQL retail_db.products table into HDFS as '|'-delimited text
  # under /user/cloudera/products.
  # NOTE(review): --password on the command line leaves the credential in shell
  # history and the process list; consider -P or --password-file instead.
  sqoop-import --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
    --username retail_dba --password cloudera --table products \
    --target-dir "/user/cloudera/products" \
    --fields-terminated-by '|'

  # Create only the parent directory. Pre-creating problem2/products would make
  # the move below fail, because `hadoop fs -mv` refuses to replace a
  # destination path that already exists.
  # Absolute paths are used so these commands agree with the sqoop --target-dir
  # above regardless of which user's HDFS home directory is current.
  hadoop fs -mkdir -p /user/cloudera/problem2

  # Moving into the existing parent yields /user/cloudera/problem2/products.
  hadoop fs -mv /user/cloudera/products /user/cloudera/problem2/

  # rwx for owner, rw- for group, r-x for others. No -R is given, so this
  # changes the directory itself and not the files inside it.
  hadoop fs -chmod 765 /user/cloudera/problem2/products
  12.  
  13.  
  # PySpark shell session: `sc` and `sqlContext` are not defined here and are
  # expected to be supplied by the shell.
  from pyspark import Row

  def _to_product_row(rec):
      """Build a Row from one '|'-delimited products record, splitting it once."""
      fields = rec.split("|")
      return Row(
          product_id=int(fields[0]),
          product_cat_id=int(fields[1]),
          product_name=fields[2],
          product_desc=fields[3],
          product_price=float(fields[4]),
          product_image=fields[5],
      )

  productsData = sc.textFile("/user/cloudera/problem2/products")

  # Keep only records whose price (fifth field) is below 100.
  # The original assigned this to `productsFilterd` but read `productsFiltered`
  # on the next step, which raised NameError.
  productsFiltered = productsData.filter(
      lambda rec: float(rec.split("|")[4]) < 100
  )

  products = productsFiltered.map(_to_product_row).toDF()

  products.registerTempTable("products")

  # Two shuffle partitions, so the aggregation below writes at most two files.
  sqlContext.setConf("spark.sql.shuffle.partitions", "2")

  # Per-category price statistics.
  # - avg(product_price) is aliased so the output field has a stable name that
  #   is a valid Avro identifier, instead of an auto-generated one.
  # - ORDER BY gives a total ordering by category; SORT BY would only order
  #   rows within each shuffle partition.
  sqlResult = sqlContext.sql(
      "select product_cat_id,"
      "max(product_price) max_price,"
      "count(product_id) total_products, "
      "min(product_price) min_price,"
      "avg(product_price) avg_price "
      "from products group by product_cat_id order by product_cat_id"
  )

  sqlContext.setConf("spark.sql.avro.compression.codec", "snappy")

  # Write the summary as snappy-compressed Avro via the spark-avro package.
  sqlResult.write.format("com.databricks.spark.avro").save(
      "/user/cloudera/problem2/products/result-sql"
  )
Add Comment
Please, Sign In to add comment