Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
- sqoop-import --connect jdbc:mysql://ms.itversity.com:3306/retail_db \
- --username retail_user \
- --password itversity \
- --table products \
- --fields-terminated-by "|" \
- --lines-terminated-by "\n" \
- --warehouse-dir "/user/krishnatejathatavarthi/teja_aruns/solutions02/" \
- --as-textfile
- hadoop fs -chmod -R 765 /user/krishnatejathatavarthi/teja_aruns/solutions02
- from pyspark import SparkContext,SparkConf,Row,HiveContext
- import avro.schema
- conf=SparkConf().setAppName("importnexport").setMaster("yarn-client")
- sc=SparkContext(conf=conf)
- sqlContext=HiveContext(sc)
- productsRDD=sc.textFile("/user/krishnatejathatavarthi/teja_aruns/solutions02/products")
- productsDF = productsRDD.map(lambda rec: Row(product_id=int(rec.split("|")[0]),product_category_id=int(rec.split("|")[1]),product_name=rec.split("|")[2],product_desc=rec.split("|")[3],product_price=float(rec.split("|")[4]),product_image=rec.split("|")[5])).toDF()
- productsDF.registerTempTable("products")
- sqlContext.setConf("spark.sql.shuffle.partitions","2")
- sqlResult=sqlContext.sql("select product_category_id,max(product_price) max_price, count(product_id) total_orders, avg(product_price) avg_price, min(product_price) min_price from products where product_price < 100 group by product_category_id order by product_category_id")
- sqlContext.setConf("spark.sql.avro.compression.codec","snappy")
- sqlResult.save("/user/krishnatejathatavarthi/teja_aruns/solutions02/solution","com.databricks.spark.avro")
Add Comment
Please, Sign In to add comment