Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Load the retail_db tables from MySQL over JDBC, register them as Spark SQL
temp tables, and print monthly revenue per department.

Uses the Spark 1.x API (SparkContext + SQLContext, registerTempTable), kept
for compatibility with the cluster this script targets.
"""
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName("PlaySQL")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcUrl = 'jdbc:mysql://ip-172-31-13-154.ec2.internal:3306/retail_db'

# Source tables; each is registered as a temp table under its own name so the
# aggregate query below can join them by name.
sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']

# Iterate the table names directly instead of indexing two parallel lists with
# range(0, 5); dataframes_list ends up as the list of loaded DataFrames, in
# the same order as sql_tables.
# NOTE(review): credentials are hard-coded — move to config/env for real use.
dataframes_list = []
for table in sql_tables:
    df = (sqlContext.read.format('jdbc')
          .options(url=jdbcUrl,
                   user='retail_dba',
                   password='itversity',
                   dbtable=table)
          .load())
    dataframes_list.append(df)
    df.registerTempTable(table)

# Monthly (yyyy-MM) revenue per department: departments -> categories ->
# products -> order_items -> orders, summing order_item_subtotal.
agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d join categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")

# collect() pulls all aggregated rows to the driver; fine here because the
# result is small (months x departments).
for res in agg_results.collect():
    print(res)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement