from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row

# Set the application name and create the SparkContext and SQLContext objects.
conf = SparkConf().setAppName("SparkSQL")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# The URL where the MySQL database is hosted.
jdbcUrl = 'jdbcUrl'
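# Note: the value above is a placeholder. A MySQL JDBC URL typically looks like
# 'jdbc:mysql://<host>:3306/<database>'; depending on the cluster setup the JDBC
# source may also need a driver option (e.g. driver='com.mysql.jdbc.Driver') and
# the MySQL connector JAR on the classpath.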

# The list that will hold the DataFrames once they are created
dataframes_list = ['df_departments', 'df_categories', 'df_products', 'df_orders', 'df_order_items']
# The list of MySQL tables from which the data has to be loaded
sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']

# Load each table over JDBC into a DataFrame and register it as a temp table
# so it can be queried with SQL.
for i in range(len(sql_tables)):
    dataframes_list[i] = sqlContext.read.format('jdbc').options(url=jdbcUrl, user='user', password='password', dbtable=sql_tables[i]).load()
    dataframes_list[i].registerTempTable(sql_tables[i])
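
# Optional sanity check (a sketch, not part of the original paste): confirm that
# each registered temp table is queryable before running the aggregation.
for table in sql_tables:
    count = sqlContext.sql("SELECT COUNT(*) FROM " + table).collect()[0][0]
    print("%s: %d rows" % (table, count))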

# Execute the query that returns the revenue generated by each department per month.
agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d JOIN categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")

for res in agg_results.collect():
    print(res)
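
# For reference, the same aggregation expressed with the DataFrame API instead of
# SQL (a sketch, not part of the original paste). It assumes dataframes_list holds
# the loaded frames in the same order as sql_tables.
from pyspark.sql import functions as F

departments, categories, products, orders, order_items = dataframes_list

revenue_by_dept = (departments
    .join(categories, departments.department_id == categories.category_department_id)
    .join(products, categories.category_id == products.product_category_id)
    .join(order_items, products.product_id == order_items.order_item_product_id)
    .join(orders, orders.order_id == order_items.order_item_order_id)
    .groupBy(F.substring(orders.order_date, 1, 7).alias('order_month'),
             departments.department_name)
    .agg(F.sum(order_items.order_item_subtotal))
    .orderBy('order_month', 'department_name'))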

# Sample results from the collect() loop above:
'''
Row(order_month=u'2013-07', department_name=u'Golf', _c2=101087.26000000005)
Row(order_month=u'2013-07', department_name=u'Outdoors', _c2=20877.519999999993)
Row(order_month=u'2013-07', department_name=u'Testing Merge', _c2=5622.149999999998)
Row(order_month=u'2013-07', department_name=u'data', _c2=163108.2700000002)
Row(order_month=u'2013-07', department_name=u'fanshoppy', _c2=389162.09)
Row(order_month=u'2013-07', department_name=u'footwear', _c2=84624.92000000001)
Row(order_month=u'2013-08', department_name=u'Golf', _c2=391535.7299999997)
Row(order_month=u'2013-08', department_name=u'Outdoors', _c2=82849.71999999997)
Row(order_month=u'2013-08', department_name=u'Testing Merge', _c2=23719.610000000008)
Row(order_month=u'2013-08', department_name=u'data', _c2=598981.2099999998)
Row(order_month=u'2013-08', department_name=u'fanshoppy', _c2=1410128.6399999997)
Row(order_month=u'2013-08', department_name=u'footwear', _c2=321443.78999999986)
Row(order_month=u'2013-09', department_name=u'Golf', _c2=388137.44)
Row(order_month=u'2013-09', department_name=u'Outdoors', _c2=84080.7)
Row(order_month=u'2013-09', department_name=u'Testing Merge', _c2=24377.450000000004)
Row(order_month=u'2013-09', department_name=u'data', _c2=615739.5500000002)
Row(order_month=u'2013-09', department_name=u'fanshoppy', _c2=1452927.6799999995)
Row(order_month=u'2013-09', department_name=u'footwear', _c2=369264.4499999999)

# Approximate execution times for the same query on different engines:
1. MySQL engine: 0.49 seconds
2. Hive with the MapReduce execution engine (the query runs as MapReduce jobs in the background): 52.085 seconds
3. Hive with Tez as the execution engine: 25.63 seconds
4. Spark SQL with the tables loaded as DataFrames, using the Catalyst optimizer to plan the query: 4.2 seconds
'''
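
# A minimal sketch (not part of the original paste) of how the Spark SQL timing
# above could be reproduced: Spark evaluates lazily, so the query only runs when
# an action such as collect() is called, and that call is what gets timed.
import time

start = time.time()
rows = agg_results.collect()
elapsed = time.time() - start
print("Spark SQL query returned %d rows in %.2f seconds" % (len(rows), elapsed))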