Advertisement
Guest User

Untitled

a guest
Mar 19th, 2019
5,099
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.76 KB | None | 0 0
  1. // ===========================================================================
  2. // Welcome!
  3. //
  4. // Log in to your Snowflake instance, create a new worksheet, then
  5. // copy and paste the contents of this file into your worksheet.
  6. //
  7. // Make sure you're using the ACCOUNTADMIN role before we get started
  8. // (see the context in the upper right corner of your worksheet)
  9. //
  10. // Next, head over to the Partner Connect page (link in the top right of the snowflake toolbar).
  11. //
  12. // We're going to start by setting up a Talend Stitch account using the built in Snowflake connection.
  13. // This will give us everything we need to start securely ingesting data.
  14. // ===========================================================================
  15.  
  16.  
  17.  
  18.  
  19. // ===========================================================================
  20. // Next, let's manually setup PowerBI similarly to how Partner Connect set
  21. // Talend Stitch up for us before.
  22. // ===========================================================================
  23. // Create the DB, user, role, and warehouse that powerbi will use to securely connect
  24. CREATE DATABASE IF NOT EXISTS POWERBI;
  25. CREATE USER IF NOT EXISTS POWERBI_USER_ACCOUNT PASSWORD="my super cool password." MUST_CHANGE_PASSWORD=false; // use your own password, dummy
  26. CREATE ROLE IF NOT EXISTS POWERBI_ROLE;
  27. CREATE WAREHOUSE IF NOT EXISTS POWERBI_WH WAREHOUSE_SIZE=XSMALL;
  28.  
  29. // Attach permissions
  30. GRANT USAGE ON DATABASE POWERBI TO ROLE POWERBI_ROLE;
  31. GRANT USAGE ON WAREHOUSE POWERBI_WH TO ROLE POWERBI_ROLE;
  32. ALTER USER POWERBI_USER_ACCOUNT SET DEFAULT_ROLE=POWERBI_ROLE; // you must do this because the PowerBI connector doesn't let you specify a role.
  33. // ===========================================================================
  34.  
  35.  
  36.  
  37.  
  38. // ===========================================================================
  39. // Now, setup ingestion from the Stitch side of things and wait for data to flow.
  40. //
  41. // Let's explore the raw data.
  42. // ===========================================================================
  43. // Initial look at data
  44. SELECT * FROM PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA LIMIT 10;
  45.  
  46. // let's make the table easier to use in queries
  47. SET SPOTIFY = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA';
  48. SELECT * FROM TABLE($SPOTIFY) LIMIT 10;
  49.  
  50. // we can also set context to do this
  51. USE DATABASE PC_STITCH_DB;
  52. USE SCHEMA AWS_S3_CSV_BUCKET;
  53. SELECT * FROM SPOTIFY_DATA LIMIT 10;
  54.  
  55. // NOTE: I prefer the variable approach for worksheets
  56. // where you'll be in multiple databases. It prevents mistakes
  57. // when running different parts of your worksheet
  58.  
  59. // let's find out what the top songs are
  60. SELECT * FROM TABLE($SPOTIFY) ORDER BY STREAMS DESC LIMIT 10;
  61.  
  62. // let's see what date ranges we're working with
  63. SELECT
  64. MIN(DATE) AS startDate,
  65. MAX(DATE) AS endDate
  66. FROM
  67. TABLE($SPOTIFY);
  68.  
  69. // looks like we're essentially playing with top daily streams for all of 2017.
  70. // Let's see which artist had the most streams over the whole data set
  71. SELECT
  72. ARTIST, SUM(STREAMS) as streamCount
  73. FROM
  74. TABLE($SPOTIFY)
  75. GROUP BY
  76. ARTIST
  77. ORDER BY
  78. streamCount DESC
  79. LIMIT 10;
  80.  
  81. // hmm, these numbers seem high. Maybe we need to explore those region codes
  82. SELECT DISTINCT REGION FROM TABLE($SPOTIFY);
  83.  
  84. // maybe we should just use global? I think aggregating with regions AND global
  85. // is resulting in double counting. Lets find out using Drake's streams.
  86. SET globalDrakeSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION='global');
  87. SET sumOfRegionalDrakeStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION!='global');
  88. SELECT $globalDrakeSteams, $sumOfRegionalDrakeStreams;
  89.  
  90. // what about Kendrick Lamar ("DAMN." came out in the spring of 2017 and it is a masterpiece)
  91. SET globalKendrickSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION='global');
  92. SET sumOfRegionalKendrickStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION!='global');
  93. SELECT $globalKendrickSteams, $sumOfRegionalKendrickStreams;
  94.  
  95. // it's not exact, but close enough to convince me to filter by global from now on. (check with other artists if you like)
  96. // The easiest way to do that is with a view. Let's make one now
  97. CREATE VIEW IF NOT EXISTS
  98. PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL
  99. AS
  100. SELECT * FROM TABLE($SPOTIFY) WHERE REGION='global';
  101.  
  102. // Now grab the view name as a variable
  103. SET SPOTIFY_GLOBAL = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL';
  104.  
  105. // Cool, now we're working with some cleaner data.
  106. // Let's revisit our top artists.
  107. SELECT
  108. ARTIST, SUM(STREAMS) as streamCount
  109. FROM
  110. TABLE($SPOTIFY_GLOBAL)
  111. GROUP BY
  112. ARTIST
  113. ORDER BY
  114. streamCount DESC
  115. LIMIT 10;
  116.  
  117. // now top songs
  118. SELECT
  119. ARTIST, TRACK_NAME, SUM(STREAMS) as streamCount
  120. FROM
  121. TABLE($SPOTIFY_GLOBAL)
  122. GROUP BY
  123. ARTIST, TRACK_NAME
  124. ORDER BY
  125. streamCount DESC
  126. LIMIT 10;
  127.  
  128. // wow, Ed Sheeran was killing it in 2017. Good for him.
  129. //
  130. // I think we have enough context to maybe build a good BI table...
  131. // ===========================================================================
  132.  
  133.  
  134.  
  135.  
  136. // ===========================================================================
  137. // Now, let's send some clean, BI-ready data to our PowerBI DB.
  138. //
  139. // Our goal from here will be to transform the data in PC_STITCH_DB to a clean,
  140. // BI-ready form inside of the POWERBI database for analytics consumption
  141. // ===========================================================================
  142. // let's create our spotify schema in the powerbi db
  143. CREATE SCHEMA IF NOT EXISTS POWERBI.SPOTIFY;
  144.  
  145. // I think it would be fun to see the top 5 songs every
  146. // month for US streaming.
  147.  
  148. // start by creating a temp table with stream counts for each song by month.
  149. // We'll filter to only US data and we'll ignore the partial January 2018 data
  150. CREATE OR REPLACE TEMPORARY TABLE
  151. POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
  152. AS (
  153. SELECT
  154. ARTIST,
  155. TRACK_NAME,
  156. SUM(STREAMS) AS STREAM_COUNT,
  157. DATE_TRUNC(MONTH, DATE) AS MONTH
  158. FROM
  159. TABLE($SPOTIFY)
  160. WHERE
  161. REGION='us' AND YEAR(DATE) = 2017
  162. GROUP BY
  163. MONTH, ARTIST, TRACK_NAME
  164. );
  165.  
  166. // now we'll get song ranks within each month using a nested
  167. // query with a window function
  168. CREATE OR REPLACE TABLE
  169. POWERBI.SPOTIFY.US_MONTHLY_TOP_5_SONGS
  170. AS (
  171. SELECT * FROM
  172. (
  173. SELECT
  174. *,
  175. RANK() OVER (PARTITION BY MONTH ORDER BY STREAM_COUNT DESC) AS MONTHLY_RANK
  176. FROM
  177. POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
  178. )
  179. WHERE
  180. MONTHLY_RANK <= 5
  181. );
  182.  
  183. // now that we're finished with all that messiness, let's grant read access
  184. // on our new schema and table to our powerbi user
  185. GRANT USAGE ON SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
  186. GRANT SELECT ON ALL TABLES IN SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
  187.  
  188.  
  189. // Great job! You should be able to make some cool visuazliations in PBI now.
  190. // ===========================================================================
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement