Advertisement
Guest User

Untitled

a guest
Jun 28th, 2017
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.28 KB | None | 0 0
  1. {"cells":[{"cell_type":"code","source":["sc._jsc.hadoopConfiguration().set(\"fs.s3n.awsAccessKeyId\", \"\")\nsc._jsc.hadoopConfiguration().set(\"fs.s3n.awsSecretAccessKey\", \"\")\n\n# Read data from a table\ndf = sqlContext.read \\\n .format(\"com.databricks.spark.redshift\") \\\n .option(\"url\", \"jdbc:redshift://url:5439/probes?user=username&password=password\") \\\n .option(\"query\", \"select * from cloudtraxwebhook\") \\\n .option('forward_spark_s3_credentials',True) \\\n .option(\"tempdir\", \"s3n://ncompass-cloudtrax3/tmp/\") \\\n .load()\n \nprint(\"There are {} probes\".format(df.count()))"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["from pyspark.sql import Window\nwindow = Window.partitionBy(\"probe_requests_mac\").orderBy(\"probe_requests_first_seen\")\n\nfrom pyspark.sql.functions import lag\nmax_dwell = 20 * 60\nmin_dwell = 5 * 60\nprobes_lag = df.withColumn(\"prev_last_seen\", lag(df['probe_requests_last_seen']).over(window))\ndisplay(probes_lag)"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["from pyspark.sql.functions import when\nnew_sessions = probes_lag.withColumn(\"new_session\", when(probes_lag['prev_last_seen'] - probes_lag[\"probe_requests_last_seen\"] < max_dwell, 1).otherwise(0))\ndisplay(new_sessions)"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["from pyspark.sql.functions import sum, avg\n\nwindow = Window.partitionBy(\"probe_requests_mac\").orderBy(\"probe_requests_first_seen\")\nsessions = new_sessions.select(\"*\", sum(\"new_session\").over(window).alias(\"session_id\"))\ndisplay(sessions)"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["# count unique macs\ndisplay(sessions.groupBy(sessions['probe_requests_mac']).count().collect())"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":["time_interval = probes_lag.withColumn(\"time_interval\", probes_lag[\"probe_requests_last_seen\"] - probes_lag['prev_last_seen'])\ndisplay(time_interval.describe())"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":7}],"metadata":{"name":"read from redshift","notebookId":2075251966645031},"nbformat":4,"nbformat_minor":0}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement