SHARE
TWEET

Untitled

a guest Jul 17th, 2017 52 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. {
  2.  "cells": [
  3.   {
  4.    "cell_type": "code",
  5.    "execution_count": 7,
  6.    "metadata": {
  7.     "collapsed": true
  8.    },
  9.    "outputs": [],
  10.    "source": [
  11.     "from pyspark.sql import functions as F\n",
  12.     "from pyspark.sql import types as T"
  13.    ]
  14.   },
  15.   {
  16.    "cell_type": "code",
  17.    "execution_count": 2,
  18.    "metadata": {
  19.     "collapsed": false
  20.    },
  21.    "outputs": [],
  22.    "source": [
  23.     "df = sqlContext.read.parquet(\"/guoda/data/idigbio-20170708T023306.parquet\")"
  24.    ]
  25.   },
  26.   {
  27.    "cell_type": "code",
  28.    "execution_count": 3,
  29.    "metadata": {
  30.     "collapsed": false
  31.    },
  32.    "outputs": [],
  33.    "source": [
  34.     "ic_cc = df.groupby([\"institutioncode\",\"collectioncode\"]).count()"
  35.    ]
  36.   },
  37.   {
  38.    "cell_type": "code",
  39.    "execution_count": 10,
  40.    "metadata": {
  41.     "collapsed": false
  42.    },
  43.    "outputs": [
  44.     {
  45.      "data": {
  46.       "text/plain": [
  47.        "128926"
  48.       ]
  49.      },
  50.      "execution_count": 10,
  51.      "metadata": {},
  52.      "output_type": "execute_result"
  53.     }
  54.    ],
  55.    "source": [
  56.     "ic_cc.count()"
  57.    ]
  58.   },
  59.   {
  60.    "cell_type": "code",
  61.    "execution_count": 12,
  62.    "metadata": {
  63.     "collapsed": false
  64.    },
  65.    "outputs": [
  66.     {
  67.      "data": {
  68.       "text/plain": [
  69.        "3573"
  70.       ]
  71.      },
  72.      "execution_count": 12,
  73.      "metadata": {},
  74.      "output_type": "execute_result"
  75.     }
  76.    ],
  77.    "source": [
  78.     "ic_cc[F.col(\"count\") > 1].count()"
  79.    ]
  80.   },
  81.   {
  82.    "cell_type": "code",
  83.    "execution_count": 32,
  84.    "metadata": {
  85.     "collapsed": false
  86.    },
  87.    "outputs": [],
  88.    "source": [
  89.     "h = ic_cc.select(\"count\").rdd.flatMap(lambda x: x).histogram(list(range(1,10)) + [10**x for x in range(1,7)])"
  90.    ]
  91.   },
  92.   {
  93.    "cell_type": "code",
  94.    "execution_count": 42,
  95.    "metadata": {
  96.     "collapsed": false
  97.    },
  98.    "outputs": [
  99.     {
  100.      "data": {
  101.       "text/plain": [
  102.        "([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000, 10000, 100000, 1000000],\n",
  103.        " [125353, 453, 154, 109, 94, 53, 48, 45, 31, 656, 547, 588, 572, 213])"
  104.       ]
  105.      },
  106.      "execution_count": 42,
  107.      "metadata": {},
  108.      "output_type": "execute_result"
  109.     }
  110.    ],
  111.    "source": [
  112.     "h"
  113.    ]
  114.   },
  115.   {
  116.    "cell_type": "code",
  117.    "execution_count": null,
  118.    "metadata": {
  119.     "collapsed": true
  120.    },
  121.    "outputs": [],
  122.    "source": []
  123.   }
  124.  ],
  125.  "metadata": {
  126.   "kernelspec": {
  127.    "display_name": "1 - PySpark Python3 Small",
  128.    "language": "python",
  129.    "name": "pyspark3-small"
  130.   },
  131.   "language_info": {
  132.    "codemirror_mode": {
  133.     "name": "ipython",
  134.     "version": 3
  135.    },
  136.    "file_extension": ".py",
  137.    "mimetype": "text/x-python",
  138.    "name": "python",
  139.    "nbconvert_exporter": "python",
  140.    "pygments_lexer": "ipython3",
  141.    "version": "3.5.2"
  142.   }
  143.  },
  144.  "nbformat": 4,
  145.  "nbformat_minor": 1
  146. }
RAW Paste Data
Top