Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import math, time\n",
    "from dask.distributed import Client, wait"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from distributed.client import default_client\n",
    "import pandas as pd\n",
    "\n",
    "def run(func, client=None):\n",
    "    '''Time every stage of a benchmark generator; return one row per stage.\n",
    "\n",
    "    ``func(n)`` must return a generator that yields a\n",
    "    ``(name, unit, numerator)`` tuple immediately *before* the work of each\n",
    "    stage.  The code that runs between two consecutive yields (or between\n",
    "    the last yield and the end of the generator) is what gets timed for the\n",
    "    stage announced by the earlier yield.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    func : callable\n",
    "        Generator function called with ``n``, the sum of the values\n",
    "        returned by ``client.ncores()``.\n",
    "    client : optional\n",
    "        Object providing ``restart()`` and ``ncores()``.  Falls back to\n",
    "        ``default_client()`` when omitted.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns ``name``, ``duration`` (seconds), ``unit`` (with ``/s``\n",
    "        appended), ``rate`` (``numerator / duration``), ``n`` and\n",
    "        ``collection`` (``func.__name__``).\n",
    "    '''\n",
    "    client = client or default_client()\n",
    "    client.restart()\n",
    "    n = sum(client.ncores().values())\n",
    "    stages = func(n)\n",
    "\n",
    "    # The first yield only announces the first stage; nothing is timed yet.\n",
    "    name, unit, numerator = next(stages)\n",
    "    out = []\n",
    "    finished = False\n",
    "    while not finished:\n",
    "        # perf_counter is monotonic, so a wall-clock adjustment during a\n",
    "        # stage cannot produce a negative or distorted duration.\n",
    "        start = time.perf_counter()\n",
    "        try:\n",
    "            upcoming = next(stages)\n",
    "        except StopIteration:\n",
    "            # Generator exhausted: the stage just timed was the last one.\n",
    "            upcoming = None\n",
    "            finished = True\n",
    "        duration = time.perf_counter() - start\n",
    "\n",
    "        # A stage may finish within one clock tick; report an infinite rate\n",
    "        # instead of raising ZeroDivisionError.\n",
    "        rate = numerator / duration if duration > 0 else float('inf')\n",
    "        out.append({'name': name,\n",
    "                    'duration': duration,\n",
    "                    'unit': unit + '/s',\n",
    "                    'rate': rate,\n",
    "                    'n': n,\n",
    "                    'collection': func.__name__})\n",
    "        if not finished:\n",
    "            name, unit, numerator = upcoming\n",
    "    return pd.DataFrame(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import operator\n",
    "import time\n",
    "\n",
    "def slowinc(x, delay=0.1):\n",
    "    '''Sleep for ``delay`` seconds, then return ``x + 1``.'''\n",
    "    time.sleep(delay)\n",
    "    incremented = x + 1\n",
    "    return incremented\n",
    "\n",
    "\n",
    "def slowadd(x, y, delay=0.1):\n",
    "    '''Sleep for ``delay`` seconds, then return ``x + y``.'''\n",
    "    time.sleep(delay)\n",
    "    total = x + y\n",
    "    return total\n",
    "\n",
    "\n",
    "def slowsum(L, delay=0.1):\n",
    "    '''Sleep for ``delay`` seconds, then return ``sum(L)``.'''\n",
    "    time.sleep(delay)\n",
    "    total = sum(L)\n",
    "    return total\n",
    "\n",
    "\n",
    "def inc(x):\n",
    "    '''Return ``x + 1`` with no artificial delay.'''\n",
    "    incremented = x + 1\n",
    "    return incremented\n",
    "\n",
    "\n",
    "def _tree_reduce(binop, size):\n",
    "    '''Build a pairwise reduction tree over ``range(size)`` and compute its root.\n",
    "\n",
    "    Each level combines neighbouring elements with ``delayed(binop)``.  A\n",
    "    level of odd length loses its last element because ``map`` stops at the\n",
    "    shorter input, so ``size`` should be a power of two for a complete tree\n",
    "    of ``size - 1`` tasks.\n",
    "    '''\n",
    "    from dask import delayed\n",
    "\n",
    "    level = range(size)\n",
    "    while len(level) > 1:\n",
    "        level = list(map(delayed(binop), level[0::2], level[1::2]))\n",
    "    level[0].compute()\n",
    "\n",
    "\n",
    "def _dynamic_tree_reduce(leaf_func, combine, size):\n",
    "    '''Map ``leaf_func`` over ``range(size)`` and merge futures as they finish.\n",
    "\n",
    "    Finished futures are pulled in batches; every batch is handed to\n",
    "    ``combine`` as a new task whose future is added back to the pool.  A\n",
    "    batch holding a single future is extended with the next batch so that a\n",
    "    merge never has just one input.  The loop ends when the batch iterator\n",
    "    raises StopIteration.  Uses the notebook-global ``client``.\n",
    "    '''\n",
    "    from dask.distributed import as_completed\n",
    "\n",
    "    futures = client.map(leaf_func, range(size))\n",
    "    pool = as_completed(futures)\n",
    "    batches = pool.batches()\n",
    "    while True:\n",
    "        try:\n",
    "            batch = next(batches)\n",
    "            if len(batch) == 1:\n",
    "                batch += next(batches)\n",
    "        except StopIteration:\n",
    "            break\n",
    "        pool.add(client.submit(combine, batch))\n",
    "\n",
    "\n",
    "def tasks(n):\n",
    "    '''Yield benchmark stages that exercise raw task scheduling.\n",
    "\n",
    "    Each ``yield`` produces ``(name, unit, numerator)`` for the stage whose\n",
    "    work follows it, up to the next ``yield`` or the end of the generator.\n",
    "    ``numerator`` is the approximate number of tasks in that stage and ``n``\n",
    "    scales every stage.\n",
    "\n",
    "    NOTE: the stages use the notebook-global ``client`` (plus ``wait``,\n",
    "    ``inc``, ``slowinc``, ``slowadd``, ``slowsum`` and ``operator``), so a\n",
    "    global named ``client`` must exist before the generator is advanced.\n",
    "    '''\n",
    "    # Runs before the first yield, i.e. outside of any timed stage.\n",
    "    from dask import delayed\n",
    "\n",
    "    yield 'task map fast tasks', 'tasks', n * 200\n",
    "\n",
    "    futures = client.map(inc, range(n * 200))\n",
    "    wait(futures)\n",
    "\n",
    "    yield 'task map 100ms tasks', 'tasks', n * 100\n",
    "\n",
    "    futures = client.map(slowinc, range(100 * n))\n",
    "    wait(futures)\n",
    "\n",
    "    yield 'task map 1s tasks', 'tasks', n * 4\n",
    "\n",
    "    futures = client.map(slowinc, range(4 * n), delay=1)\n",
    "    wait(futures)\n",
    "\n",
    "    yield 'tree reduction fast tasks', 'tasks', 2**7 * n\n",
    "\n",
    "    _tree_reduce(operator.add, 2**7 * n)\n",
    "\n",
    "    # A tree over 2**6 * n leaves has about 2**6 * n tasks.  The previous\n",
    "    # numerator (2**6 * n * 2) doubled the reported rate.\n",
    "    yield 'tree reduction 100ms tasks', 'tasks', 2**6 * n\n",
    "\n",
    "    _tree_reduce(slowadd, 2**6 * n)\n",
    "\n",
    "    yield 'sequential', 'tasks', 100\n",
    "\n",
    "    x = 1\n",
    "    for i in range(100):\n",
    "        x = delayed(inc)(x)\n",
    "    x.compute()\n",
    "\n",
    "    yield 'dynamic tree reduction fast tasks', 'tasks', 100 * n\n",
    "\n",
    "    _dynamic_tree_reduce(inc, sum, n * 100)\n",
    "\n",
    "    # Only 20 * n leaves are mapped in this stage; the numerator counts the\n",
    "    # leaves, as in the fast variant above (it was 100 * n before).\n",
    "    yield 'dynamic tree reduction 100ms tasks', 'tasks', 20 * n\n",
    "\n",
    "    _dynamic_tree_reduce(slowinc, slowsum, n * 20)\n",
    "\n",
    "    yield 'nearest neighbor fast tasks', 'tasks', 100 * n * 2\n",
    "\n",
    "    # Two rounds of combining each element with its right-hand neighbour.\n",
    "    L = range(100 * n)\n",
    "    L = client.map(operator.add, L[:-1], L[1:])\n",
    "    L = client.map(operator.add, L[:-1], L[1:])\n",
    "    wait(L)\n",
    "\n",
    "    yield 'nearest neighbor 100ms tasks', 'tasks', 20 * n * 2\n",
    "\n",
    "    L = range(20 * n)\n",
    "    L = client.map(slowadd, L[:-1], L[1:])\n",
    "    L = client.map(slowadd, L[:-1], L[1:])\n",
    "    wait(L)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def arrays(n):\n",
    "    '''Yield benchmark stages that exercise dask.array on a square array.\n",
    "\n",
    "    Each ``yield`` produces ``(name, unit, numerator)`` for the stage whose\n",
    "    work follows it, up to the next ``yield`` or the end of the generator.\n",
    "    Most stages report the size of the array in MB; the random-access stage\n",
    "    reports 8 bytes.\n",
    "\n",
    "    Relies on ``math``, ``wait``, ``inc`` and ``slowinc`` being defined\n",
    "    elsewhere in the notebook.\n",
    "    '''\n",
    "    import dask.array as da\n",
    "\n",
    "    # The side grows with sqrt(n), so the element count grows linearly in n.\n",
    "    side = int(5000 * math.sqrt(n))\n",
    "    x = da.random.randint(0, 10000, size=(side, side), chunks=(2000, 2000))\n",
    "    megabytes = x.nbytes / 1e6  # fixed by the shape, so computed once\n",
    "\n",
    "    yield 'create random', 'MB', megabytes\n",
    "\n",
    "    x = x.persist()\n",
    "    wait(x)\n",
    "\n",
    "    yield 'blockwise 100ms tasks', 'MB', megabytes\n",
    "\n",
    "    result = x.map_blocks(slowinc, dtype=x.dtype).persist()\n",
    "    wait(result)\n",
    "\n",
    "    yield 'random access', 'bytes', 8\n",
    "\n",
    "    x[1234, 4567].compute()\n",
    "\n",
    "    yield 'reduction', 'MB', megabytes\n",
    "\n",
    "    x.std().compute()\n",
    "\n",
    "    yield 'reduction along axis', 'MB', megabytes\n",
    "\n",
    "    x.std(axis=0).compute()\n",
    "\n",
    "    yield 'elementwise computation', 'MB', megabytes\n",
    "\n",
    "    result = da.sin(x) ** 2 + da.cos(x) ** 2\n",
    "    result = result.persist()\n",
    "    wait(result)\n",
    "\n",
    "    yield 'rechunk small', 'MB', megabytes\n",
    "\n",
    "    result = x.rechunk((20000, 200)).persist()\n",
    "    wait(result)\n",
    "\n",
    "    yield 'rechunk large', 'MB', megabytes\n",
    "\n",
    "    # Re-chunks the output of the previous stage, not x itself.\n",
    "    result = result.rechunk((200, 20000)).persist()\n",
    "    wait(result)\n",
    "\n",
    "    yield 'transpose addition', 'MB', megabytes\n",
    "\n",
    "    result = x + x.T\n",
    "    result = result.persist()\n",
    "    wait(result)\n",
    "\n",
    "    yield 'nearest neighbor fast tasks', 'MB', megabytes\n",
    "\n",
    "    result = x.map_overlap(inc, depth=1).persist()\n",
    "    wait(result)\n",
    "\n",
    "    yield 'nearest neighbor 100ms tasks', 'MB', megabytes\n",
    "\n",
    "    result = x.map_overlap(slowinc, depth=1, delay=0.1).persist()\n",
    "    wait(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def dataframes(n):\n",
    "    '''Yield benchmark stages that exercise dask.dataframe.\n",
    "\n",
    "    Each ``yield`` produces ``(name, unit, numerator)`` for the stage whose\n",
    "    work follows it, up to the next ``yield`` or the end of the generator.\n",
    "    The frame is built from a random integer dask array with ``2000000 * n``\n",
    "    rows and 10 columns.\n",
    "\n",
    "    Relies on ``wait`` and ``slowinc`` being defined elsewhere in the\n",
    "    notebook.\n",
    "    '''\n",
    "    import dask.array as da\n",
    "    import dask.dataframe as dd\n",
    "\n",
    "    N = 2000000 * n\n",
    "    x = da.random.randint(0, 10000, size=(N, 10), chunks=(1000000, 10))\n",
    "    megabytes = x.nbytes / 1e6  # fixed by the shape, so computed once\n",
    "\n",
    "    yield 'create random', 'MB', megabytes\n",
    "\n",
    "    df = dd.from_dask_array(x).persist()\n",
    "    wait(df)\n",
    "\n",
    "    yield 'blockwise 100ms tasks', 'MB', megabytes\n",
    "\n",
    "    wait(df.map_partitions(slowinc, meta=df).persist())\n",
    "\n",
    "    yield 'arithmetic', 'MB', megabytes\n",
    "\n",
    "    y = (df[0] + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10).persist()\n",
    "    wait(y)\n",
    "\n",
    "    yield 'random access', 'bytes', 8\n",
    "\n",
    "    df.loc[123456].compute()\n",
    "\n",
    "    yield 'dataframe reduction', 'MB', megabytes\n",
    "\n",
    "    df.std().compute()\n",
    "\n",
    "    # One of the ten columns, hence a tenth of the data.\n",
    "    yield 'series reduction', 'MB', megabytes / 10\n",
    "\n",
    "    df[3].std().compute()\n",
    "\n",
    "    yield 'groupby reduction', 'MB', megabytes\n",
    "\n",
    "    df.groupby(0)[1].mean().compute()\n",
    "\n",
    "    yield 'groupby apply (full shuffle)', 'MB', megabytes\n",
    "\n",
    "    # len() gives one integer per group, i.e. an unnamed integer Series.\n",
    "    # Declaring that through meta avoids the 'meta is not specified'\n",
    "    # UserWarning and keeps meta inference out of the timed stage.\n",
    "    df.groupby(0).apply(len, meta=(None, 'int64')).compute()\n",
    "\n",
    "    yield 'set index (full shuffle)', 'MB', megabytes\n",
    "\n",
    "    wait(df.set_index(1).persist())\n",
    "\n",
    "    yield 'rolling aggregations', 'MB', megabytes\n",
    "\n",
    "    wait(df.rolling(5).mean().persist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Accumulator for benchmark results: the driver loop appends one DataFrame\n",
    "# per run(...) call and the frames are concatenated afterwards.\n",
    "# Re-running this cell discards everything collected so far.\n",
    "L = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 tasks\n",
      "1 arrays\n",
      "1 dataframes\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:41: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
      "  Before: .apply(func)\n",
      "  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
      "  or:     .apply(func, meta=('x', 'f8'))            for series result\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 tasks\n",
      "1 arrays\n",
      "1 dataframes\n",
      "2 tasks\n",
      "2 arrays\n",
      "2 dataframes\n",
      "2 tasks\n",
      "2 arrays\n",
      "2 dataframes\n",
      "4 tasks\n",
      "4 arrays\n",
      "4 dataframes\n",
      "4 tasks\n",
      "4 arrays\n",
      "4 dataframes\n",
      "CPU times: user 1min 15s, sys: 8.1 s, total: 1min 23s\n",
      "Wall time: 4min 58s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Sweep over worker counts, using a fresh Client (one thread per worker,\n",
    "# separate processes) for each count; the context manager closes it again.\n",
    "for n_workers in [1, 2, 4]:\n",
    "    # The name `client` matters: tasks() reads the global `client` directly\n",
    "    # instead of receiving it as an argument.\n",
    "    with Client(n_workers=n_workers, threads_per_worker=1, processes=True, diagnostics_port=None) as client:\n",
    "        # Every benchmark runs twice per worker count and both results are\n",
    "        # kept (presumably a warm-up pass plus a repeat -- TODO confirm).\n",
    "        for i in range(2):\n",
    "            for func in [tasks, arrays, dataframes]:\n",
    "                print(n_workers, func.__name__)\n",
    "                df = run(func, client=client)\n",
    "                L.append(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stack the per-run result frames into a single table.  Despite its name,\n",
    "# `ddf` is a plain pandas DataFrame, not a dask dataframe.\n",
    "ddf = pd.concat(L)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>duration</th>\n",
       "      <th>rate</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>collection</th>\n",
       "      <th>name</th>\n",
       "      <th>n</th>\n",
       "      <th>unit</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"30\" valign=\"top\">arrays</th>\n",
       "      <th rowspan=\"3\" valign=\"top\">blockwise 100ms tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>1.029480</td>\n",
       "      <td>194.274386</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.970075</td>\n",
       "      <td>412.724335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.911542</td>\n",
       "      <td>879.661395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">create random</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.405803</td>\n",
       "      <td>492.852099</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.583153</td>\n",
       "      <td>701.543327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.785832</td>\n",
       "      <td>1044.580052</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">elementwise computation</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>2.410327</td>\n",
       "      <td>83.223446</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>2.746347</td>\n",
       "      <td>145.645420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>3.498197</td>\n",
       "      <td>229.083358</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">nearest neighbor 100ms tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>1.367269</td>\n",
       "      <td>146.296656</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>1.314216</td>\n",
       "      <td>304.434504</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>1.342365</td>\n",
       "      <td>596.067880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">nearest neighbor fast tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.224858</td>\n",
       "      <td>911.479273</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.379917</td>\n",
       "      <td>1117.691475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.601148</td>\n",
       "      <td>1420.598421</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">random access</th>\n",
       "      <th>1</th>\n",
       "      <th>bytes/s</th>\n",
       "      <td>0.025998</td>\n",
       "      <td>358.347875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>bytes/s</th>\n",
       "      <td>0.054119</td>\n",
       "      <td>266.879176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>bytes/s</th>\n",
       "      <td>0.046264</td>\n",
       "      <td>172.956708</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">rechunk large</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.195920</td>\n",
       "      <td>1023.514836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.532372</td>\n",
       "      <td>751.343367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>1.636902</td>\n",
       "      <td>489.614913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">rechunk small</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.172437</td>\n",
       "      <td>1167.949136</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.241122</td>\n",
       "      <td>1659.630424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.965582</td>\n",
       "      <td>829.053704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reduction</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.235358</td>\n",
       "      <td>850.295775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.358469</td>\n",
       "      <td>1116.073630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.505442</td>\n",
       "      <td>1589.940599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">reduction along axis</th>\n",
       "      <th>1</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.238496</td>\n",
       "      <td>840.850606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.343045</td>\n",
       "      <td>1167.376920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>MB/s</th>\n",
       "      <td>0.495310</td>\n",
       "      <td>1616.628542</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"30\" valign=\"top\">tasks</th>\n",
       "      <th rowspan=\"3\" valign=\"top\">dynamic tree reduction 100ms tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>4.049874</td>\n",
       "      <td>24.692124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>4.272979</td>\n",
       "      <td>46.805757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>3.752805</td>\n",
       "      <td>106.844621</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">dynamic tree reduction fast tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.123307</td>\n",
       "      <td>815.427446</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.182649</td>\n",
       "      <td>1105.414498</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.456500</td>\n",
       "      <td>902.061964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">nearest neighbor 100ms tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>3.858868</td>\n",
       "      <td>10.366202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>4.147557</td>\n",
       "      <td>19.288811</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>4.227893</td>\n",
       "      <td>37.846582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">nearest neighbor fast tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.173912</td>\n",
       "      <td>1167.974119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.382527</td>\n",
       "      <td>1057.424268</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.537346</td>\n",
       "      <td>1542.613544</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">sequential</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.587582</td>\n",
       "      <td>170.592242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.566866</td>\n",
       "      <td>176.410216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.555918</td>\n",
       "      <td>179.977319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">task map 100ms tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>10.339996</td>\n",
       "      <td>9.671213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>10.351813</td>\n",
       "      <td>19.320287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>10.349894</td>\n",
       "      <td>38.649526</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">task map 1s tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>4.051148</td>\n",
       "      <td>0.987380</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>4.063326</td>\n",
       "      <td>1.968832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>4.064835</td>\n",
       "      <td>3.936216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">task map fast tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.166153</td>\n",
       "      <td>1244.405187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.149536</td>\n",
       "      <td>2675.049581</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.428613</td>\n",
       "      <td>1906.359809</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">tree reduction 100ms tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>6.585549</td>\n",
       "      <td>19.436496</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>7.040561</td>\n",
       "      <td>36.360751</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>7.320610</td>\n",
       "      <td>69.946284</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">tree reduction fast tasks</th>\n",
       "      <th>1</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.179473</td>\n",
       "      <td>733.527981</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.280910</td>\n",
       "      <td>921.576918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <th>tasks/s</th>\n",
       "      <td>0.388071</td>\n",
       "      <td>1326.426160</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>93 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                          duration  \\\n",
       "collection name                               n unit                 \n",
       "arrays     blockwise 100ms tasks              1 MB/s      1.029480   \n",
       "                                              2 MB/s      0.970075   \n",
       "                                              4 MB/s      0.911542   \n",
       "           create random                      1 MB/s      0.405803   \n",
       "                                              2 MB/s      0.583153   \n",
       "                                              4 MB/s      0.785832   \n",
       "           elementwise computation            1 MB/s      2.410327   \n",
       "                                              2 MB/s      2.746347   \n",
       "                                              4 MB/s      3.498197   \n",
       "           nearest neighbor 100ms tasks       1 MB/s      1.367269   \n",
       "                                              2 MB/s      1.314216   \n",
       "                                              4 MB/s      1.342365   \n",
       "           nearest neighbor fast tasks        1 MB/s      0.224858   \n",
       "                                              2 MB/s      0.379917   \n",
       "                                              4 MB/s      0.601148   \n",
       "           random access                      1 bytes/s   0.025998   \n",
       "                                              2 bytes/s   0.054119   \n",
       "                                              4 bytes/s   0.046264   \n",
       "           rechunk large                      1 MB/s      0.195920   \n",
       "                                              2 MB/s      0.532372   \n",
       "                                              4 MB/s      1.636902   \n",
       "           rechunk small                      1 MB/s      0.172437   \n",
       "                                              2 MB/s      0.241122   \n",
       "                                              4 MB/s      0.965582   \n",
       "           reduction                          1 MB/s      0.235358   \n",
       "                                              2 MB/s      0.358469   \n",
       "                                              4 MB/s      0.505442   \n",
       "           reduction along axis               1 MB/s      0.238496   \n",
       "                                              2 MB/s      0.343045   \n",
       "                                              4 MB/s      0.495310   \n",
       "...                                                            ...   \n",
       "tasks      dynamic tree reduction 100ms tasks 1 tasks/s   4.049874   \n",
       "                                              2 tasks/s   4.272979   \n",
       "                                              4 tasks/s   3.752805   \n",
       "           dynamic tree reduction fast tasks  1 tasks/s   0.123307   \n",
       "                                              2 tasks/s   0.182649   \n",
       "                                              4 tasks/s   0.456500   \n",
       "           nearest neighbor 100ms tasks       1 tasks/s   3.858868   \n",
       "                                              2 tasks/s   4.147557   \n",
       "                                              4 tasks/s   4.227893   \n",
       "           nearest neighbor fast tasks        1 tasks/s   0.173912   \n",
       "                                              2 tasks/s   0.382527   \n",
       "                                              4 tasks/s   0.537346   \n",
       "           sequential                         1 tasks/s   0.587582   \n",
       "                                              2 tasks/s   0.566866   \n",
       "                                              4 tasks/s   0.555918   \n",
       "           task map 100ms tasks               1 tasks/s  10.339996   \n",
       "                                              2 tasks/s  10.351813   \n",
       "                                              4 tasks/s  10.349894   \n",
       "           task map 1s tasks                  1 tasks/s   4.051148   \n",
       "                                              2 tasks/s   4.063326   \n",
       "                                              4 tasks/s   4.064835   \n",
       "           task map fast tasks                1 tasks/s   0.166153   \n",
       "                                              2 tasks/s   0.149536   \n",
       "                                              4 tasks/s   0.428613   \n",
       "           tree reduction 100ms tasks         1 tasks/s   6.585549   \n",
       "                                              2 tasks/s   7.040561   \n",
       "                                              4 tasks/s   7.320610   \n",
       "           tree reduction fast tasks          1 tasks/s   0.179473   \n",
       "                                              2 tasks/s   0.280910   \n",
       "                                              4 tasks/s   0.388071   \n",
       "\n",
       "                                                                rate  \n",
       "collection name                               n unit                  \n",
       "arrays     blockwise 100ms tasks              1 MB/s      194.274386  \n",
       "                                              2 MB/s      412.724335  \n",
       "                                              4 MB/s      879.661395  \n",
       "           create random                      1 MB/s      492.852099  \n",
       "                                              2 MB/s      701.543327  \n",
       "                                              4 MB/s     1044.580052  \n",
       "           elementwise computation            1 MB/s       83.223446  \n",
       "                                              2 MB/s      145.645420  \n",
       "                                              4 MB/s      229.083358  \n",
       "           nearest neighbor 100ms tasks       1 MB/s      146.296656  \n",
       "                                              2 MB/s      304.434504  \n",
       "                                              4 MB/s      596.067880  \n",
       "           nearest neighbor fast tasks        1 MB/s      911.479273  \n",
       "                                              2 MB/s     1117.691475  \n",
       "                                              4 MB/s     1420.598421  \n",
       "           random access                      1 bytes/s   358.347875  \n",
       "                                              2 bytes/s   266.879176  \n",
       "                                              4 bytes/s   172.956708  \n",
       "           rechunk large                      1 MB/s     1023.514836  \n",
       "                                              2 MB/s      751.343367  \n",
       "                                              4 MB/s      489.614913  \n",
       "           rechunk small                      1 MB/s     1167.949136  \n",
       "                                              2 MB/s     1659.630424  \n",
       "                                              4 MB/s      829.053704  \n",
       "           reduction                          1 MB/s      850.295775  \n",
       "                                              2 MB/s     1116.073630  \n",
       "                                              4 MB/s     1589.940599  \n",
       "           reduction along axis               1 MB/s      840.850606  \n",
       "                                              2 MB/s     1167.376920  \n",
       "                                              4 MB/s     1616.628542  \n",
       "...                                                              ...  \n",
       "tasks      dynamic tree reduction 100ms tasks 1 tasks/s    24.692124  \n",
       "                                              2 tasks/s    46.805757  \n",
       "                                              4 tasks/s   106.844621  \n",
       "           dynamic tree reduction fast tasks  1 tasks/s   815.427446  \n",
       "                                              2 tasks/s  1105.414498  \n",
       "                                              4 tasks/s   902.061964  \n",
       "           nearest neighbor 100ms tasks       1 tasks/s    10.366202  \n",
       "                                              2 tasks/s    19.288811  \n",
       "                                              4 tasks/s    37.846582  \n",
       "           nearest neighbor fast tasks        1 tasks/s  1167.974119  \n",
       "                                              2 tasks/s  1057.424268  \n",
       "                                              4 tasks/s  1542.613544  \n",
       "           sequential                         1 tasks/s   170.592242  \n",
       "                                              2 tasks/s   176.410216  \n",
       "                                              4 tasks/s   179.977319  \n",
       "           task map 100ms tasks               1 tasks/s     9.671213  \n",
       "                                              2 tasks/s    19.320287  \n",
       "                                              4 tasks/s    38.649526  \n",
       "           task map 1s tasks                  1 tasks/s     0.987380  \n",
       "                                              2 tasks/s     1.968832  \n",
       "                                              4 tasks/s     3.936216  \n",
       "           task map fast tasks                1 tasks/s  1244.405187  \n",
       "                                              2 tasks/s  2675.049581  \n",
       "                                              4 tasks/s  1906.359809  \n",
       "           tree reduction 100ms tasks         1 tasks/s    19.436496  \n",
       "                                              2 tasks/s    36.360751  \n",
       "                                              4 tasks/s    69.946284  \n",
       "           tree reduction fast tasks          1 tasks/s   733.527981  \n",
       "                                              2 tasks/s   921.576918  \n",
       "                                              4 tasks/s  1326.426160  \n",
       "\n",
       "[93 rows x 2 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collapse ddf to one row per (collection, name, n, unit) group by taking the\n",
    "# mean of the non-key columns, then display the aggregated frame.\n",
    "group_keys = ['collection', 'name', 'n', 'unit']\n",
    "df = ddf.groupby(group_keys).mean()\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Write the aggregated table to a CSV file in the working directory.\n",
    "df.to_csv(path_or_buf='scaling-data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'cloud'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-a83d970e64f2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mgcsfs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mgcs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgcsfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGCSFileSystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'cloud'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mgcs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'scaling-data.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'dask-data/scaling-data-1.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/gcsfs/core.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, project, access, token, block_size)\u001b[0m\n\u001b[1;32m    155\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccess\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maccess\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    156\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdirs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 157\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    158\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_singleton\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/gcsfs/core.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self, refresh)\u001b[0m\n\u001b[1;32m    207\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    208\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;34m'type'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 209\u001b[0;31m                 \u001b[0mtoken\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_gtoken\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    210\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccess\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtoken\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    211\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccess\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/gcsfs/core.py\u001b[0m in \u001b[0;36m_parse_gtoken\u001b[0;34m(gt)\u001b[0m\n\u001b[1;32m    172\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_parse_gtoken\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    173\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 174\u001b[0;31m             \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    175\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    176\u001b[0m             \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'cloud'"
     ]
    }
   ],
   "source": [
    "import gcsfs\n",
    "\n",
    "# Copy the local CSV to the remote path via gcsfs.\n",
    "# NOTE(review): the saved output of this cell shows the installed gcsfs opening\n",
    "# token='cloud' as a file path and raising FileNotFoundError; confirm the\n",
    "# intended credential source before re-running.\n",
    "local_csv, remote_csv = 'scaling-data.csv', 'dask-data/scaling-data-1.csv'\n",
    "gcs = gcsfs.GCSFileSystem(token='cloud')\n",
    "gcs.put(local_csv, remote_csv)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}