Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import math, time\n",
- "from dask.distributed import Client, wait"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "from distributed.client import default_client\n",
- "import pandas as pd\n",
- "\n",
- "def run(func, client=None):\n",
- "    \"\"\"Time each stage of a benchmark generator and tabulate the results.\n",
- "\n",
- "    ``func(n)`` must be a generator. Every ``yield`` announces the stage that\n",
- "    follows it as a ``(name, unit, numerator)`` tuple; the stage itself is the\n",
- "    code that runs until the next ``yield`` (or the end of the generator).\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    func : callable\n",
- "        Generator function called with the total core count ``n``.\n",
- "    client : Client, optional\n",
- "        Cluster to benchmark; defaults to ``default_client()``. It is\n",
- "        restarted before the benchmark starts.\n",
- "\n",
- "    Returns\n",
- "    -------\n",
- "    pandas.DataFrame\n",
- "        One row per stage with the keys ``name``, ``duration`` (seconds),\n",
- "        ``unit`` (suffixed with ``/s``), ``rate`` (``numerator / duration``,\n",
- "        NaN when the measured duration is zero), ``n`` and ``collection``\n",
- "        (``func.__name__``). Empty when ``func`` yields no stage at all.\n",
- "    \"\"\"\n",
- "    client = client or default_client()\n",
- "    client.restart()\n",
- "    n = sum(client.ncores().values())\n",
- "    coroutine = func(n)\n",
- "\n",
- "    out = []\n",
- "    # The first resume only runs the generator's setup code and returns the\n",
- "    # announcement of the first stage; nothing is timed yet.\n",
- "    stage = next(coroutine, None)\n",
- "    while stage is not None:\n",
- "        name, unit, numerator = stage\n",
- "        start = time.time()\n",
- "        # Resuming the generator executes the announced stage and hands back\n",
- "        # the next announcement, or None once the generator is exhausted.\n",
- "        # An exception raised by the stage propagates without recording a row.\n",
- "        stage = next(coroutine, None)\n",
- "        duration = time.time() - start\n",
- "        out.append({'name': name,\n",
- "                    'duration': duration,\n",
- "                    'unit': unit + '/s',\n",
- "                    # A stage faster than the clock resolution has no defined rate.\n",
- "                    'rate': numerator / duration if duration > 0 else float('nan'),\n",
- "                    'n': n,\n",
- "                    'collection': func.__name__})\n",
- "    return pd.DataFrame(out)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import operator\n",
- "import time\n",
- "\n",
- "def slowinc(x, delay=0.1):\n",
- "    \"\"\"Sleep for ``delay`` seconds, then return ``x + 1``.\"\"\"\n",
- "    time.sleep(delay)\n",
- "    incremented = x + 1\n",
- "    return incremented\n",
- "\n",
- "def slowadd(x, y, delay=0.1):\n",
- "    \"\"\"Sleep for ``delay`` seconds, then return ``x + y``.\"\"\"\n",
- "    time.sleep(delay)\n",
- "    total = x + y\n",
- "    return total\n",
- "\n",
- "def slowsum(L, delay=0.1):\n",
- "    \"\"\"Sleep for ``delay`` seconds, then return ``sum(L)``.\"\"\"\n",
- "    time.sleep(delay)\n",
- "    total = sum(L)\n",
- "    return total\n",
- "\n",
- "def inc(x):\n",
- "    \"\"\"Return ``x + 1`` without any artificial delay.\"\"\"\n",
- "    incremented = x + 1\n",
- "    return incremented\n",
- "\n",
- "\n",
- "def _dynamic_tree_reduce(client, futures, reducer):\n",
- "    \"\"\"Fold completed futures together with ``reducer`` as they finish.\n",
- "\n",
- "    Each batch of finished futures is submitted to ``reducer`` and the new\n",
- "    future is added back to the pool; the loop ends when the pool is\n",
- "    exhausted.\n",
- "    \"\"\"\n",
- "    from dask.distributed import as_completed\n",
- "\n",
- "    pool = as_completed(futures)\n",
- "    batches = pool.batches()\n",
- "\n",
- "    while True:\n",
- "        try:\n",
- "            batch = next(batches)\n",
- "            if len(batch) == 1:\n",
- "                # Nothing to combine a lone future with yet; pull in the next batch.\n",
- "                batch += next(batches)\n",
- "        except StopIteration:\n",
- "            break\n",
- "        pool.add(client.submit(reducer, batch))\n",
- "\n",
- "\n",
- "def tasks(n, client=None):\n",
- "    \"\"\"Benchmark stages for the futures and ``dask.delayed`` interfaces.\n",
- "\n",
- "    Generator meant to be driven by ``run``: every ``yield`` announces the\n",
- "    stage that follows it as ``(name, unit, numerator)``, where ``numerator``\n",
- "    is the approximate number of tasks the stage executes.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    n : int\n",
- "        Total number of cores; every problem size is a multiple of it.\n",
- "    client : Client, optional\n",
- "        Client used for ``map`` and ``submit``. Defaults to\n",
- "        ``default_client()`` so the function no longer depends on a\n",
- "        notebook-level variable named ``client``.\n",
- "    \"\"\"\n",
- "    # Imports and client lookup run before the first yield, i.e. outside of\n",
- "    # every timed stage.\n",
- "    from dask import delayed\n",
- "\n",
- "    client = client or default_client()\n",
- "\n",
- "    yield 'task map fast tasks', 'tasks', n * 200\n",
- "\n",
- "    futures = client.map(inc, range(n * 200))\n",
- "    wait(futures)\n",
- "\n",
- "    yield 'task map 100ms tasks', 'tasks', n * 100\n",
- "\n",
- "    futures = client.map(slowinc, range(100 * n))\n",
- "    wait(futures)\n",
- "\n",
- "    yield 'task map 1s tasks', 'tasks', n * 4\n",
- "\n",
- "    futures = client.map(slowinc, range(4 * n), delay=1)\n",
- "    wait(futures)\n",
- "\n",
- "    yield 'tree reduction fast tasks', 'tasks', 2**7 * n\n",
- "\n",
- "    # Pairwise reduction: every pass halves the number of pending values.\n",
- "    L = range(2**7 * n)\n",
- "    while len(L) > 1:\n",
- "        L = list(map(delayed(operator.add), L[0::2], L[1::2]))\n",
- "\n",
- "    L[0].compute()\n",
- "\n",
- "    # 2**6 * n leaves reduce in 2**6 * n - 1 additions. The previous numerator\n",
- "    # (2**6 * n * 2) reported twice the number of tasks actually executed.\n",
- "    yield 'tree reduction 100ms tasks', 'tasks', 2**6 * n\n",
- "\n",
- "    L = range(2**6 * n)\n",
- "    while len(L) > 1:\n",
- "        L = list(map(delayed(slowadd), L[0::2], L[1::2]))\n",
- "\n",
- "    L[0].compute()\n",
- "\n",
- "    yield 'sequential', 'tasks', 100\n",
- "\n",
- "    # A chain of 100 dependent increments: no parallelism is available.\n",
- "    x = 1\n",
- "\n",
- "    for i in range(100):\n",
- "        x = delayed(inc)(x)\n",
- "\n",
- "    x.compute()\n",
- "\n",
- "    yield 'dynamic tree reduction fast tasks', 'tasks', 100 * n\n",
- "\n",
- "    futures = client.map(inc, range(n * 100))\n",
- "    _dynamic_tree_reduce(client, futures, sum)\n",
- "\n",
- "    # Only the 20 * n mapped tasks are counted, like the fast variant above.\n",
- "    # The previous numerator (100 * n) was copied from that variant although\n",
- "    # five times fewer tasks are mapped here.\n",
- "    yield 'dynamic tree reduction 100ms tasks', 'tasks', 20 * n\n",
- "\n",
- "    futures = client.map(slowinc, range(n * 20))\n",
- "    _dynamic_tree_reduce(client, futures, slowsum)\n",
- "\n",
- "    yield 'nearest neighbor fast tasks', 'tasks', 100 * n * 2\n",
- "\n",
- "    # Two layers in which every task combines two adjacent results.\n",
- "    L = range(100 * n)\n",
- "    L = client.map(operator.add, L[:-1], L[1:])\n",
- "    L = client.map(operator.add, L[:-1], L[1:])\n",
- "    wait(L)\n",
- "\n",
- "    yield 'nearest neighbor 100ms tasks', 'tasks', 20 * n * 2\n",
- "\n",
- "    L = range(20 * n)\n",
- "    L = client.map(slowadd, L[:-1], L[1:])\n",
- "    L = client.map(slowadd, L[:-1], L[1:])\n",
- "    wait(L)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def arrays(n):\n",
- "    \"\"\"Benchmark stages for ``dask.array``.\n",
- "\n",
- "    Generator meant to be driven by ``run``: every ``yield`` announces the\n",
- "    stage that follows it as ``(name, unit, numerator)``. The input is a\n",
- "    square random-integer array whose side is ``int(5000 * sqrt(n))``,\n",
- "    split into 2000 x 2000 chunks.\n",
- "    \"\"\"\n",
- "    import dask.array as da\n",
- "\n",
- "    side = int(5000 * math.sqrt(n))\n",
- "    x = da.random.randint(0, 10000, size=(side, side), chunks=(2000, 2000))\n",
- "    # The same array size is the numerator of almost every stage below.\n",
- "    megabytes = x.nbytes / 1e6\n",
- "\n",
- "    yield 'create random', 'MB', megabytes\n",
- "\n",
- "    x = x.persist()\n",
- "    wait(x)\n",
- "\n",
- "    yield 'blockwise 100ms tasks', 'MB', megabytes\n",
- "\n",
- "    result = x.map_blocks(slowinc, dtype=x.dtype).persist()\n",
- "    wait(result)\n",
- "\n",
- "    yield 'random access', 'bytes', 8\n",
- "\n",
- "    x[1234, 4567].compute()\n",
- "\n",
- "    yield 'reduction', 'MB', megabytes\n",
- "\n",
- "    x.std().compute()\n",
- "\n",
- "    yield 'reduction along axis', 'MB', megabytes\n",
- "\n",
- "    x.std(axis=0).compute()\n",
- "\n",
- "    yield 'elementwise computation', 'MB', megabytes\n",
- "\n",
- "    result = da.sin(x) ** 2 + da.cos(x) ** 2\n",
- "    result = result.persist()\n",
- "    wait(result)\n",
- "\n",
- "    yield 'rechunk small', 'MB', megabytes\n",
- "\n",
- "    result = x.rechunk((20000, 200)).persist()\n",
- "    wait(result)\n",
- "\n",
- "    yield 'rechunk large', 'MB', megabytes\n",
- "\n",
- "    # Starts from the output of the previous rechunk, not from x.\n",
- "    result = result.rechunk((200, 20000)).persist()\n",
- "    wait(result)\n",
- "\n",
- "    yield 'transpose addition', 'MB', megabytes\n",
- "\n",
- "    result = x + x.T\n",
- "    result = result.persist()\n",
- "    wait(result)\n",
- "\n",
- "    yield 'nearest neighbor fast tasks', 'MB', megabytes\n",
- "\n",
- "    result = x.map_overlap(inc, depth=1).persist()\n",
- "    wait(result)\n",
- "\n",
- "    yield 'nearest neighbor 100ms tasks', 'MB', megabytes\n",
- "\n",
- "    result = x.map_overlap(slowinc, depth=1, delay=0.1).persist()\n",
- "    wait(result)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def dataframes(n):\n",
- "    \"\"\"Benchmark stages for ``dask.dataframe``.\n",
- "\n",
- "    Generator meant to be driven by ``run``: every ``yield`` announces the\n",
- "    stage that follows it as ``(name, unit, numerator)``. The frame is built\n",
- "    from a random-integer array with ``2000000 * n`` rows and 10 columns,\n",
- "    chunked every 1000000 rows.\n",
- "    \"\"\"\n",
- "    import dask.array as da\n",
- "    import dask.dataframe as dd\n",
- "\n",
- "    rows = 2000000 * n\n",
- "    x = da.random.randint(0, 10000, size=(rows, 10), chunks=(1000000, 10))\n",
- "    # The same array size is the numerator of almost every stage below.\n",
- "    megabytes = x.nbytes / 1e6\n",
- "\n",
- "    yield 'create random', 'MB', megabytes\n",
- "\n",
- "    df = dd.from_dask_array(x).persist()\n",
- "    wait(df)\n",
- "\n",
- "    yield 'blockwise 100ms tasks', 'MB', megabytes\n",
- "\n",
- "    wait(df.map_partitions(slowinc, meta=df).persist())\n",
- "\n",
- "    yield 'arithmetic', 'MB', megabytes\n",
- "\n",
- "    shifted = (df[0] + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10).persist()\n",
- "    wait(shifted)\n",
- "\n",
- "    yield 'random access', 'bytes', 8\n",
- "\n",
- "    df.loc[123456].compute()\n",
- "\n",
- "    yield 'dataframe reduction', 'MB', megabytes\n",
- "\n",
- "    df.std().compute()\n",
- "\n",
- "    # A single one of the ten columns is reduced, hence a tenth of the data.\n",
- "    yield 'series reduction', 'MB', megabytes / 10\n",
- "\n",
- "    df[3].std().compute()\n",
- "\n",
- "    yield 'groupby reduction', 'MB', megabytes\n",
- "\n",
- "    df.groupby(0)[1].mean().compute()\n",
- "\n",
- "    yield 'groupby apply (full shuffle)', 'MB', megabytes\n",
- "\n",
- "    df.groupby(0).apply(len).compute()\n",
- "\n",
- "    yield 'set index (full shuffle)', 'MB', megabytes\n",
- "\n",
- "    wait(df.set_index(1).persist())\n",
- "\n",
- "    yield 'rolling aggregations', 'MB', megabytes\n",
- "\n",
- "    wait(df.rolling(5).mean().persist())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Collects one result DataFrame per run() call made by the benchmark loop.\n",
- "L = []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "collapsed": false,
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1 tasks\n",
- "1 arrays\n",
- "1 dataframes\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:41: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
- " Before: .apply(func)\n",
- " After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
- " or: .apply(func, meta=('x', 'f8')) for series result\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1 tasks\n",
- "1 arrays\n",
- "1 dataframes\n",
- "2 tasks\n",
- "2 arrays\n",
- "2 dataframes\n",
- "2 tasks\n",
- "2 arrays\n",
- "2 dataframes\n",
- "4 tasks\n",
- "4 arrays\n",
- "4 dataframes\n",
- "4 tasks\n",
- "4 arrays\n",
- "4 dataframes\n",
- "CPU times: user 1min 15s, sys: 8.1 s, total: 1min 23s\n",
- "Wall time: 4min 58s\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "# Run every benchmark on local clusters of 1, 2 and 4 single-threaded worker processes.\n",
- "for n_workers in [1, 2, 4]:\n",
- " with Client(n_workers=n_workers, threads_per_worker=1, processes=True, diagnostics_port=None) as client:\n",
- " # Each benchmark is executed twice per cluster size; every result table is appended to L.\n",
- " for i in range(2):\n",
- " for func in [tasks, arrays, dataframes]:\n",
- " print(n_workers, func.__name__)\n",
- " df = run(func, client=client)\n",
- " L.append(df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Stack the per-run result tables collected in L into a single frame.\n",
- "ddf = pd.concat(L)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "collapsed": false,
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th>duration</th>\n",
- " <th>rate</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>collection</th>\n",
- " <th>name</th>\n",
- " <th>n</th>\n",
- " <th>unit</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th rowspan=\"30\" valign=\"top\">arrays</th>\n",
- " <th rowspan=\"3\" valign=\"top\">blockwise 100ms tasks</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>1.029480</td>\n",
- " <td>194.274386</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.970075</td>\n",
- " <td>412.724335</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.911542</td>\n",
- " <td>879.661395</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">create random</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.405803</td>\n",
- " <td>492.852099</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.583153</td>\n",
- " <td>701.543327</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.785832</td>\n",
- " <td>1044.580052</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">elementwise computation</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>2.410327</td>\n",
- " <td>83.223446</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>2.746347</td>\n",
- " <td>145.645420</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>3.498197</td>\n",
- " <td>229.083358</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">nearest neighbor 100ms tasks</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>1.367269</td>\n",
- " <td>146.296656</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>1.314216</td>\n",
- " <td>304.434504</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>1.342365</td>\n",
- " <td>596.067880</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">nearest neighbor fast tasks</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.224858</td>\n",
- " <td>911.479273</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.379917</td>\n",
- " <td>1117.691475</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.601148</td>\n",
- " <td>1420.598421</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">random access</th>\n",
- " <th>1</th>\n",
- " <th>bytes/s</th>\n",
- " <td>0.025998</td>\n",
- " <td>358.347875</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>bytes/s</th>\n",
- " <td>0.054119</td>\n",
- " <td>266.879176</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>bytes/s</th>\n",
- " <td>0.046264</td>\n",
- " <td>172.956708</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">rechunk large</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.195920</td>\n",
- " <td>1023.514836</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.532372</td>\n",
- " <td>751.343367</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>1.636902</td>\n",
- " <td>489.614913</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">rechunk small</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.172437</td>\n",
- " <td>1167.949136</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.241122</td>\n",
- " <td>1659.630424</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.965582</td>\n",
- " <td>829.053704</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">reduction</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.235358</td>\n",
- " <td>850.295775</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.358469</td>\n",
- " <td>1116.073630</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.505442</td>\n",
- " <td>1589.940599</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">reduction along axis</th>\n",
- " <th>1</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.238496</td>\n",
- " <td>840.850606</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.343045</td>\n",
- " <td>1167.376920</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>MB/s</th>\n",
- " <td>0.495310</td>\n",
- " <td>1616.628542</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <th>...</th>\n",
- " <th>...</th>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"30\" valign=\"top\">tasks</th>\n",
- " <th rowspan=\"3\" valign=\"top\">dynamic tree reduction 100ms tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>4.049874</td>\n",
- " <td>24.692124</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>4.272979</td>\n",
- " <td>46.805757</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>3.752805</td>\n",
- " <td>106.844621</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">dynamic tree reduction fast tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.123307</td>\n",
- " <td>815.427446</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.182649</td>\n",
- " <td>1105.414498</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.456500</td>\n",
- " <td>902.061964</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">nearest neighbor 100ms tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>3.858868</td>\n",
- " <td>10.366202</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>4.147557</td>\n",
- " <td>19.288811</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>4.227893</td>\n",
- " <td>37.846582</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">nearest neighbor fast tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.173912</td>\n",
- " <td>1167.974119</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.382527</td>\n",
- " <td>1057.424268</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.537346</td>\n",
- " <td>1542.613544</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">sequential</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.587582</td>\n",
- " <td>170.592242</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.566866</td>\n",
- " <td>176.410216</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.555918</td>\n",
- " <td>179.977319</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">task map 100ms tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>10.339996</td>\n",
- " <td>9.671213</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>10.351813</td>\n",
- " <td>19.320287</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>10.349894</td>\n",
- " <td>38.649526</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">task map 1s tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>4.051148</td>\n",
- " <td>0.987380</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>4.063326</td>\n",
- " <td>1.968832</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>4.064835</td>\n",
- " <td>3.936216</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">task map fast tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.166153</td>\n",
- " <td>1244.405187</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.149536</td>\n",
- " <td>2675.049581</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.428613</td>\n",
- " <td>1906.359809</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">tree reduction 100ms tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>6.585549</td>\n",
- " <td>19.436496</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>7.040561</td>\n",
- " <td>36.360751</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>7.320610</td>\n",
- " <td>69.946284</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th rowspan=\"3\" valign=\"top\">tree reduction fast tasks</th>\n",
- " <th>1</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.179473</td>\n",
- " <td>733.527981</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.280910</td>\n",
- " <td>921.576918</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <th>tasks/s</th>\n",
- " <td>0.388071</td>\n",
- " <td>1326.426160</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>93 rows × 2 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " duration \\\n",
- "collection name n unit \n",
- "arrays blockwise 100ms tasks 1 MB/s 1.029480 \n",
- " 2 MB/s 0.970075 \n",
- " 4 MB/s 0.911542 \n",
- " create random 1 MB/s 0.405803 \n",
- " 2 MB/s 0.583153 \n",
- " 4 MB/s 0.785832 \n",
- " elementwise computation 1 MB/s 2.410327 \n",
- " 2 MB/s 2.746347 \n",
- " 4 MB/s 3.498197 \n",
- " nearest neighbor 100ms tasks 1 MB/s 1.367269 \n",
- " 2 MB/s 1.314216 \n",
- " 4 MB/s 1.342365 \n",
- " nearest neighbor fast tasks 1 MB/s 0.224858 \n",
- " 2 MB/s 0.379917 \n",
- " 4 MB/s 0.601148 \n",
- " random access 1 bytes/s 0.025998 \n",
- " 2 bytes/s 0.054119 \n",
- " 4 bytes/s 0.046264 \n",
- " rechunk large 1 MB/s 0.195920 \n",
- " 2 MB/s 0.532372 \n",
- " 4 MB/s 1.636902 \n",
- " rechunk small 1 MB/s 0.172437 \n",
- " 2 MB/s 0.241122 \n",
- " 4 MB/s 0.965582 \n",
- " reduction 1 MB/s 0.235358 \n",
- " 2 MB/s 0.358469 \n",
- " 4 MB/s 0.505442 \n",
- " reduction along axis 1 MB/s 0.238496 \n",
- " 2 MB/s 0.343045 \n",
- " 4 MB/s 0.495310 \n",
- "... ... \n",
- "tasks dynamic tree reduction 100ms tasks 1 tasks/s 4.049874 \n",
- " 2 tasks/s 4.272979 \n",
- " 4 tasks/s 3.752805 \n",
- " dynamic tree reduction fast tasks 1 tasks/s 0.123307 \n",
- " 2 tasks/s 0.182649 \n",
- " 4 tasks/s 0.456500 \n",
- " nearest neighbor 100ms tasks 1 tasks/s 3.858868 \n",
- " 2 tasks/s 4.147557 \n",
- " 4 tasks/s 4.227893 \n",
- " nearest neighbor fast tasks 1 tasks/s 0.173912 \n",
- " 2 tasks/s 0.382527 \n",
- " 4 tasks/s 0.537346 \n",
- " sequential 1 tasks/s 0.587582 \n",
- " 2 tasks/s 0.566866 \n",
- " 4 tasks/s 0.555918 \n",
- " task map 100ms tasks 1 tasks/s 10.339996 \n",
- " 2 tasks/s 10.351813 \n",
- " 4 tasks/s 10.349894 \n",
- " task map 1s tasks 1 tasks/s 4.051148 \n",
- " 2 tasks/s 4.063326 \n",
- " 4 tasks/s 4.064835 \n",
- " task map fast tasks 1 tasks/s 0.166153 \n",
- " 2 tasks/s 0.149536 \n",
- " 4 tasks/s 0.428613 \n",
- " tree reduction 100ms tasks 1 tasks/s 6.585549 \n",
- " 2 tasks/s 7.040561 \n",
- " 4 tasks/s 7.320610 \n",
- " tree reduction fast tasks 1 tasks/s 0.179473 \n",
- " 2 tasks/s 0.280910 \n",
- " 4 tasks/s 0.388071 \n",
- "\n",
- " rate \n",
- "collection name n unit \n",
- "arrays blockwise 100ms tasks 1 MB/s 194.274386 \n",
- " 2 MB/s 412.724335 \n",
- " 4 MB/s 879.661395 \n",
- " create random 1 MB/s 492.852099 \n",
- " 2 MB/s 701.543327 \n",
- " 4 MB/s 1044.580052 \n",
- " elementwise computation 1 MB/s 83.223446 \n",
- " 2 MB/s 145.645420 \n",
- " 4 MB/s 229.083358 \n",
- " nearest neighbor 100ms tasks 1 MB/s 146.296656 \n",
- " 2 MB/s 304.434504 \n",
- " 4 MB/s 596.067880 \n",
- " nearest neighbor fast tasks 1 MB/s 911.479273 \n",
- " 2 MB/s 1117.691475 \n",
- " 4 MB/s 1420.598421 \n",
- " random access 1 bytes/s 358.347875 \n",
- " 2 bytes/s 266.879176 \n",
- " 4 bytes/s 172.956708 \n",
- " rechunk large 1 MB/s 1023.514836 \n",
- " 2 MB/s 751.343367 \n",
- " 4 MB/s 489.614913 \n",
- " rechunk small 1 MB/s 1167.949136 \n",
- " 2 MB/s 1659.630424 \n",
- " 4 MB/s 829.053704 \n",
- " reduction 1 MB/s 850.295775 \n",
- " 2 MB/s 1116.073630 \n",
- " 4 MB/s 1589.940599 \n",
- " reduction along axis 1 MB/s 840.850606 \n",
- " 2 MB/s 1167.376920 \n",
- " 4 MB/s 1616.628542 \n",
- "... ... \n",
- "tasks dynamic tree reduction 100ms tasks 1 tasks/s 24.692124 \n",
- " 2 tasks/s 46.805757 \n",
- " 4 tasks/s 106.844621 \n",
- " dynamic tree reduction fast tasks 1 tasks/s 815.427446 \n",
- " 2 tasks/s 1105.414498 \n",
- " 4 tasks/s 902.061964 \n",
- " nearest neighbor 100ms tasks 1 tasks/s 10.366202 \n",
- " 2 tasks/s 19.288811 \n",
- " 4 tasks/s 37.846582 \n",
- " nearest neighbor fast tasks 1 tasks/s 1167.974119 \n",
- " 2 tasks/s 1057.424268 \n",
- " 4 tasks/s 1542.613544 \n",
- " sequential 1 tasks/s 170.592242 \n",
- " 2 tasks/s 176.410216 \n",
- " 4 tasks/s 179.977319 \n",
- " task map 100ms tasks 1 tasks/s 9.671213 \n",
- " 2 tasks/s 19.320287 \n",
- " 4 tasks/s 38.649526 \n",
- " task map 1s tasks 1 tasks/s 0.987380 \n",
- " 2 tasks/s 1.968832 \n",
- " 4 tasks/s 3.936216 \n",
- " task map fast tasks 1 tasks/s 1244.405187 \n",
- " 2 tasks/s 2675.049581 \n",
- " 4 tasks/s 1906.359809 \n",
- " tree reduction 100ms tasks 1 tasks/s 19.436496 \n",
- " 2 tasks/s 36.360751 \n",
- " 4 tasks/s 69.946284 \n",
- " tree reduction fast tasks 1 tasks/s 733.527981 \n",
- " 2 tasks/s 921.576918 \n",
- " 4 tasks/s 1326.426160 \n",
- "\n",
- "[93 rows x 2 columns]"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Average duration and rate over all runs sharing (collection, name, n, unit).\n",
- "df = ddf.groupby(['collection', 'name', 'n', 'unit']).mean()\n",
- "df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Write the averaged results to a local CSV file.\n",
- "df.to_csv('scaling-data.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "ename": "FileNotFoundError",
- "evalue": "[Errno 2] No such file or directory: 'cloud'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-5-a83d970e64f2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mgcsfs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mgcs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgcsfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGCSFileSystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'cloud'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mgcs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'scaling-data.csv'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'dask-data/scaling-data-1.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/gcsfs/core.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, project, access, token, block_size)\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccess\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maccess\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdirs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 157\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 158\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_singleton\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/gcsfs/core.py\u001b[0m in \u001b[0;36mconnect\u001b[0;34m(self, refresh)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'type'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 209\u001b[0;31m \u001b[0mtoken\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_gtoken\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 210\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccess\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtoken\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 211\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccess\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/home/mrocklin/Software/anaconda/lib/python3.6/site-packages/gcsfs/core.py\u001b[0m in \u001b[0;36m_parse_gtoken\u001b[0;34m(gt)\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_parse_gtoken\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 174\u001b[0;31m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 175\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'cloud'"
- ]
- }
- ],
- "source": [
- "# Upload the CSV written above to Google Cloud Storage.\n",
- "# NOTE(review): the saved output of this cell is a FileNotFoundError -- the installed\n",
- "# gcsfs tried to open token='cloud' as a file. Confirm the intended credentials.\n",
- "import gcsfs\n",
- "gcs = gcsfs.GCSFileSystem(token='cloud')\n",
- "gcs.put('scaling-data.csv', 'dask-data/scaling-data-1.csv')"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement