Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import random\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Construct the sample data (1.000.000 rows)\n",
- "data = []\n",
- "for i in range(1000000):\n",
- " row = []\n",
- " row.append(random.choice(['A1', 'A2', 'A3', 'A4', 'A5']))\n",
- " row.append(random.choice(['B1', 'B2', 'B3', 'B4', 'B5']))\n",
- " row.append(random.choice(['C1', 'C2', 'C3', 'C4', 'C5']))\n",
- " row.append(random.choice(['D1', 'D2', 'D3', 'D4', 'D5']))\n",
- " row.append(random.randint(1000, 100000))\n",
- " data.append(row)\n",
- "df = pd.DataFrame(data, columns=['A','B','C','D', 'val']) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>A</th>\n",
- " <th>B</th>\n",
- " <th>C</th>\n",
- " <th>D</th>\n",
- " <th>val</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>812419</th>\n",
- " <td>A1</td>\n",
- " <td>B1</td>\n",
- " <td>C2</td>\n",
- " <td>D4</td>\n",
- " <td>31882</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>964167</th>\n",
- " <td>A5</td>\n",
- " <td>B3</td>\n",
- " <td>C2</td>\n",
- " <td>D2</td>\n",
- " <td>65951</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>505576</th>\n",
- " <td>A1</td>\n",
- " <td>B2</td>\n",
- " <td>C4</td>\n",
- " <td>D3</td>\n",
- " <td>29178</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>729472</th>\n",
- " <td>A4</td>\n",
- " <td>B5</td>\n",
- " <td>C2</td>\n",
- " <td>D4</td>\n",
- " <td>46592</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>43541</th>\n",
- " <td>A4</td>\n",
- " <td>B3</td>\n",
- " <td>C3</td>\n",
- " <td>D2</td>\n",
- " <td>92247</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " A B C D val\n",
- "812419 A1 B1 C2 D4 31882\n",
- "964167 A5 B3 C2 D2 65951\n",
- "505576 A1 B2 C4 D3 29178\n",
- "729472 A4 B5 C2 D4 46592\n",
- "43541 A4 B3 C3 D2 92247"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Sample output\n",
- "df.sample(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(1000000, 5)"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "<class 'pandas.core.frame.DataFrame'>\n",
- "RangeIndex: 1000000 entries, 0 to 999999\n",
- "Data columns (total 5 columns):\n",
- "A 1000000 non-null object\n",
- "B 1000000 non-null object\n",
- "C 1000000 non-null object\n",
- "D 1000000 non-null object\n",
- "val 1000000 non-null int64\n",
- "dtypes: int64(1), object(4)\n",
- "memory usage: 232.7 MB\n"
- ]
- }
- ],
- "source": [
- "# Memory usage with deep introspection\n",
- "df.info(memory_usage='deep')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "10 loops, best of 3: 57 ms per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "df[(df['A']== 'A1')]['val'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "10 loops, best of 3: 80.9 ms per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "df[(df['A']== 'A2') & (df['B'] == 'BA2')]['val'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "10 loops, best of 3: 119 ms per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "df[(df['A']== 'A4') & (df['C'] == 'CA4') & (df['D'] == 'D4')]['val'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "for col in df.columns.tolist()[:-1]:\n",
- " df[col] = df[col].astype('category')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "<class 'pandas.core.frame.DataFrame'>\n",
- "RangeIndex: 1000000 entries, 0 to 999999\n",
- "Data columns (total 5 columns):\n",
- "A 1000000 non-null category\n",
- "B 1000000 non-null category\n",
- "C 1000000 non-null category\n",
- "D 1000000 non-null category\n",
- "val 1000000 non-null int64\n",
- "dtypes: category(4), int64(1)\n",
- "memory usage: 11.4 MB\n"
- ]
- }
- ],
- "source": [
- "df.info(memory_usage='deep')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "100 loops, best of 3: 9.36 ms per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "df[(df['A']== 'A1')]['val'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "100 loops, best of 3: 6.79 ms per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "df[(df['A']== 'A2') & (df['B'] == 'BA2')]['val'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "100 loops, best of 3: 9.37 ms per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "df[(df['A']== 'A4') & (df['C'] == 'CA4') & (df['D'] == 'D4')]['val'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement