Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Drawing Conclusions Using Groupby"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>fixed_acidity</th>\n",
- " <th>volatile_acidity</th>\n",
- " <th>citric_acid</th>\n",
- " <th>residual_sugar</th>\n",
- " <th>chlorides</th>\n",
- " <th>free_sulfur_dioxide</th>\n",
- " <th>total_sulfur_dioxide</th>\n",
- " <th>density</th>\n",
- " <th>pH</th>\n",
- " <th>sulphates</th>\n",
- " <th>alcohol</th>\n",
- " <th>quality</th>\n",
- " <th>color</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>7.4</td>\n",
- " <td>0.70</td>\n",
- " <td>0.00</td>\n",
- " <td>1.9</td>\n",
- " <td>0.076</td>\n",
- " <td>11.0</td>\n",
- " <td>34.0</td>\n",
- " <td>0.9978</td>\n",
- " <td>3.51</td>\n",
- " <td>0.56</td>\n",
- " <td>9.4</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>7.8</td>\n",
- " <td>0.88</td>\n",
- " <td>0.00</td>\n",
- " <td>2.6</td>\n",
- " <td>0.098</td>\n",
- " <td>25.0</td>\n",
- " <td>67.0</td>\n",
- " <td>0.9968</td>\n",
- " <td>3.20</td>\n",
- " <td>0.68</td>\n",
- " <td>9.8</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>7.8</td>\n",
- " <td>0.76</td>\n",
- " <td>0.04</td>\n",
- " <td>2.3</td>\n",
- " <td>0.092</td>\n",
- " <td>15.0</td>\n",
- " <td>54.0</td>\n",
- " <td>0.9970</td>\n",
- " <td>3.26</td>\n",
- " <td>0.65</td>\n",
- " <td>9.8</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>11.2</td>\n",
- " <td>0.28</td>\n",
- " <td>0.56</td>\n",
- " <td>1.9</td>\n",
- " <td>0.075</td>\n",
- " <td>17.0</td>\n",
- " <td>60.0</td>\n",
- " <td>0.9980</td>\n",
- " <td>3.16</td>\n",
- " <td>0.58</td>\n",
- " <td>9.8</td>\n",
- " <td>6</td>\n",
- " <td>red</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>7.4</td>\n",
- " <td>0.70</td>\n",
- " <td>0.00</td>\n",
- " <td>1.9</td>\n",
- " <td>0.076</td>\n",
- " <td>11.0</td>\n",
- " <td>34.0</td>\n",
- " <td>0.9978</td>\n",
- " <td>3.51</td>\n",
- " <td>0.56</td>\n",
- " <td>9.4</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " fixed_acidity volatile_acidity citric_acid residual_sugar chlorides \\\n",
- "0 7.4 0.70 0.00 1.9 0.076 \n",
- "1 7.8 0.88 0.00 2.6 0.098 \n",
- "2 7.8 0.76 0.04 2.3 0.092 \n",
- "3 11.2 0.28 0.56 1.9 0.075 \n",
- "4 7.4 0.70 0.00 1.9 0.076 \n",
- "\n",
- " free_sulfur_dioxide total_sulfur_dioxide density pH sulphates \\\n",
- "0 11.0 34.0 0.9978 3.51 0.56 \n",
- "1 25.0 67.0 0.9968 3.20 0.68 \n",
- "2 15.0 54.0 0.9970 3.26 0.65 \n",
- "3 17.0 60.0 0.9980 3.16 0.58 \n",
- "4 11.0 34.0 0.9978 3.51 0.56 \n",
- "\n",
- " alcohol quality color \n",
- "0 9.4 5 red \n",
- "1 9.8 5 red \n",
- "2 9.8 5 red \n",
- "3 9.8 6 red \n",
- "4 9.4 5 red "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Load `winequality_edited.csv`\n",
- "import pandas as pd\n",
- "df = pd.read_csv('winequality.csv')\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": true
- },
- "source": [
- "### Is a certain type of wine associated with higher quality?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "color\n",
- "red 5.636023\n",
- "white 5.877909\n",
- "Name: quality, dtype: float64"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Find the mean quality of each wine type (red and white) with groupby\n",
- "df.groupby(['color'])['quality'].mean()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### What level of acidity receives the highest average rating?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>fixed_acidity</th>\n",
- " <th>volatile_acidity</th>\n",
- " <th>citric_acid</th>\n",
- " <th>residual_sugar</th>\n",
- " <th>chlorides</th>\n",
- " <th>free_sulfur_dioxide</th>\n",
- " <th>total_sulfur_dioxide</th>\n",
- " <th>density</th>\n",
- " <th>pH</th>\n",
- " <th>sulphates</th>\n",
- " <th>alcohol</th>\n",
- " <th>quality</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>count</th>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " <td>6497.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>mean</th>\n",
- " <td>7.215307</td>\n",
- " <td>0.339666</td>\n",
- " <td>0.318633</td>\n",
- " <td>5.443235</td>\n",
- " <td>0.056034</td>\n",
- " <td>30.525319</td>\n",
- " <td>115.744574</td>\n",
- " <td>0.994697</td>\n",
- " <td>3.218501</td>\n",
- " <td>0.531268</td>\n",
- " <td>10.491801</td>\n",
- " <td>5.818378</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>std</th>\n",
- " <td>1.296434</td>\n",
- " <td>0.164636</td>\n",
- " <td>0.145318</td>\n",
- " <td>4.757804</td>\n",
- " <td>0.035034</td>\n",
- " <td>17.749400</td>\n",
- " <td>56.521855</td>\n",
- " <td>0.002999</td>\n",
- " <td>0.160787</td>\n",
- " <td>0.148806</td>\n",
- " <td>1.192712</td>\n",
- " <td>0.873255</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>min</th>\n",
- " <td>3.800000</td>\n",
- " <td>0.080000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.600000</td>\n",
- " <td>0.009000</td>\n",
- " <td>1.000000</td>\n",
- " <td>6.000000</td>\n",
- " <td>0.987110</td>\n",
- " <td>2.720000</td>\n",
- " <td>0.220000</td>\n",
- " <td>8.000000</td>\n",
- " <td>3.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>25%</th>\n",
- " <td>6.400000</td>\n",
- " <td>0.230000</td>\n",
- " <td>0.250000</td>\n",
- " <td>1.800000</td>\n",
- " <td>0.038000</td>\n",
- " <td>17.000000</td>\n",
- " <td>77.000000</td>\n",
- " <td>0.992340</td>\n",
- " <td>3.110000</td>\n",
- " <td>0.430000</td>\n",
- " <td>9.500000</td>\n",
- " <td>5.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>50%</th>\n",
- " <td>7.000000</td>\n",
- " <td>0.290000</td>\n",
- " <td>0.310000</td>\n",
- " <td>3.000000</td>\n",
- " <td>0.047000</td>\n",
- " <td>29.000000</td>\n",
- " <td>118.000000</td>\n",
- " <td>0.994890</td>\n",
- " <td>3.210000</td>\n",
- " <td>0.510000</td>\n",
- " <td>10.300000</td>\n",
- " <td>6.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>75%</th>\n",
- " <td>7.700000</td>\n",
- " <td>0.400000</td>\n",
- " <td>0.390000</td>\n",
- " <td>8.100000</td>\n",
- " <td>0.065000</td>\n",
- " <td>41.000000</td>\n",
- " <td>156.000000</td>\n",
- " <td>0.996990</td>\n",
- " <td>3.320000</td>\n",
- " <td>0.600000</td>\n",
- " <td>11.300000</td>\n",
- " <td>6.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>max</th>\n",
- " <td>15.900000</td>\n",
- " <td>1.580000</td>\n",
- " <td>1.660000</td>\n",
- " <td>65.800000</td>\n",
- " <td>0.611000</td>\n",
- " <td>289.000000</td>\n",
- " <td>440.000000</td>\n",
- " <td>1.038980</td>\n",
- " <td>4.010000</td>\n",
- " <td>2.000000</td>\n",
- " <td>14.900000</td>\n",
- " <td>9.000000</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " fixed_acidity volatile_acidity citric_acid residual_sugar \\\n",
- "count 6497.000000 6497.000000 6497.000000 6497.000000 \n",
- "mean 7.215307 0.339666 0.318633 5.443235 \n",
- "std 1.296434 0.164636 0.145318 4.757804 \n",
- "min 3.800000 0.080000 0.000000 0.600000 \n",
- "25% 6.400000 0.230000 0.250000 1.800000 \n",
- "50% 7.000000 0.290000 0.310000 3.000000 \n",
- "75% 7.700000 0.400000 0.390000 8.100000 \n",
- "max 15.900000 1.580000 1.660000 65.800000 \n",
- "\n",
- " chlorides free_sulfur_dioxide total_sulfur_dioxide density \\\n",
- "count 6497.000000 6497.000000 6497.000000 6497.000000 \n",
- "mean 0.056034 30.525319 115.744574 0.994697 \n",
- "std 0.035034 17.749400 56.521855 0.002999 \n",
- "min 0.009000 1.000000 6.000000 0.987110 \n",
- "25% 0.038000 17.000000 77.000000 0.992340 \n",
- "50% 0.047000 29.000000 118.000000 0.994890 \n",
- "75% 0.065000 41.000000 156.000000 0.996990 \n",
- "max 0.611000 289.000000 440.000000 1.038980 \n",
- "\n",
- " pH sulphates alcohol quality \n",
- "count 6497.000000 6497.000000 6497.000000 6497.000000 \n",
- "mean 3.218501 0.531268 10.491801 5.818378 \n",
- "std 0.160787 0.148806 1.192712 0.873255 \n",
- "min 2.720000 0.220000 8.000000 3.000000 \n",
- "25% 3.110000 0.430000 9.500000 5.000000 \n",
- "50% 3.210000 0.510000 10.300000 6.000000 \n",
- "75% 3.320000 0.600000 11.300000 6.000000 \n",
- "max 4.010000 2.000000 14.900000 9.000000 "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# View the min, 25%, 50%, 75%, max pH values with Pandas describe\n",
- "df.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Bin edges that will be used to \"cut\" the data into groups\n",
- "bin_edges = [2.72, 3.11 ,3.21 ,3.32, 4.01 ] # Fill in this list with five values you just found"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Labels for the four acidity level groups\n",
- "bin_names = ['low' ,'medium' ,'high' ,'very high' ] # Name each acidity level category"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>fixed_acidity</th>\n",
- " <th>volatile_acidity</th>\n",
- " <th>citric_acid</th>\n",
- " <th>residual_sugar</th>\n",
- " <th>chlorides</th>\n",
- " <th>free_sulfur_dioxide</th>\n",
- " <th>total_sulfur_dioxide</th>\n",
- " <th>density</th>\n",
- " <th>pH</th>\n",
- " <th>sulphates</th>\n",
- " <th>alcohol</th>\n",
- " <th>quality</th>\n",
- " <th>color</th>\n",
- " <th>acidity_levels</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>7.4</td>\n",
- " <td>0.70</td>\n",
- " <td>0.00</td>\n",
- " <td>1.9</td>\n",
- " <td>0.076</td>\n",
- " <td>11.0</td>\n",
- " <td>34.0</td>\n",
- " <td>0.9978</td>\n",
- " <td>3.51</td>\n",
- " <td>0.56</td>\n",
- " <td>9.4</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " <td>very high</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>7.8</td>\n",
- " <td>0.88</td>\n",
- " <td>0.00</td>\n",
- " <td>2.6</td>\n",
- " <td>0.098</td>\n",
- " <td>25.0</td>\n",
- " <td>67.0</td>\n",
- " <td>0.9968</td>\n",
- " <td>3.20</td>\n",
- " <td>0.68</td>\n",
- " <td>9.8</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " <td>medium</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>7.8</td>\n",
- " <td>0.76</td>\n",
- " <td>0.04</td>\n",
- " <td>2.3</td>\n",
- " <td>0.092</td>\n",
- " <td>15.0</td>\n",
- " <td>54.0</td>\n",
- " <td>0.9970</td>\n",
- " <td>3.26</td>\n",
- " <td>0.65</td>\n",
- " <td>9.8</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " <td>high</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>11.2</td>\n",
- " <td>0.28</td>\n",
- " <td>0.56</td>\n",
- " <td>1.9</td>\n",
- " <td>0.075</td>\n",
- " <td>17.0</td>\n",
- " <td>60.0</td>\n",
- " <td>0.9980</td>\n",
- " <td>3.16</td>\n",
- " <td>0.58</td>\n",
- " <td>9.8</td>\n",
- " <td>6</td>\n",
- " <td>red</td>\n",
- " <td>medium</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>7.4</td>\n",
- " <td>0.70</td>\n",
- " <td>0.00</td>\n",
- " <td>1.9</td>\n",
- " <td>0.076</td>\n",
- " <td>11.0</td>\n",
- " <td>34.0</td>\n",
- " <td>0.9978</td>\n",
- " <td>3.51</td>\n",
- " <td>0.56</td>\n",
- " <td>9.4</td>\n",
- " <td>5</td>\n",
- " <td>red</td>\n",
- " <td>very high</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " fixed_acidity volatile_acidity citric_acid residual_sugar chlorides \\\n",
- "0 7.4 0.70 0.00 1.9 0.076 \n",
- "1 7.8 0.88 0.00 2.6 0.098 \n",
- "2 7.8 0.76 0.04 2.3 0.092 \n",
- "3 11.2 0.28 0.56 1.9 0.075 \n",
- "4 7.4 0.70 0.00 1.9 0.076 \n",
- "\n",
- " free_sulfur_dioxide total_sulfur_dioxide density pH sulphates \\\n",
- "0 11.0 34.0 0.9978 3.51 0.56 \n",
- "1 25.0 67.0 0.9968 3.20 0.68 \n",
- "2 15.0 54.0 0.9970 3.26 0.65 \n",
- "3 17.0 60.0 0.9980 3.16 0.58 \n",
- "4 11.0 34.0 0.9978 3.51 0.56 \n",
- "\n",
- " alcohol quality color acidity_levels \n",
- "0 9.4 5 red very high \n",
- "1 9.8 5 red medium \n",
- "2 9.8 5 red high \n",
- "3 9.8 6 red medium \n",
- "4 9.4 5 red very high "
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Creates acidity_levels column\n",
- "df['acidity_levels'] = pd.cut(df['pH'], bin_edges, labels=bin_names)\n",
- "\n",
- "# Checks for successful creation of this column\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "acidity_levels\n",
- "low 5.783343\n",
- "medium 5.784540\n",
- "high 5.850832\n",
- "very high 5.859593\n",
- "Name: quality, dtype: float64"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Find the mean quality of each acidity level with groupby\n",
- "df.groupby(['acidity_levels'])['quality'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Save changes for the next section\n",
- "df.to_csv('winequality_edited.csv', index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment