Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # lecture 6
- import seaborn as sns
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
# Lecture 6: seaborn styling demos on the built-in 'tips' data set.
# Each demo follows the same pattern: clear the current figure, draw,
# then open a fresh figure so the next demo does not draw over it.
flights = sns.load_dataset('flights')  # loaded but not used in this section
tips = sns.load_dataset('tips')

# other styles are whitegrid, dark, darkgrid and ticks
sns.set_style('white')

# plain regression plot
plt.clf()
sns.lmplot(x='total_bill', y='tip', data=tips)
# if you want to remove the axis lines, this can be done
sns.despine(left=True, bottom=True)
plt.figure()

# separate regression per sex, with custom markers and bigger points
plt.clf()
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    hue='sex', markers=['o', '*'], scatter_kws={'s': 100},
)
plt.figure()

# facet into a grid: one column per sex, one row per time
plt.clf()
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    col='sex', row='time', hue="smoker",
)
plt.figure()

plt.clf()
# NOTE(review): lmplot is documented as a figure-level function that creates
# its own figure, so this figsize probably has no effect on the plot below --
# confirm and remove if so.
plt.figure(figsize=(30, 30))
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    col='day', hue='sex', aspect=0.6, height=8,
)
plt.figure()

plt.clf()
# a poster version with bigger text
# sns.set_context('poster', font_scale=1.2)
sns.countplot(x='smoker', data=tips)
plt.figure()

# same regression plot with a named colour palette
plt.clf()
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    hue='sex', palette='seismic',
)
plt.figure()
- # NEW FILE
- import seaborn as sns
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
# Seaborn styling demos on the built-in 'tips' data set.
# Each demo follows the same pattern: clear the current figure, draw,
# then open a fresh figure so the next demo does not draw over it.
flights = sns.load_dataset('flights')  # loaded but not used in this section
tips = sns.load_dataset('tips')

# other styles are whitegrid, dark, darkgrid and ticks
sns.set_style('white')

# plain regression plot
plt.clf()
sns.lmplot(x='total_bill', y='tip', data=tips)
# if you want to remove the axis lines, this can be done
sns.despine(left=True, bottom=True)
plt.figure()

# separate regression per sex, with custom markers and bigger points
plt.clf()
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    hue='sex', markers=['o', '*'], scatter_kws={'s': 100},
)
plt.figure()

# facet into a grid: one column per sex, one row per time
plt.clf()
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    col='sex', row='time', hue="smoker",
)
plt.figure()

plt.clf()
# NOTE(review): lmplot is documented as a figure-level function that creates
# its own figure, so this figsize probably has no effect on the plot below --
# confirm and remove if so.
plt.figure(figsize=(30, 30))
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    col='day', hue='sex', aspect=0.6, height=8,
)
plt.figure()

plt.clf()
# a poster version with bigger text
# sns.set_context('poster', font_scale=1.2)
sns.countplot(x='smoker', data=tips)
plt.figure()

# same regression plot with a named colour palette
plt.clf()
sns.lmplot(
    x='total_bill', y='tip', data=tips,
    hue='sex', palette='seismic',
)
plt.figure()
- # NEW FILE
- import seaborn as sns
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
# Grid-based seaborn plots (PairGrid, FacetGrid, JointGrid) on the built-in
# 'iris' and 'tips' data sets. Each demo clears the current figure, draws,
# then opens a fresh figure for the next one.
iris = sns.load_dataset('iris')
tips = sns.load_dataset('tips')

# ready-made pair plot, coloured by species
plt.clf()
sns.pairplot(iris, hue='species')
plt.figure()

plt.clf()
# let's make a custom plot
g = sns.PairGrid(iris)
# histplot(kde=True) stands in for the deprecated distplot, which drew a
# histogram with a KDE curve and has been removed from recent seaborn.
g.map_diag(sns.histplot, kde=True)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
plt.figure()

# one scatter plot per (smoker, time) combination
plt.clf()
g = sns.FacetGrid(data=tips, col='time', row='smoker')
g.map(plt.scatter, 'total_bill', 'tip')
plt.figure()

plt.clf()
g = sns.JointGrid(x="total_bill", y="tip", data=tips)
# Joint and marginal plots are drawn separately so that kde=True reaches only
# the marginal histograms (JointGrid.plot would forward it to regplot too).
g = g.plot_joint(sns.regplot)
g = g.plot_marginals(sns.histplot, kde=True)
plt.figure()
- # THE HOUSING DATA FILE
- import numpy as np
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from scipy import stats
- # force normal numbers instead of scientific notation
- pd.set_option('display.float_format', lambda x: '%.1f' % x)
def age_group(row):
    """Map a row's construction year to an age bucket from 1 (oldest) to 4.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'yr_built' key.

    Returns:
        4 for years >= 2000, 3 for 1980-1999, 2 for 1960-1979, otherwise 1.
    """
    built = row['yr_built']
    # Scan the lower bounds from newest to oldest; the first one reached wins.
    for lower_bound, bucket in ((2000, 4), (1980, 3), (1960, 2)):
        if built >= lower_bound:
            return bucket
    # Anything older than 1960 (or not comparable, such as NaN) lands here.
    return 1
# helper for collapsing the original grade into a simpler 1-4 scale for pairplots
def regrade(row):
    """Collapse a row's 'grade' value into a coarser four-step grade.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'grade' key.

    Returns:
        1 for grades <= 6, 2 for grade 7, 3 for grade 8, 4 for grades >= 9.
        None when the grade matches none of these (e.g. 6.5 or NaN).
    """
    grade = row['grade']
    # Open-ended ends of the scale first.
    if grade <= 6:
        return 1
    if grade >= 9:
        return 4
    # Exact matches for the two middle grades.
    if grade == 7:
        return 2
    if grade == 8:
        return 3
    return None
# Housing data walkthrough: load the CSV, answer a few lookup questions,
# reduce the frame to a handful of numeric columns, remove outliers, and plot.
houses = pd.read_csv('houses.csv')
print(houses.head())

# this should be the most expensive house
single = houses[houses['id'] == 6762700020]
# getting the price of the single selected property
price = single.iloc[0]['price']
print(price)

# getting the ids of cheapest and most expensive properties
cheapest_id = houses.loc[houses['price'].idxmin()]['id']
expensive_id = houses.loc[houses['price'].idxmax()]['id']
print(houses['bedrooms'].max())

# per-column non-null counts among houses priced at 2 million or more
expensive_houses = houses[houses['price'] >= 2000000].count()
# numeric_only=True: the 'date' column is still present here (presumably
# text -- verify), and pandas >= 2.0 raises on non-numeric columns instead of
# silently skipping them as older versions did.
average_price_per_condition = houses.groupby('condition').mean(numeric_only=True)

# living area in square metres (1 sq ft = 0.09290304 m2), rounded to whole m2
houses['living_m2'] = round(houses['sqft_living15'] * 0.09290304, 0)

# Drop every column not used in the analysis below, in one call.
# (A 'yard_m2' column derived from 'sqft_lot15' used to be created and then
# dropped again right away; that dead step has been removed.)
# The age_group helper is currently not applied:
#houses['age_group'] = houses.apply(age_group, axis=1)
houses = houses.drop(columns=[
    'id', 'date', 'zipcode', 'lat', 'long', 'yr_renovated',
    'waterfront', 'view',
    'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
    'sqft_living15', 'sqft_lot15',
    'yr_built', 'condition', 'floors',
])

# use normal numbers instead of scientific notation
houses['price'] = houses['price'].astype('int64')

grade_counts = houses['grade'].value_counts()
# use new grading, and remove old
houses['new_grade'] = houses.apply(regrade, axis=1)
houses = houses.drop(columns='grade')

# remove decimals from bathrooms
houses['bathrooms'] = round(houses['bathrooms'], 0).astype('int64')

# keep only rows where every column is within 3 standard deviations
houses = houses[(np.abs(stats.zscore(houses)) < 3).all(axis=1)]

# to make plotting work faster, take a random sample
# (capped at the number of remaining rows so sample() cannot raise)
houses = houses.sample(n=min(3000, len(houses)))

summary = houses.describe()
print(summary)
correlations = houses.corr()

# getting probability of each grade we have
grading_probabilities = houses.groupby('new_grade').size().div(len(houses))
# probabilities between two columns
multiple_probabilities = (
    houses.groupby(['new_grade', 'bathrooms'])
    .size()
    .div(len(houses))
    .div(grading_probabilities, axis=0, level=0)
)
print(multiple_probabilities)

# the average house in this dataset: the most common
# (new_grade, bathrooms, bedrooms) combination.
# An earlier run printed (2, 2, 3) = grade 2, 2 bathrooms and 3 bedrooms.
average_house = houses.groupby(['new_grade', 'bathrooms', 'bedrooms']).size().idxmax()
print(average_house)

# Use the combination that was actually found instead of hard-coding (2, 2, 3):
# the sample above is random, so the most common combination may differ.
typical_grade, typical_bathrooms, typical_bedrooms = average_house
typical_houses = houses[
    (houses['new_grade'] == typical_grade)
    & (houses['bathrooms'] == typical_bathrooms)
    & (houses['bedrooms'] == typical_bedrooms)
]
average_price = typical_houses['price'].mean()
average_m2_ = typical_houses['living_m2'].mean()

# let's try out plots
plt.clf()
sns.pairplot(houses, hue='new_grade', palette="hsv")
plt.figure()

plt.clf()
sns.boxplot(x='bathrooms', y='price', data=houses, hue='new_grade')
plt.figure()

plt.clf()
sns.boxplot(x='bedrooms', y='price', data=houses, hue='new_grade')
plt.figure()

plt.clf()
sns.jointplot(x='price', y='living_m2', data=houses)
plt.figure()

# bar height = standard deviation of price within each grade
plt.clf()
sns.barplot(x='new_grade', y='price', data=houses, estimator=np.std)
plt.figure()

plt.clf()
sns.lmplot(x='price', y='living_m2', data=houses, hue="new_grade")
plt.figure()
- # NEW FILE, UNEMPLOYMENT DATA
- import seaborn as sns
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
# US unemployment: reshape a Year x month-columns table and plot it as
# a heatmap and a clustermap.
emp = pd.read_csv('USUnemployment.csv')
flights = sns.load_dataset('flights')  # loaded but not used in this section
correlations = emp.corr()

# Calendar order of the month columns; shared by melt() and the pivot below.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# pandas has a function called melt
# it allows us to transform columns into rows!
# id_vars = columns we want to leave untouched
# value_vars = the columns that are melted into a new variable column
# var_name = the name of the new column holding the melted column names
# value_name = the name of the new value column
emp = emp.melt(
    id_vars=['Year'],
    value_vars=months,
    var_name='Month',
    value_name='Unemployment',
)

emp_pivot = emp.pivot_table(index='Month', columns='Year', values='Unemployment')
# pivot_table sorts the index alphabetically (Apr, Aug, Dec, ...); put the
# rows back into calendar order so the heatmap reads Jan -> Dec.
emp_pivot = emp_pivot.reindex(months)

plt.clf()
sns.heatmap(emp_pivot, cmap='coolwarm')
plt.figure()

# clustermap reorders rows and columns itself according to the clustering
plt.clf()
sns.clustermap(emp_pivot, cmap='coolwarm', standard_scale=0)
plt.figure()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement