Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # lecture 4
- import numpy as np
- import pandas as pd
# Helper to determine if the 'Y' column of a row holds an even number.
def even_number(row):
    """Return True when row['Y'] is even, False otherwise.

    Meant for DataFrame.apply(..., axis=1), where *row* is one row
    (a Series / mapping with a 'Y' entry).
    """
    # The comparison already produces the boolean we want; no if/else.
    return row['Y'] % 2 == 0
# Lambda version of the previous function; same name and behavior so
# existing call sites keep working.  (PEP 8 prefers `def` for named
# functions — noted, but kept as an assignment for the lecture.)
even_number_lambda = lambda x: x['Y'] % 2 == 0

# Random 4x3 DataFrame.  NOTE: randint(1, 10) draws integers 1-9 —
# the upper bound is exclusive.
df = pd.DataFrame(np.random.randint(1, 10, 12).reshape(4, 3),
                  index=['A', 'B', 'C', 'D'],
                  columns=['X', 'Y', 'Z'])

# Flag even values of column Y.
# axis=1 => the function is called once per *row* (each row as a Series).
df['Even'] = df.apply(even_number_lambda, axis=1)
- # NEW FILE - HOUSING DATA
- import numpy as np
- import pandas as pd
- from scipy import stats
# Display floats as plain fixed-point numbers (one decimal place)
# instead of scientific notation.
def _one_decimal(value):
    """Format *value* as a fixed-point string with one decimal."""
    return '%.1f' % value

pd.set_option('display.float_format', _one_decimal)
def age_group(row):
    """Bucket a property into an age group from its 'yr_built' value.

    Returns 4 for 2000 and later, 3 for 1980-1999, 2 for 1960-1979,
    and 1 for anything older.  Meant for DataFrame.apply(..., axis=1).
    """
    yr = row['yr_built']
    # Each earlier branch already excludes larger years, so every elif
    # only needs its lower bound (the originals double-checked both).
    if yr >= 2000:
        return 4
    elif yr >= 1980:
        return 3
    elif yr >= 1960:
        return 2
    else:
        return 1
# --- Housing data exploration ---
houses = pd.read_csv('houses.csv')
print(houses.head())

# This should be the most expensive house (known id from the dataset).
single = houses[houses['id'] == 6762700020]
# Getting the price of the single selected property (first matching row).
price = single.iloc[0]['price']
print(price)

# Getting the ids of cheapest and most expensive properties.
cheapest_id = houses.loc[houses['price'].idxmin()]['id']
expensive_id = houses.loc[houses['price'].idxmax()]['id']

print(houses['bedrooms'].max())

# Per-column non-null counts among houses priced at 2M or more.
expensive_houses = houses[houses['price'] >= 2000000].count()

# NOTE(review): on pandas >= 2.0 mean() raises on non-numeric columns;
# this may need numeric_only=True depending on the installed version.
average_price_per_condition = houses.groupby('condition').mean()

# Drop the columns we will not analyze.
# One drop(columns=[...]) call instead of twelve separate drops.
houses = houses.drop(columns=[
    'id', 'date', 'zipcode', 'lat', 'long', 'yr_renovated',
    'waterfront', 'view', 'sqft_living', 'sqft_lot',
    'sqft_above', 'sqft_basement',
])

# Convert square feet to square meters (1 sqft = 0.09290304 m2).
houses['living_m2'] = round(houses['sqft_living15'] * 0.09290304, 0)
houses['yard_m2'] = round(houses['sqft_lot15'] * 0.09290304, 0)
houses = houses.drop(columns=['sqft_living15', 'sqft_lot15'])

# Bucket the construction year into an age group, then drop the raw year.
houses['age_group'] = houses.apply(age_group, axis=1)
houses = houses.drop(columns=['yr_built'])

# Integer prices print as plain numbers instead of scientific notation.
houses['price'] = houses['price'].astype('int64')

# Remove outliers: keep rows where every column is within 3 std devs.
houses = houses[(np.abs(stats.zscore(houses)) < 3).all(axis=1)]

summary = houses.describe()
print(summary)

correlations = houses.corr()
- # NEW FILE - INCOMPLETE TEST DATA
- import numpy as np
- import pandas as pd
test = pd.read_csv('incomplete_test.csv')

# Handle missing values.
# Rows without a year are removed.
test = test[test['year'].notnull()]

# Rows without a condition get the column mean (average).
# Assign back instead of calling fillna(..., inplace=True) on a column
# selection: inplace on a selected column can act on a copy (the
# chained-assignment pitfall) and is deprecated in modern pandas.
test['condition'] = test['condition'].fillna(test['condition'].mean())

unique_areas = test['area'].unique()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement