Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # lecture 4
- import numpy as np
- import pandas as pd
# Helper to determine if the 'Y' column of a row holds an even number.
def even_number(row):
    """Return True when row['Y'] is even, False otherwise.

    Meant for DataFrame.apply(..., axis=1), where *row* is one row
    (a Series / mapping with a 'Y' entry).
    """
    # The comparison already produces the boolean we want; no if/else.
    return row['Y'] % 2 == 0
# Lambda version of the previous function; same name and behavior so
# existing call sites keep working.  (PEP 8 prefers `def` for named
# functions — noted, but kept as an assignment for the lecture.)
even_number_lambda = lambda x: x['Y'] % 2 == 0

# Random 4x3 DataFrame.  NOTE: randint(1, 10) draws integers 1-9 —
# the upper bound is exclusive.
df = pd.DataFrame(np.random.randint(1, 10, 12).reshape(4, 3),
                  index=['A', 'B', 'C', 'D'],
                  columns=['X', 'Y', 'Z'])

# Flag even values of column Y.
# axis=1 => the function is called once per *row* (each row as a Series).
df['Even'] = df.apply(even_number_lambda, axis=1)
- # NEW FILE - HOUSING DATA
- import numpy as np
- import pandas as pd
- from scipy import stats
# Display floats as plain fixed-point numbers (one decimal place)
# instead of scientific notation.
def _one_decimal(value):
    """Format *value* as a fixed-point string with one decimal."""
    return '%.1f' % value

pd.set_option('display.float_format', _one_decimal)
def age_group(row):
    """Bucket a property into an age group from its 'yr_built' value.

    Returns 4 for 2000 and later, 3 for 1980-1999, 2 for 1960-1979,
    and 1 for anything older.  Meant for DataFrame.apply(..., axis=1).
    """
    yr = row['yr_built']
    # Each earlier branch already excludes larger years, so every elif
    # only needs its lower bound (the originals double-checked both).
    if yr >= 2000:
        return 4
    elif yr >= 1980:
        return 3
    elif yr >= 1960:
        return 2
    else:
        return 1
# --- Housing data exploration ---
houses = pd.read_csv('houses.csv')
print(houses.head())

# This should be the most expensive house (known id from the dataset).
single = houses[houses['id'] == 6762700020]
# Getting the price of the single selected property (first matching row).
price = single.iloc[0]['price']
print(price)

# Getting the ids of cheapest and most expensive properties.
cheapest_id = houses.loc[houses['price'].idxmin()]['id']
expensive_id = houses.loc[houses['price'].idxmax()]['id']

print(houses['bedrooms'].max())

# Per-column non-null counts among houses priced at 2M or more.
expensive_houses = houses[houses['price'] >= 2000000].count()

# NOTE(review): on pandas >= 2.0 mean() raises on non-numeric columns;
# this may need numeric_only=True depending on the installed version.
average_price_per_condition = houses.groupby('condition').mean()

# Drop the columns we will not analyze.
# One drop(columns=[...]) call instead of twelve separate drops.
houses = houses.drop(columns=[
    'id', 'date', 'zipcode', 'lat', 'long', 'yr_renovated',
    'waterfront', 'view', 'sqft_living', 'sqft_lot',
    'sqft_above', 'sqft_basement',
])

# Convert square feet to square meters (1 sqft = 0.09290304 m2).
houses['living_m2'] = round(houses['sqft_living15'] * 0.09290304, 0)
houses['yard_m2'] = round(houses['sqft_lot15'] * 0.09290304, 0)
houses = houses.drop(columns=['sqft_living15', 'sqft_lot15'])

# Bucket the construction year into an age group, then drop the raw year.
houses['age_group'] = houses.apply(age_group, axis=1)
houses = houses.drop(columns=['yr_built'])

# Integer prices print as plain numbers instead of scientific notation.
houses['price'] = houses['price'].astype('int64')

# Remove outliers: keep rows where every column is within 3 std devs.
houses = houses[(np.abs(stats.zscore(houses)) < 3).all(axis=1)]

summary = houses.describe()
print(summary)

correlations = houses.corr()
- # NEW FILE - INCOMPLETE TEST DATA
- import numpy as np
- import pandas as pd
test = pd.read_csv('incomplete_test.csv')

# Handle missing values.
# Rows without a year are removed.
test = test[test['year'].notnull()]

# Rows without a condition get the column mean (average).
# Assign back instead of calling fillna(..., inplace=True) on a column
# selection: inplace on a selected column can act on a copy (the
# chained-assignment pitfall) and is deprecated in modern pandas.
test['condition'] = test['condition'].fillna(test['condition'].mean())

unique_areas = test['area'].unique()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement