# Untitled

Mar 16th, 2021 (edited)
939
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. # lecture 4
2.
3. import numpy as np
4. import pandas as pd
5.
def even_number(row):
    """Return True when the value stored under 'Y' in ``row`` is even.

    Args:
        row: Mapping-like record (for example a DataFrame row) that holds
            an integer under the key ``'Y'``.

    Returns:
        bool: True for an even 'Y' value, False for an odd one.
    """
    # bool() keeps the plain Python True/False the if/else version returned,
    # even when the comparison itself yields a NumPy boolean.
    return bool(row['Y'] % 2 == 0)
13.
14.
# Lambda version of even_number: the comparison result is returned directly,
# so it is True when the 'Y' value of the given row is even.
even_number_lambda = lambda x: x['Y'] % 2 == 0
17.
18.
# Random 4x3 integer matrix with row labels A-D and column labels X-Z.
# randint's upper bound is exclusive, so the values fall in 1..9.
df = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 3)),
    index=['A', 'B', 'C', 'D'],
    columns=['X', 'Y', 'Z'],
)

# Use our own function to flag even numbers in column 'Y'.
# axis=1 => the function is called once per row and receives that row
# (indexed by column name), so x['Y'] is the row's 'Y' value.
df['Even'] = df.apply(even_number_lambda, axis=1)
26.
27.
28. # NEW FILE - HOUSING DATA
29.
30. import numpy as np
31. import pandas as pd
32.
33. from scipy import stats
34.
# Force plain fixed-point output (one decimal place) instead of scientific
# notation whenever pandas displays a float.
pd.set_option('display.float_format', '{:.1f}'.format)
37.
38.
def age_group(row, column='yr_built'):
    """Map a construction year to an ordinal age group (1 = oldest, 4 = newest).

    Args:
        row: Mapping-like record (for example a DataFrame row) holding the
            construction year under ``column``.
        column: Key under which the construction year is stored. Defaults to
            ``'yr_built'``, so ``df.apply(age_group, axis=1)`` keeps working.

    Returns:
        int: 4 for years >= 2000, 3 for 1980-1999, 2 for 1960-1979,
        otherwise 1.
    """
    year = row[column]
    # Thresholds are checked from newest to oldest, so each test only needs
    # a lower bound.
    if year >= 2000:
        return 4
    if year >= 1980:
        return 3
    if year >= 1960:
        return 2
    # Anything older lands here, as does a value for which every comparison
    # is False (e.g. NaN).
    return 1
49.
51.
53.
# NOTE(review): `houses` must already be a DataFrame here. The statement that
# creates it is not part of this paste (the original line numbering skips
# 50 and 52) -- restore it before running this section.

# this should be the most expensive house
single = houses[houses['id'] == 6762700020]

# price of the single selected property
price = single.iloc[0]['price']
print(price)

# ids of the cheapest and the most expensive property; a single .loc call
# with (row label, column) avoids building an intermediate row first
cheapest_id = houses.loc[houses['price'].idxmin(), 'id']
expensive_id = houses.loc[houses['price'].idxmax(), 'id']

print(houses['bedrooms'].max())

# per-column count of non-null values among houses priced at 2,000,000 or more
expensive_houses = houses[houses['price'] >= 2000000].count()

# Column means per 'condition' value. numeric_only=True because columns such
# as 'date' have not been dropped yet; presumably 'date' holds strings, which
# would make mean() raise on pandas >= 2.0 -- TODO confirm the dtype.
average_price_per_condition = houses.groupby('condition').mean(numeric_only=True)

# drop the columns that are not used in the analysis below
houses = houses.drop(columns=[
    'id',
    'date',
    'zipcode',
    'lat',
    'long',
    'yr_renovated',
    'waterfront',
    'view',
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
])

# convert the *15 square-foot columns to whole square metres
SQFT_TO_M2 = 0.09290304
houses['living_m2'] = (houses['sqft_living15'] * SQFT_TO_M2).round(0)
houses['yard_m2'] = (houses['sqft_lot15'] * SQFT_TO_M2).round(0)

houses = houses.drop(columns=['sqft_living15', 'sqft_lot15'])

# replace the construction year by its ordinal age group (see age_group)
houses['age_group'] = houses.apply(age_group, axis=1)
houses = houses.drop(columns=['yr_built'])

# use normal numbers instead of scientific notation
houses['price'] = houses['price'].astype('int64')

# outlier removal: keep only the rows in which every column's z-score
# (computed per column) has an absolute value below 3
houses = houses[(np.abs(stats.zscore(houses)) < 3).all(axis=1)]

summary = houses.describe()
print(summary)

correlations = houses.corr()
104.
105. # NEW FILE - INCOMPLETE TEST DATA
106.
107. import numpy as np
108. import pandas as pd
109.