Advertisement
tuomasvaltanen

Untitled

Mar 25th, 2021 (edited)
772
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.22 KB | None | 0 0
  1. # lecture 6
  2.  
  3. import seaborn as sns
  4. import pandas as pd
  5. import matplotlib.pyplot as plt
  6. import numpy as np
  7.  
  8. flights = sns.load_dataset('flights')
  9. tips = sns.load_dataset('tips')
  10.  
  11. # other styles are whitegrid, dark, darkgrid and ticks
  12. sns.set_style('white')
  13.  
  14.  
  15. plt.clf()
  16. sns.lmplot(x='total_bill', y='tip', data=tips)
  17.  
  18. # if you want to remove the axis lines, this can be done
  19. sns.despine(left=True, bottom=True)
  20. plt.figure()
  21.  
  22. plt.clf()
  23. sns.lmplot(x='total_bill', y='tip', data=tips,
  24.            hue='sex', markers=['o', '*'], scatter_kws={'s':100})
  25. plt.figure()
  26.  
  27. plt.clf()
  28. sns.lmplot(x='total_bill', y='tip', data=tips, col='sex',
  29.            row='time', hue="smoker")
  30.  
  31. plt.figure()
  32.  
  33. plt.clf()
  34. plt.figure(figsize=(30,30))
  35. sns.lmplot(x='total_bill', y='tip', data=tips,
  36.                        col='day', hue='sex', aspect=0.6, size=8)
  37. plt.figure()
  38.  
  39. plt.clf()
  40. # a poster version with bigger text
  41. # sns.set_context('poster', font_scale=1.2)
  42. sns.countplot(x='smoker', data=tips)
  43. plt.figure()
  44.  
  45.  
  46. plt.clf()
  47.  
  48. sns.lmplot(x='total_bill', y='tip', data=tips,
  49.                        hue='sex', palette='seismic')
  50. plt.figure()
  51.  
  52. # NEW FILE
  53.  
  54. import seaborn as sns
  55. import pandas as pd
  56. import matplotlib.pyplot as plt
  57. import numpy as np
  58.  
  59. flights = sns.load_dataset('flights')
  60. tips = sns.load_dataset('tips')
  61.  
  62. # other styles are whitegrid, dark, darkgrid and ticks
  63. sns.set_style('white')
  64.  
  65.  
  66. plt.clf()
  67. sns.lmplot(x='total_bill', y='tip', data=tips)
  68.  
  69. # if you want to remove the axis lines, this can be done
  70. sns.despine(left=True, bottom=True)
  71. plt.figure()
  72.  
  73. plt.clf()
  74. sns.lmplot(x='total_bill', y='tip', data=tips,
  75.            hue='sex', markers=['o', '*'], scatter_kws={'s':100})
  76. plt.figure()
  77.  
  78. plt.clf()
  79. sns.lmplot(x='total_bill', y='tip', data=tips, col='sex',
  80.            row='time', hue="smoker")
  81.  
  82. plt.figure()
  83.  
  84. plt.clf()
  85. plt.figure(figsize=(30,30))
  86. sns.lmplot(x='total_bill', y='tip', data=tips,
  87.                        col='day', hue='sex', aspect=0.6, size=8)
  88. plt.figure()
  89.  
  90. plt.clf()
  91. # a poster version with bigger text
  92. # sns.set_context('poster', font_scale=1.2)
  93. sns.countplot(x='smoker', data=tips)
  94. plt.figure()
  95.  
  96.  
  97. plt.clf()
  98.  
  99. sns.lmplot(x='total_bill', y='tip', data=tips,
  100.                        hue='sex', palette='seismic')
  101. plt.figure()
  102.  
  103. # NEW FILE
  104.  
  105. import seaborn as sns
  106. import pandas as pd
  107. import matplotlib.pyplot as plt
  108. import numpy as np
  109.  
  110. iris = sns.load_dataset('iris')
  111. tips = sns.load_dataset('tips')
  112.  
  113. plt.clf()
  114. sns.pairplot(iris, hue='species')
  115. plt.figure()
  116.  
  117.  
  118. plt.clf()
  119. # let's make a custom plot
  120. g = sns.PairGrid(iris)
  121. g.map_diag(sns.distplot)
  122. g.map_upper(plt.scatter)
  123. g.map_lower(sns.kdeplot)
  124. plt.figure()
  125.  
  126.  
  127. plt.clf()
  128. g = sns.FacetGrid(data=tips, col='time', row='smoker')
  129. g.map(plt.scatter, 'total_bill', 'tip')
  130.  
  131. plt.figure()
  132.  
  133.  
  134. plt.clf()
  135. g = sns.JointGrid(x="total_bill",
  136.                   y="tip", data=tips)
  137. g = g.plot(sns.regplot, sns.distplot)
  138.  
  139. plt.figure()
  140.  
  141. # THE HOUSING DATA FILE
  142.  
  143. import numpy as np
  144. import pandas as pd
  145. import seaborn as sns
  146. import matplotlib.pyplot as plt
  147. from scipy import stats
  148.  
  149. # force normal numbers instead of scientific notation
  150. pd.set_option('display.float_format', lambda x: '%.1f' % x)
  151.  
  152.  
  153. def age_group(row):
  154.     # yr_built
  155.     if row['yr_built'] >= 2000:
  156.         return 4
  157.     elif 1980 <= row['yr_built'] < 2000:
  158.         return 3
  159.     elif 1960 <= row['yr_built'] < 1980:
  160.         return 2
  161.     else:
  162.         return 1
  163.    
  164.  
  165. # helper method for creating a more simple grading system for pairplots
  166. def regrade(row):
  167.     if row['grade'] <= 6:
  168.         return 1
  169.     elif row['grade'] == 7:
  170.         return 2
  171.     elif row['grade'] == 8:
  172.         return 3
  173.     elif row['grade'] >= 9:
  174.         return 4
  175.  
  176. houses = pd.read_csv('houses.csv')
  177.  
  178. print(houses.head())
  179.  
  180. # this should be the most expensive house
  181. single = houses[houses['id'] == 6762700020]
  182.  
  183. # getting the price of the single selected property
  184. price = single.iloc[0]['price']
  185. print(price)
  186.  
  187. # getting the ids of cheapest and most expensive properties
  188. cheapest_id = houses.loc[houses['price'].idxmin()]['id']
  189. expensive_id = houses.loc[houses['price'].idxmax()]['id']
  190.  
  191. print(houses['bedrooms'].max())
  192.  
  193. expensive_houses = houses[houses['price'] >= 2000000].count()
  194.  
  195. average_price_per_condition = houses.groupby('condition').mean()
  196.  
  197. # axis = 1 => drop a column, this time: id
  198. houses = houses.drop('id', axis=1)
  199. houses = houses.drop('date', axis=1)
  200. houses = houses.drop('zipcode', axis=1)
  201. houses = houses.drop('lat', axis=1)
  202. houses = houses.drop('long', axis=1)
  203. houses = houses.drop('yr_renovated', axis=1)
  204. houses = houses.drop('waterfront', axis=1)
  205. houses = houses.drop('view', axis=1)
  206. houses = houses.drop('sqft_living', axis=1)
  207. houses = houses.drop('sqft_lot', axis=1)
  208. houses = houses.drop('sqft_above', axis=1)
  209. houses = houses.drop('sqft_basement', axis=1)
  210.  
  211. houses['living_m2'] = round(houses['sqft_living15'] * 0.09290304, 0)
  212. houses['yard_m2'] = round(houses['sqft_lot15'] * 0.09290304, 0)
  213.  
  214. houses = houses.drop('sqft_living15', axis=1)
  215. houses = houses.drop('sqft_lot15', axis=1)
  216.  
  217. #houses['age_group'] = houses.apply(age_group, axis=1)
  218. houses = houses.drop('yr_built', axis=1)
  219.  
  220. houses = houses.drop('condition', axis=1)
  221. houses = houses.drop('floors', axis=1)
  222. houses = houses.drop('yard_m2', axis=1)
  223.  
  224. # use normal numbers instead of scientific notation
  225. houses['price'] = houses['price'].astype('int64')
  226. grade_counts = houses['grade'].value_counts()
  227.  
  228. # use new grading, and remove old
  229. houses['new_grade'] = houses.apply(regrade, axis=1)
  230.  
  231. houses = houses.drop('grade', axis=1)
  232.  
  233. # remove decimals from bathrooms
  234. houses['bathrooms'] = round(houses['bathrooms'], 0).astype('int64')
  235.  
  236. houses = houses[(np.abs(stats.zscore(houses)) < 3).all(axis=1)]
  237.  
  238. # to make plotting work faster, take a random sample
  239. houses = houses.sample(n=3000)
  240.  
  241. summary = houses.describe()
  242. print(summary)
  243. correlations = houses.corr()
  244.  
  245. # getting probability of each grade we have
  246. grading_probabilities = houses.groupby('new_grade').size().div(len(houses))
  247.  
  248. # probabilities between two columns
  249. multiple_probabilities = houses.groupby(['new_grade', 'bathrooms']).size().div(len(houses)).div(grading_probabilities, axis=0, level=0)
  250. print(multiple_probabilities)
  251.  
  252. # the average house in this dataset
  253. # this prints outs = (2, 2, 3) = typical property has grade 2, 2 bathrooms and 3 bedrooms
  254. average_house = houses.groupby(['new_grade', 'bathrooms', 'bedrooms']).size().idxmax()
  255. print(average_house)
  256.  
  257. average_price = houses[(houses['new_grade'] == 2) & (houses['bathrooms'] == 2) & (houses['bedrooms'] == 3)]['price'].mean()
  258. average_m2_= houses[(houses['new_grade'] == 2) & (houses['bathrooms'] == 2) & (houses['bedrooms'] == 3)]['living_m2'].mean()
  259.  
  260. # let's try out plots
  261. plt.clf()
  262. sns.pairplot(houses, hue='new_grade', palette="hsv")
  263. plt.figure()
  264.  
  265. plt.clf()
  266. sns.boxplot(x='bathrooms', y='price', data=houses, hue='new_grade')
  267. plt.figure()
  268.  
  269. plt.clf()
  270. sns.boxplot(x='bedrooms', y='price', data=houses, hue='new_grade')
  271. plt.figure()
  272.  
  273. plt.clf()
  274. sns.jointplot(x='price', y='living_m2', data=houses)
  275. plt.figure()
  276.  
  277. plt.clf()
  278. sns.barplot(x='new_grade', y='price', data=houses, estimator=np.std)
  279. plt.figure()
  280.  
  281. plt.clf()
  282. sns.lmplot(x='price', y='living_m2', data=houses, hue="new_grade")
  283. plt.figure()
  284.  
  285. # NEW FILE, UNEMPLOYMENT DATA
  286.  
  287. import seaborn as sns
  288. import pandas as pd
  289. import matplotlib.pyplot as plt
  290. import numpy as np
  291.  
  292. emp = pd.read_csv('USUnemployment.csv')
  293. flights = sns.load_dataset('flights')
  294.  
  295. correlations = emp.corr()
  296.  
  297. # pandas has a function called melt
  298. # it allows us to transform columns into rows!
  299. # id_vars = columns of data we want to be untouched
  300. # value_vars = which columns are melt into a new variable column
  301. # var_name = what are the melt columns new column name
  302. # value_name = the name of the new value column
  303. emp = emp.melt(
  304.     id_vars = ['Year'],
  305.     value_vars = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
  306.     var_name = 'Month',
  307.     value_name = 'Unemployment'
  308.     )
  309.  
  310.  
  311. emp_pivot = emp.pivot_table(index='Month', columns='Year', values='Unemployment')
  312.  
  313. plt.clf()
  314. sns.heatmap(emp_pivot, cmap='coolwarm')
  315. plt.figure()
  316.  
  317. plt.clf()
  318. sns.clustermap(emp_pivot, cmap='coolwarm', standard_scale=0)
  319. plt.figure()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement