Apr 15th, 2021 (edited)
1. # workshop 2, 15.4.2021
2.
3. import numpy as np
4. import pandas as pd
5. import seaborn as sns
6. import matplotlib.pyplot as plt
7. from scipy import stats
8.
9. from sklearn import preprocessing
10.
11. # these datasets can be found in the workshop meeting comments
15.
# process the gold and silver data => yearly averages
def _yearly_average_prices(prices, price_label):
    """Split the ``date`` column into calendar parts and average per year.

    The frame passed in is left untouched; all work happens on new frames.

    Parameters:
        prices: DataFrame with a ``date`` column and a ``price`` column.
        price_label: name given to the ``price`` column in the averages.

    Returns:
        A tuple ``(cleaned, averages)``. ``cleaned`` is ``prices`` without
        ``date``, with ``year``/``month``/``day`` columns appended and every
        row containing a NaN removed. ``averages`` is the per-year mean of
        ``cleaned`` without the ``month``/``day`` columns and with ``price``
        renamed to ``price_label``.
    """
    dates = prices['date'].astype('datetime64[ns, US/Eastern]')

    cleaned = prices.drop(columns='date')
    cleaned['year'] = dates.dt.year
    cleaned['month'] = dates.dt.month
    cleaned['day'] = dates.dt.day
    cleaned = cleaned.dropna()

    averages = cleaned.groupby('year').mean()
    # the averaged month/day numbers carry no meaning, so they are discarded
    averages = averages.drop(columns=['month', 'day'])
    averages = averages.rename(columns={'price': price_label})
    return cleaned, averages


gold, gold_averages = _yearly_average_prices(gold, 'gold_price')

plt.clf()
sns.lineplot(data=gold_averages)
plt.figure()

silver, silver_averages = _yearly_average_prices(silver, 'silver_price')

plt.clf()
sns.lineplot(data=silver_averages)
plt.figure()
53.
54.
# Put both yearly series side by side, matched on their shared "year",
# and measure how strongly the columns move together.
metal_prices = gold_averages.merge(silver_averages, on="year")
correlations = metal_prices.corr()

# all merged columns as lines in one chart
plt.clf()
sns.lineplot(data=metal_prices)
plt.figure()

# pairwise relationships between the merged columns
plt.clf()
sns.pairplot(metal_prices)
plt.figure()
66.
# Rescale every column to the 0..1 range so the trends can be compared:
# gold is many times more expensive than silver, so on a shared price
# axis the silver line would look almost flat.
x = metal_prices.values  # plain numpy array, labels are lost here
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Re-attach the column names and the index of metal_prices; without them
# the plot legend shows 0/1 and the x-axis shows row positions.
df = pd.DataFrame(
    x_scaled,
    columns=metal_prices.columns,
    index=metal_prices.index,
)

# this looks interesting
plt.clf()
sns.lineplot(data=df)
plt.figure()
79.
80.
81. # NEW FILE, how to open sqlite databases?
82.
83. import numpy as np
84. import pandas as pd
85. import seaborn as sns
86. import matplotlib.pyplot as plt
87. from scipy import stats
88. import sqlite3
89.
90. # this dataset was used in this test:
91. # https://www.kaggle.com/johntukey/clubhouse-dataset
92.
cnx = sqlite3.connect('clubhouse.db')

# We can filter the data with SQL syntax as we please.
# Here only rows whose instagram value differs from the text 'null' are
# kept; rows holding a real SQL NULL are excluded too, because comparing
# NULL with anything is never true.
# The literal is single-quoted: double quotes denote an identifier in SQL
# and only work as a string through a legacy SQLite fallback.
try:
    club = pd.read_sql_query(
        "SELECT * FROM user WHERE instagram != 'null'", cnx
    )
finally:
    # release the database file even if the query fails
    cnx.close()

# drop rows with NaN values
club = club.dropna()
102.
103. # NEW FILE , world happiness ranking data
104. # data is here: https://www.kaggle.com/anamvillalpando/world-happiness-ranking
105.
106. import numpy as np
107. import pandas as pd
108. import seaborn as sns
109. import matplotlib.pyplot as plt
110. from scipy import stats
111.
112.
114.
115.
# List every column name so the ones to discard can be picked by eye.
columns = data.columns

for c in columns:
    print(c)


# Remove all unwanted columns in one call. All labels are checked before
# anything is removed, so a misspelled name leaves the frame unchanged
# instead of half-trimmed.
data.drop(
    columns=[
        'Standard error of ladder score',
        'upperwhisker',
        'lowerwhisker',
        'Ladder score in Dystopia',
        'Explained by: Log GDP per capita',
        'Explained by: Social support',
        'Explained by: Healthy life expectancy',
        'Explained by: Freedom to make life choices',
        'Explained by: Generosity',
        'Explained by: Perceptions of corruption',
        'Dystopia + residual',
    ],
    inplace=True,
)
133.
134.
# Correlate only the numeric/boolean columns. The frame still holds text
# columns (e.g. 'Regional indicator'), and pandas >= 2.0 raises on those
# instead of silently skipping them as older versions did.
correlations = data.select_dtypes(include=['number', 'bool']).corr()

# correlation matrix as a colour grid
plt.clf()
sns.heatmap(correlations)
plt.figure()

# pairwise relationships between the columns
plt.clf()
sns.pairplot(data)
plt.figure()
144.
145.
# Which regions exist, and how many rows does each one have?
print(data['Regional indicator'].unique())

print(data.groupby('Regional indicator').count())

# Leave these four regions out of the comparison plots.
excluded_regions = [
    'East Asia',
    'Southeast Asia',
    'North America and ANZ',
    'Commonwealth of Independent States',
]
sub_data = data[~data['Regional indicator'].isin(excluded_regions)]

# row counts per region after the filter
print(sub_data.groupby('Regional indicator').count())


# pairwise plots, coloured by region
plt.clf()
sns.pairplot(sub_data, hue='Regional indicator')
plt.figure()

# ladder score against healthy life expectancy, one regression line per region
plt.clf()
sns.lmplot(data=sub_data, x='Ladder score', y='Healthy life expectancy', hue='Regional indicator')
plt.figure()
165. plt.figure()