tuomasvaltanen

Untitled

Apr 15th, 2021 (edited)
357
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # workshop 2, 15.4.2021
  2.  
  3. import numpy as np
  4. import pandas as pd
  5. import seaborn as sns
  6. import matplotlib.pyplot as plt
  7. from scipy import stats
  8.  
  9. from sklearn import preprocessing
  10.  
  11. # these datasets can be found in the workshop meeting comments
  12. gold = pd.read_csv('gold_price.csv')
  13. silver = pd.read_csv('silver_price.csv')
  14. copper = pd.read_csv('copper_price.csv')
  15.  
  16. # process the gold data => yearly averages
  17. gold['date'] = gold['date'].astype('datetime64[ns, US/Eastern]')
  18. gold['year'] = gold['date'].dt.year
  19. gold['month'] = gold['date'].dt.month
  20. gold['day'] = gold['date'].dt.day
  21.  
  22. gold.drop('date', axis=1, inplace=True)
  23. gold = gold.dropna()
  24.  
  25. gold_averages = gold.groupby('year').mean()
  26.  
  27. gold_averages.drop('month', axis=1, inplace=True)
  28. gold_averages.drop('day', axis=1, inplace=True)
  29. gold_averages = gold_averages.rename({'price': 'gold_price'}, axis=1)
  30.  
  31. plt.clf()
  32. sns.lineplot(data=gold_averages)
  33. plt.figure()
  34.  
  35. # process the silver data => yearly averages
  36. silver['date'] = silver['date'].astype('datetime64[ns, US/Eastern]')
  37. silver['year'] = silver['date'].dt.year
  38. silver['month'] = silver['date'].dt.month
  39. silver['day'] = silver['date'].dt.day
  40.  
  41. silver.drop('date', axis=1, inplace=True)
  42. silver = silver.dropna()
  43.  
  44. silver_averages = silver.groupby('year').mean()
  45.  
  46. silver_averages.drop('month', axis=1, inplace=True)
  47. silver_averages.drop('day', axis=1, inplace=True)
  48. silver_averages = silver_averages.rename({'price': 'silver_price'}, axis=1)
  49.  
  50. plt.clf()
  51. sns.lineplot(data=silver_averages)
  52. plt.figure()
  53.  
  54.  
  55. # let's combine the prices in one dataset
  56. metal_prices = pd.merge(gold_averages, silver_averages, on="year")
  57. correlations = metal_prices.corr()
  58.  
  59. plt.clf()
  60. sns.lineplot(data=metal_prices)
  61. plt.figure()
  62.  
  63. plt.clf()
  64. sns.pairplot(metal_prices)
  65. plt.figure()
  66.  
  67. # this makes it easier to compare the trends with gold vs silver
  68. # because gold is many times more expensive than gold/silver
  69. # it's hard to compare the trends otherwise.
  70. x = metal_prices.values #returns a numpy array
  71. min_max_scaler = preprocessing.MinMaxScaler()
  72. x_scaled = min_max_scaler.fit_transform(x)
  73. df = pd.DataFrame(x_scaled)
  74.  
  75. # this looks interesting
  76. plt.clf()
  77. sns.lineplot(data=df)
  78. plt.figure()
  79.  
  80.  
  81. # NEW FILE, how to open sqlite databases?
  82.  
  83. import numpy as np
  84. import pandas as pd
  85. import seaborn as sns
  86. import matplotlib.pyplot as plt
  87. from scipy import stats
  88. import sqlite3
  89.  
  90. # this dataset was use in this test:
  91. # https://www.kaggle.com/johntukey/clubhouse-dataset
  92.  
  93. # Create your connection.
  94. cnx = sqlite3.connect('clubhouse.db')
  95.  
  96. # we can filter out data with SQL syntax as we please
  97. # in this case, we filter out those users who do not use instagram
  98. club = pd.read_sql_query('SELECT * FROM user WHERE instagram != "null"', cnx)
  99.  
  100. # drop rows with NaN -values
  101. club = club.dropna()
  102.  
  103. # NEW FILE , world happiness ranking data
  104. # data is here: https://www.kaggle.com/anamvillalpando/world-happiness-ranking
  105.  
  106. import numpy as np
  107. import pandas as pd
  108. import seaborn as sns
  109. import matplotlib.pyplot as plt
  110. from scipy import stats
  111.  
  112.  
  113. data = pd.read_csv('Happiness.csv')
  114.  
  115.  
  116. columns = data.columns
  117.  
  118. for c in columns:
  119.     print(c)
  120.  
  121.  
  122. data.drop('Standard error of ladder score', axis=1, inplace=True)
  123. data.drop('upperwhisker', axis=1, inplace=True)
  124. data.drop('lowerwhisker', axis=1, inplace=True)
  125. data.drop('Ladder score in Dystopia', axis=1, inplace=True)
  126. data.drop('Explained by: Log GDP per capita', axis=1, inplace=True)
  127. data.drop('Explained by: Social support', axis=1, inplace=True)
  128. data.drop('Explained by: Healthy life expectancy', axis=1, inplace=True)
  129. data.drop('Explained by: Freedom to make life choices', axis=1, inplace=True)
  130. data.drop('Explained by: Generosity', axis=1, inplace=True)
  131. data.drop('Explained by: Perceptions of corruption', axis=1, inplace=True)
  132. data.drop('Dystopia + residual', axis=1, inplace=True)
  133.  
  134.  
  135. correlations = data.corr()
  136.  
  137. plt.clf()
  138. sns.heatmap(correlations)
  139. plt.figure()
  140.  
  141. plt.clf()
  142. sns.pairplot(data)
  143. plt.figure()
  144.  
  145.  
  146. print(data['Regional indicator'].unique())
  147.  
  148. print(data.groupby('Regional indicator').count())
  149.  
  150. sub_data = data[data['Regional indicator'] != 'East Asia']
  151. sub_data = sub_data[sub_data['Regional indicator'] != 'Southeast Asia']
  152. sub_data = sub_data[sub_data['Regional indicator'] != 'North America and ANZ']
  153. sub_data = sub_data[sub_data['Regional indicator'] != 'Commonwealth of Independent States']
  154.  
  155. print(sub_data.groupby('Regional indicator').count())
  156.  
  157.  
  158. plt.clf()
  159. sns.pairplot(sub_data, hue='Regional indicator')
  160. plt.figure()
  161.  
  162. plt.clf()
  163. # genders have similar regression lines
  164. sns.lmplot(data=sub_data, x='Ladder score', y='Healthy life expectancy', hue='Regional indicator')
  165. plt.figure()
RAW Paste Data