Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # workshop 2, 15.4.2021
- import numpy as np
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from scipy import stats
- from sklearn import preprocessing
def _split_date(prices):
    """Return *prices* with its 'date' column expanded into year/month/day.

    The 'date' column is converted with the 'datetime64[ns, US/Eastern]'
    dtype, replaced by integer 'year', 'month' and 'day' columns, and every
    row that still contains a NaN afterwards is dropped.  The input frame is
    not modified.
    """
    dates = prices['date'].astype('datetime64[ns, US/Eastern]')
    expanded = prices.drop('date', axis=1).assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
    )
    return expanded.dropna()


def _yearly_averages(prices, price_label):
    """Return per-year means of *prices*, with 'price' renamed to *price_label*.

    *prices* must already carry the 'year', 'month' and 'day' columns added
    by ``_split_date``.  The averaged 'month' and 'day' columns are
    meaningless, so they are removed from the result.
    """
    averages = prices.groupby('year').mean()
    averages = averages.drop(['month', 'day'], axis=1)
    return averages.rename({'price': price_label}, axis=1)


# these datasets can be found in the workshop meeting comments
gold = pd.read_csv('gold_price.csv')
silver = pd.read_csv('silver_price.csv')
# NOTE(review): copper is loaded but not used anywhere in the visible script
copper = pd.read_csv('copper_price.csv')

# process the gold data => yearly averages
gold = _split_date(gold)
gold_averages = _yearly_averages(gold, 'gold_price')

# draw on a cleared current figure, then open a fresh figure for the next plot
plt.clf()
sns.lineplot(data=gold_averages)
plt.figure()

# process the silver data => yearly averages (same steps as for gold)
silver = _split_date(silver)
silver_averages = _yearly_averages(silver, 'silver_price')

plt.clf()
sns.lineplot(data=silver_averages)
plt.figure()
# Combine the gold and silver yearly averages into a single table,
# matching rows on "year" (only years present in both tables are kept).
metal_prices = gold_averages.merge(silver_averages, how="inner", on="year")

# pairwise correlation between the two price columns
correlations = metal_prices.corr()

# both price series in one line chart
plt.clf()
sns.lineplot(data=metal_prices)
plt.figure()

# scatter / distribution grid of the two price columns
plt.clf()
sns.pairplot(metal_prices)
plt.figure()
# Rescale every column to the 0..1 range so the trends can be compared:
# because gold is many times more expensive than silver,
# it's hard to compare the trends on the raw price scale.
x = metal_prices.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Re-attach the column names and the index of metal_prices; a bare
# DataFrame(x_scaled) would label the series 0 and 1 and plot them
# against the row position instead.
df = pd.DataFrame(
    x_scaled,
    columns=metal_prices.columns,
    index=metal_prices.index,
)

# this looks interesting
plt.clf()
sns.lineplot(data=df)
plt.figure()
- # NEW FILE, how to open sqlite databases?
- import numpy as np
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from scipy import stats
- import sqlite3
# this dataset was used in this test:
# https://www.kaggle.com/johntukey/clubhouse-dataset

# Create your connection.
cnx = sqlite3.connect('clubhouse.db')
try:
    # we can filter out data with SQL syntax as we please
    # in this case, we filter out those users who do not use instagram
    # ('null' is a single-quoted SQL string literal; in double quotes SQLite
    # would first try to resolve it as a column name)
    club = pd.read_sql_query(
        "SELECT * FROM user WHERE instagram != 'null'", cnx
    )
finally:
    # release the database file once the query result is in memory
    cnx.close()

# drop rows with NaN -values
club = club.dropna()
- # NEW FILE , world happiness ranking data
- # data is here: https://www.kaggle.com/anamvillalpando/world-happiness-ranking
- import numpy as np
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from scipy import stats
data = pd.read_csv('Happiness.csv')

# show every column name on its own line
columns = data.columns
for column_name in columns:
    print(column_name)

# columns that are removed before the analysis below
unused_columns = [
    'Standard error of ladder score',
    'upperwhisker',
    'lowerwhisker',
    'Ladder score in Dystopia',
    'Explained by: Log GDP per capita',
    'Explained by: Social support',
    'Explained by: Healthy life expectancy',
    'Explained by: Freedom to make life choices',
    'Explained by: Generosity',
    'Explained by: Perceptions of corruption',
    'Dystopia + residual',
]
data.drop(columns=unused_columns, inplace=True)
# Correlate the numeric (and boolean) columns only.  The frame still holds
# text columns such as 'Regional indicator', and DataFrame.corr() raises on
# those in pandas >= 2.0 instead of silently skipping them.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlations = numeric_data.corr()

# correlation matrix as a heatmap
plt.clf()
sns.heatmap(correlations)
plt.figure()

# pairwise scatter plots of the columns
plt.clf()
sns.pairplot(data)
plt.figure()
# which regions exist, and how many rows does each one have?
print(data['Regional indicator'].unique())
print(data.groupby('Regional indicator').count())

# Leave these four regions out of the comparison.
# NOTE(review): presumably because they have few rows -- check the counts
# printed above.
excluded_regions = [
    'East Asia',
    'Southeast Asia',
    'North America and ANZ',
    'Commonwealth of Independent States',
]
sub_data = data[~data['Regional indicator'].isin(excluded_regions)]
print(sub_data.groupby('Regional indicator').count())

# pairwise plots, coloured by region
plt.clf()
sns.pairplot(sub_data, hue='Regional indicator')
plt.figure()

# one regression line per region: ladder score vs. healthy life expectancy
plt.clf()
sns.lmplot(
    data=sub_data,
    x='Ladder score',
    y='Healthy life expectancy',
    hue='Regional indicator',
)
plt.figure()
Add Comment
Please, Sign In to add comment