Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# workshop 3, 21.4.2021
#
# Exploratory look at the Starbucks promotion data: load the CSV, expand the
# list-valued 'channels' column into one indicator column per channel, then
# compute a correlation matrix and draw a pairplot coloured by offer type.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import ast
from sklearn.preprocessing import MultiLabelBinarizer

# load the data
starbucks = pd.read_csv('starbucks_promotion.csv')

# the channels column is read in as text; parse each cell into an actual
# Python list so it can be split into separate columns below
# (ast.literal_eval only evaluates literals, it does not execute code)
starbucks['channels'] = starbucks['channels'].apply(ast.literal_eval)

# just a copy for inspection in variable explorer
starbucks_copy = starbucks.copy()

# scikit-learn => MultiLabelBinarizer turns the channel lists into one
# 0/1 column per distinct channel
mlb = MultiLabelBinarizer()
expandedLabelData = mlb.fit_transform(starbucks["channels"].tolist())
labelClasses = mlb.classes_

# Create a pandas.DataFrame from our output.
# Reuse the original index so the concat below lines rows up one-to-one
# even if the index is ever something other than the default 0..n-1.
expandedLabels = pd.DataFrame(
    expandedLabelData,
    columns=labelClasses,
    index=starbucks.index,
)

# combine new columns with original data
starbucks = pd.concat([starbucks, expandedLabels], axis=1)

# remove unneeded columns
starbucks = starbucks.drop(columns=['channels', 'id', 'Unnamed: 0'])

# column names for inspecting
columns = starbucks.columns

# basic correlation matrix
# not enough data for this though...
# Restrict to numeric/bool columns explicitly: the frame still contains text
# columns (e.g. 'offer_type'), and recent pandas versions raise on those
# instead of silently skipping them.
correlations = starbucks.select_dtypes(include=['number', 'bool']).corr()

# basic pairplot
# pairplot is a figure-level function that creates its own figure, so no
# plt.clf()/plt.figure() is needed around it; just display the result.
sns.pairplot(starbucks, hue='offer_type')
plt.show()
# NEW FILE
# video game sales test data, csv file in MS Teams channel
#
# Exploratory look at video game sales: per-title lookups, a correlation
# matrix, total sales per genre for the NA and JP regions, and publisher
# release counts.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


def _plot_genre_sales(genre_totals, sales_column):
    """Show a bar chart of `sales_column` per genre.

    genre_totals: DataFrame with a 'Genre' column and the given sales column.
    sales_column: name of the column plotted on the y axis.
    """
    # a fresh figure per chart; no plt.clf() beforehand, since that would
    # only create (and clear) an extra blank figure
    plt.figure(figsize=(15, 10))
    sns.barplot(x='Genre', y=sales_column, data=genre_totals)
    plt.xticks(rotation=45)
    plt.show()


games = pd.read_csv('videogamesales.csv')

fallout4 = games[games['Name'] == 'Fallout 4']

# need for speed most wanted has been released 12 times on different platforms
needforspeed = games[games['Name'] == 'Need for Speed: Most Wanted']

# TODO
# one interesting thing: what are the rankings if we combine
# the sales of each game in all platforms

# Restrict to numeric/bool columns explicitly: the frame contains text
# columns (Name, Genre, Publisher, ...), and recent pandas versions raise on
# those instead of silently skipping them.
correlations = games.select_dtypes(include=['number', 'bool']).corr()

# what kind of games sell well in each region
# if you use [[ ]] instead of [] around NA_Sales
# you will get a DataFrame instead of a Series
# reset_index will take Genre out of the index into a column
NA_popular_genre = games.groupby('Genre')[['NA_Sales']].sum().reset_index()
JP_popular_genre = games.groupby('Genre')[['JP_Sales']].sum().reset_index()

# NA Sales, most popular genres
_plot_genre_sales(NA_popular_genre, 'NA_Sales')

# JP Sales, most popular genres
_plot_genre_sales(JP_popular_genre, 'JP_Sales')

# which publisher has released the most games
# this would be interesting if only franchises/one game is counted
# even if some game has been released on multiple platforms
most_games_publishers = games['Publisher'].value_counts()

# when was the last game released by THQ? it should be 2011-2013
# last game, Company of Heroes 2 in 2013 (PC)
thq = games[games['Publisher'] == 'THQ']

# TODO
# does platform, good critic and user rating mean good sales?
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement