Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # workshop 4, 27.4.2021
- import numpy as np
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from scipy import stats
# Lifetime hardware units sold per platform, input by hand from Wikipedia.
# Keys are the codes found in the 'Platform' column of the sales data.
_PLATFORM_UNITS_SOLD = {
    'Wii': 101630000,
    'NES': 61910000,
    # NOTE(review): the sales CSV presumably labels Game Boy as 'GB'; the bare
    # 'G' spelling used previously is kept as an alias so nothing regresses.
    'GB': 118690000,
    'G': 118690000,
    'DS': 154020000,
    'X360': 84000000,
    'PS3': 87400000,
    'PS2': 155000000,
    # Was a copy of the NES figure (61910000); Wikipedia lists 49.10 million.
    'SNES': 49100000,
    'GBA': 81510000,
    '3DS': 75940000,
    'PS4': 114900000,
    'N64': 32930000,
    'PS': 102490000,
    # NOTE(review): same as 'GB' above - the original Xbox is presumably
    # labelled 'XB' in the CSV; 'X' is kept as an alias.
    'XB': 24000000,
    'X': 24000000,
    # No unit count was entered for PC, so it is treated as missing.
    'PC': np.nan,
    '2600': 30000000,
    'PSP': 81000000,
    'XOne': 48690000,
    'WiiU': 13560000,
    'GC': 21740000,
    'GEN': 30750000,
    'DC': 9130000,
    'PSV': 12500000,
    'SAT': 9260000,
    'SCD': 2240000,
    'WS': 3500000,
    'NG': 1000000,
    'TG16': 5800000,
    '3DO': 2000000,
    'GG': 10620000,
    'PCFX': 400000,
}


def platform_sales(row):
    """Return the lifetime units sold for the platform named in ``row``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable with ``'Platform'`` (e.g. a DataFrame row passed
        by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or float
        The unit count for a known platform, or ``numpy.nan`` for PC and for
        any platform code that is not in the table (previously an unknown
        code fell through the ``if`` chain and returned ``None``).
    """
    return _PLATFORM_UNITS_SOLD.get(row['Platform'], np.nan)
# for multiplatform games, calculate the combined sales across all platforms
def franchise_sales(row):
    """Return the summed ``Global_Sales`` of every entry sharing ``row``'s name.

    Reads the module-level ``games`` DataFrame, selects all rows whose
    ``'Name'`` equals ``row['Name']`` and adds up their ``'Global_Sales'``.
    """
    same_title = games['Name'] == row['Name']
    return games.loc[same_title, 'Global_Sales'].sum()
# Load the sales data; the CSV is read relative to the working directory.
games = pd.read_csv('videogamesales.csv')

fallout4 = games[games['Name'] == 'Fallout 4']
# Need for Speed: Most Wanted has been released 12 times on different platforms
needforspeed = games[games['Name'] == 'Need for Speed: Most Wanted']

# TODO
# one interesting thing: what are the rankings if we combine
# the sales of each game in all platforms

# what kind of games sell well in each region
# if you use [[ ]] instead of [] around NA_Sales
# you will get a DataFrame instead of a Series
# reset_index will take Genre out of the index into a column
NA_popular_genre = games.groupby('Genre')[['NA_Sales']].sum().reset_index()
JP_popular_genre = games.groupby('Genre')[['JP_Sales']].sum().reset_index()

# NA Sales, most popular genres
# (no plt.clf() first: with no figure open it would create an extra, empty
# figure that plt.show() then displays alongside the real one)
plt.figure(figsize=(15, 10))
sns.barplot(x='Genre', y='NA_Sales', data=NA_popular_genre)
plt.xticks(rotation=45)
plt.show()

# JP Sales, most popular genres
plt.figure(figsize=(15, 10))
sns.barplot(x='Genre', y='JP_Sales', data=JP_popular_genre)
plt.xticks(rotation=45)
plt.show()

# reuse the Fallout 4 rows selected above instead of filtering again
fallout4_sales = fallout4['Global_Sales'].sum()

# which publisher has released the most games
# this would be interesting if only franchises/one game is counted
# even if some game has been released on multiple platforms
most_games_publishers = games['Publisher'].value_counts()

# when was the last game released by THQ? it should be 2011-2013
# last game, Company of Heroes 2 in 2013 (PC)
thq = games[games['Publisher'] == 'THQ']

unique_platforms = games['Platform'].unique()

# hardware units sold for the platform each game was released on
games['Platforms_Sold'] = games.apply(platform_sales, axis=1)
# share of platform owners that bought the game, in percent
# NOTE(review): assumes Global_Sales is expressed in millions of units - confirm
games['Percentage_Owners'] = ((games['Global_Sales'] * 1000000) / games['Platforms_Sold']) * 100

# restrict to numeric columns explicitly: pandas 2.x raises when corr() meets
# text columns such as Name or Platform, older versions silently skipped them
correlations = games.select_dtypes(include='number').corr()

# total Global_Sales over all entries sharing a Name, written to every such
# row. One grouped pass gives the same values as applying franchise_sales row
# by row, which rescans the whole frame for each row. Rows with a missing
# Name match nothing and therefore get 0, as before.
games['Franchise_Global_Sales'] = (
    games.groupby('Name')['Global_Sales'].transform('sum').fillna(0)
)
# TODO
# does platform, good critic and user scores mean good sales?
games_copy = games.copy()

# THE CORRELATIONS IN PAIRPLOT CAN BE IMPROVED WITH FOLLOWING
# create some sort of weighted critic+user overall score that emphasizes
# the critic score more (based on critic and user count, for example)
# the harder way is to get more scoring data for the games
# NOTE(review): dividing Critic_Score by 10 assumes it is on a 0-100 scale and
# User_Score is numeric on a 0-10 scale - confirm against the CSV
games['Overall_Score'] = ((games['Critic_Score'] / 10) + games['User_Score']) / 2
games['Overall_Score_Count'] = games['Critic_Count'] + games['User_Count']
games.drop(['Critic_Score', 'User_Score', 'Critic_Count', 'User_Count'],
           axis=1, inplace=True)

some_platforms = ['Wii', 'PS2', 'NES', 'PS3', 'DS', 'PS', 'X360']

# keep games that have a score, at least 50 combined reviews and belong to
# one of the selected platforms
rated_mask = (
    games['Overall_Score'].notnull()
    & (games['Overall_Score_Count'] >= 50)
    & games['Platform'].isin(some_platforms)
)

# drop() without inplace returns a new frame, so rated_games is an independent
# copy; repeatedly dropping in place on a filtered slice of games can trigger
# pandas' SettingWithCopyWarning
rated_games = games.loc[rated_mask].drop(
    ['JP_Sales', 'NA_Sales', 'EU_Sales', 'Other_Sales', 'Rating',
     'Global_Sales', 'Platforms_Sold', 'Overall_Score_Count'],
    axis=1,
)

# pairplot creates its own figure; show it instead of opening an empty one
sns.pairplot(rated_games, hue='Platform')
plt.show()
# NEW FILE - TESTING OUT DECISION TREES
import graphviz
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.datasets import load_iris

# seaborn's copy of the dataset is a DataFrame, convenient for the pairplot;
# it gets its own name so `iris` below always refers to the sklearn Bunch
iris_df = sns.load_dataset("iris")
sns.pairplot(iris_df, hue='species')
plt.show()

# load the sklearn version once and take features, labels and names from it
iris = load_iris()
X, y = iris.data, iris.target

# fixed seed: the classifier permutes features at each split, so without it
# the fitted tree can differ between runs
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, y)

# quick matplotlib rendering of the fitted tree
plt.figure()
tree.plot_tree(clf)
plt.show()

# nicer rendering through graphviz, written to disk under the name 'iris'
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=iris['feature_names'],
                                class_names=iris['target_names'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('iris')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement