# Untitled

# workshop 4, 27.4.2021

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# total sales of each platform, input by hand from Wikipedia
# Lifetime hardware units sold per platform code, keyed by the value found in
# the 'Platform' column.
_PLATFORM_UNITS_SOLD = {
    'Wii': 101630000,
    'NES': 61910000,
    'GB': 118690000,
    'DS': 154020000,
    'X360': 84000000,
    'PS3': 87400000,
    'PS2': 155000000,
    # The original table repeated the NES figure here (copy/paste slip);
    # the SNES sold roughly 49.10 million units.
    'SNES': 49100000,
    'GBA': 81510000,
    '3DS': 75940000,
    'PS4': 114900000,
    'N64': 32930000,
    'PS': 102490000,
    'XB': 24000000,
    # There is no meaningful "units sold" figure for PCs.
    'PC': np.nan,
    '2600': 30000000,
    'PSP': 81000000,
    'XOne': 48690000,
    'WiiU': 13560000,
    'GC': 21740000,
    'GEN': 30750000,
    'DC': 9130000,
    'PSV': 12500000,
    'SAT': 9260000,
    'SCD': 2240000,
    'WS': 3500000,
    'NG': 1000000,
    'TG16': 5800000,
    '3DO': 2000000,
    'GG': 10620000,
    'PCFX': 400000,
    # Legacy spellings accepted by the first version of this table; kept so
    # that any caller relying on them still gets the same numbers.
    'G': 118690000,
    'X': 24000000,
}


def platform_sales(row):
    """Return the number of consoles sold for the platform of ``row``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable with ``'Platform'`` (e.g. one row handed over by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or float
        Units sold for the platform, or ``np.nan`` when no figure is
        available: that is the case for ``'PC'`` and for any platform code
        that is not in the table.
    """
    return _PLATFORM_UNITS_SOLD.get(row['Platform'], np.nan)

# for multiplatform games, calculate the whole sale amounts
def franchise_sales(row):
    """Return the summed ``Global_Sales`` of every entry sharing ``row``'s name.

    Looks the title up in the module-level ``games`` DataFrame, so ``games``
    must already be loaded when this function is called.
    """
    same_title = games['Name'] == row['Name']
    return games.loc[same_title, 'Global_Sales'].sum()


games = pd.read_csv('videogamesales.csv')

fallout4 = games[games['Name'] == 'Fallout 4']

# Need for Speed: Most Wanted has been released 12 times on different platforms
needforspeed = games[games['Name'] == 'Need for Speed: Most Wanted']

# TODO
# one interesting thing: what are the rankings if we combine
# the sales of each game across all platforms


# what kind of games sell well in each region

# Selecting with [[ ]] instead of [ ] around the column name yields a
# DataFrame instead of a Series; reset_index() then moves Genre out of the
# index into an ordinary column so seaborn can reference it by name.
NA_popular_genre = games.groupby('Genre')[['NA_Sales']].sum().reset_index()
JP_popular_genre = games.groupby('Genre')[['JP_Sales']].sum().reset_index()


# NA Sales, most popular genres
# (no plt.clf() first: it would create/clear a *different* figure than the one
# opened on the next line and leave a blank window behind)
plt.figure(figsize=(15, 10))
sns.barplot(x='Genre', y='NA_Sales', data=NA_popular_genre)
plt.xticks(rotation=45)
plt.show()


# JP Sales, most popular genres
plt.figure(figsize=(15, 10))
sns.barplot(x='Genre', y='JP_Sales', data=JP_popular_genre)
plt.xticks(rotation=45)
plt.show()

# combined sales of Fallout 4 over every platform it was released on
fallout4_sales = games[games['Name'] == 'Fallout 4']['Global_Sales'].sum()

# which publisher has released the most games

# this would be more interesting if each franchise/game were counted only
# once, even when it has been released on multiple platforms

most_games_publishers = games['Publisher'].value_counts()

# when was the last game released by THQ? it should be 2011-2013
# last game, Company of Heroes 2 in 2013 (PC)
thq = games[games['Publisher'] == 'THQ']

unique_platforms = games['Platform'].unique()

# consoles sold for the platform of each row (NaN where no figure is known)
games['Platforms_Sold'] = games.apply(platform_sales, axis=1)
# share of console owners that bought the game, in percent
# NOTE(review): the * 1000000 assumes Global_Sales is given in millions of
# units -- confirm against the data source
games['Percentage_Owners'] = ((games['Global_Sales'] * 1000000) / games['Platforms_Sold']) * 100

# Restrict to numeric columns explicitly: DataFrame.corr() raises on text
# columns (Name, Platform, ...) in pandas >= 2.0.
correlations = games.select_dtypes(include='number').corr()

# Total Global_Sales per title across all platforms, computed once per title
# instead of re-filtering the whole frame for every row. Rows without a Name
# match nothing and get 0.0, exactly as the row-wise franchise_sales() did.
sales_per_title = games.groupby('Name')['Global_Sales'].sum()
games['Franchise_Global_Sales'] = games['Name'].map(sales_per_title).fillna(0.0)

# TODO
# do platform, good critic and user scores mean good sales?

# snapshot of the full table before the score columns are merged and dropped
games_copy = games.copy()

# THE CORRELATIONS IN PAIRPLOT CAN BE IMPROVED WITH THE FOLLOWING
# create some sort of weighted critic+user overall score that emphasizes
# the critic score more (based on critic and user count, for example)
# the harder way is to get more scoring data for the games
#
# Critic_Score is divided by 10 before averaging.
# NOTE(review): presumably critics score 0-100 and users 0-10 -- confirm.
# User_Score is coerced to numbers first so that any non-numeric placeholder
# text in the CSV becomes NaN instead of breaking the addition; this is a
# no-op when the column is already numeric.
games['Overall_Score'] = (
    (games['Critic_Score'] / 10)
    + pd.to_numeric(games['User_Score'], errors='coerce')
) / 2
games['Overall_Score_Count'] = games['Critic_Count'] + games['User_Count']

games.drop(['Critic_Score', 'User_Score', 'Critic_Count', 'User_Count'],
           axis=1, inplace=True)

# Keep only games that have a score. drop() without inplace returns a new
# frame, which avoids mutating a slice of `games` (SettingWithCopyWarning).
rated_games = games[games['Overall_Score'].notnull()].drop(
    ['JP_Sales', 'NA_Sales', 'EU_Sales', 'Other_Sales', 'Rating',
     'Global_Sales', 'Platforms_Sold'],
    axis=1,
)

# require at least 50 reviews (critics + users) for the score to count
rated_games = rated_games[rated_games['Overall_Score_Count'] >= 50]
rated_games = rated_games.drop(['Overall_Score_Count'], axis=1)

some_platforms = ['Wii', 'PS2', 'NES', 'PS3', 'DS', 'PS', 'X360']

rated_games = rated_games[rated_games['Platform'].isin(some_platforms)]

# pairplot creates its own figure, so no clf()/figure() calls are needed;
# show() displays it like the bar charts earlier in the script
sns.pairplot(rated_games, hue='Platform')
plt.show()

# NEW FILE - TESTING OUT DECISION TREES

# Imports are repeated here so this section keeps working when it is split
# out into its own file.
import graphviz
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.datasets import load_iris

# seaborn's copy of the data set is a DataFrame with a 'species' column,
# which is convenient for a quick visual overview
iris_df = sns.load_dataset("iris")

# pairplot creates its own figure; show() displays it
sns.pairplot(iris_df, hue='species')
plt.show()

# scikit-learn's copy: loaded once and reused for both the training data and
# the feature/class names needed by the graphviz export below
iris = load_iris()
X, y = iris.data, iris.target

# fit() returns the fitted estimator itself
clf = tree.DecisionTreeClassifier().fit(X, y)

# draw the fitted tree with matplotlib on a figure of its own
plt.figure()
tree.plot_tree(clf)
plt.show()

# export the same tree as DOT source and render it with graphviz
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=iris['feature_names'],
                                class_names=iris['target_names'],
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)
# writes the rendered tree to disk using the base file name 'iris'
graph.render('iris')