Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import seaborn as sns
- from scipy import stats
- import matplotlib.pyplot as plt
- # column creation function for pandas
- # determine the console generation of the the console
- # https://en.wikipedia.org/wiki/Home_video_game_console_generations
- # for PC, year of release is used to deteremine the "generation"
- def create_console_generation(row):
- result = -1
- if row['Platform'] == '2600':
- result = 2
- elif row['Platform'] in ['NES', 'GB']:
- result = 3
- elif row['Platform'] in ['GEN', 'SNES']:
- result = 4
- elif row['Platform'] in ['PS', 'SAT', 'N64']:
- result = 5
- elif row['Platform'] in ['PS2', "DC", 'GC', "XB", 'PSP', 'GBA']:
- result = 6
- elif row['Platform'] in ['X360', "PS3", "Wii", 'PSV', 'DS']:
- result = 7
- elif row['Platform'] in ['WiiU', 'PS4', 'XOne', '3DS']:
- result = 8
- if row['Platform'] == 'PC':
- if row['Year_of_Release'] < 1987:
- result = 3
- elif 1987 <= row['Year_of_Release'] < 1993:
- result = 4
- elif 1993 <= row['Year_of_Release'] < 1998:
- result = 5
- elif 1998 <= row['Year_of_Release'] < 2005:
- result = 6
- elif 2005 <= row['Year_of_Release'] < 2012:
- result = 7
- else:
- result = 8
- return result
- # column creation function for pandas
- # determine the manufacture of the console based on the console
- def create_console_manufacturer(row):
- result = "Unknown"
- if row['Platform'] in ['NES', 'SNES', 'N64', 'DS', '3DS', 'GBA', 'GB', 'Wii', 'WiiU', 'GC']:
- result = "Nintendo"
- elif row['Platform'] in ['GEN', 'SAT', 'DC']:
- result = "Sega"
- elif row['Platform'] in ['2600']:
- result = "Atari"
- elif row['Platform'] in ['PC']:
- result = "PC"
- elif row['Platform'] in ['XB', "X360", "XOne"]:
- result = "Microsoft"
- elif row['Platform'] in ['PS', 'PS2', 'PS3', 'PS4', 'PSP', 'PSV']:
- result = "Sony"
- return result
- # column creation function for pandas
- # another way to measure how popular of the game was ON THE CONSOLE
- # value between 0 and 1.0, which is a percentage.
- # in other words 0.75 = 75%
- # the higher the number, the more popular this game was on its console
- # for example, Wii Sports => more than 80% of the console owners owned this game!
- def create_owner_percentage(row):
- result = 0.0
- if row['Platform'] == 'NES':
- result = row['Global_Sales'] / 61.91
- elif row['Platform'] == 'SNES':
- result = row['Global_Sales'] / 49.1
- elif row['Platform'] == 'N64':
- result = row['Global_Sales'] / 32.93
- elif row['Platform'] == 'GEN':
- result = row['Global_Sales'] / 30.75
- elif row['Platform'] == 'PS2':
- result = row['Global_Sales'] / 155.0
- elif row['Platform'] == 'DS':
- result = row['Global_Sales'] / 154.02
- elif row['Platform'] == 'GB':
- result = row['Global_Sales'] / 118.69
- elif row['Platform'] == 'PS4':
- result = row['Global_Sales'] / 116.9
- elif row['Platform'] == 'PS':
- result = row['Global_Sales'] / 102.49
- elif row['Platform'] == 'Wii':
- result = row['Global_Sales'] / 101.63
- elif row['Platform'] == 'PS3':
- result = row['Global_Sales'] / 87.4
- elif row['Platform'] == 'X360':
- result = row['Global_Sales'] / 84.0
- elif row['Platform'] == 'GBA':
- result = row['Global_Sales'] / 81.51
- elif row['Platform'] == 'PSP':
- result = row['Global_Sales'] / 81.0
- elif row['Platform'] == '3DS':
- result = row['Global_Sales'] / 75.94
- elif row['Platform'] == 'XOne':
- result = row['Global_Sales'] / 51.0
- elif row['Platform'] == 'Atari':
- result = row['Global_Sales'] / 30.0
- elif row['Platform'] == 'XB':
- result = row['Global_Sales'] / 24.0
- elif row['Platform'] == 'GC':
- result = row['Global_Sales'] / 21.74
- elif row['Platform'] == 'WiiU':
- result = row['Global_Sales'] / 13.56
- elif row['Platform'] == 'PSV':
- result = row['Global_Sales'] / 12.5
- elif row['Platform'] == 'SAT':
- result = row['Global_Sales'] / 9.26
- elif row['Platform'] == 'DC':
- result = row['Global_Sales'] / 9.13
- return result
- # read the dat afile
- df = pd.read_csv("videogamesales.csv")
- # initial correlations
- correlations = df.corr()
- # popular game genres BY REGION
- NA_popular_games = df.groupby('Genre')[['NA_Sales']].sum().sort_values(by=['NA_Sales'], ascending=False)
- EU_popular_games = df.groupby('Genre')[['EU_Sales']].sum().sort_values(by=['EU_Sales'], ascending=False)
- JP_popular_games = df.groupby('Genre')[['JP_Sales']].sum().sort_values(by=['JP_Sales'], ascending=False)
- Other_popular_games = df.groupby('Genre')[['Other_Sales']].sum().sort_values(by=['Other_Sales'], ascending=False)
- # what kind of games are in miscellaneous?
- misc_games = df[df['Genre'] == 'Misc']
- # multiplatform games, which are the most common?
- multiplatform_games = df['Name'].value_counts().reset_index()
- # seems that Need for Speed: Most Wanted has the most multiplatform versions
- most_wanted_games = df[df['Name'] == 'Need for Speed: Most Wanted']
- # total sales for all platforms, Need for Speed: Most Wanted
- total_most_wanted = most_wanted_games['Global_Sales'].sum()
- pc_games = df[df['Platform'] == 'PC']
- # who is the most active publisher
- most_published = df['Publisher'].value_counts()
- # we are missing most Sega Genesis / Mega Drive games in data :(
- sega_genesis_games = df[df['Platform'] == 'GEN']
- # reduce amount of platforms (based on total sales)
- # based on:
- # https://en.wikipedia.org/wiki/List_of_best-selling_game_consoles
- removed_platforms = ['NG', 'SCD', 'WS', '3DO', 'TG16', 'GG', 'PCFX']
- # remove low selling consoles from the list
- # there are fancier ways to do this
- # check Teams-conversation above
- for platform in removed_platforms:
- df = df[df['Platform'] != platform]
- all_platforms = df['Platform'].value_counts()
- # new column: Console generation
- # check the custom function in the beginning of the file
- # PC is also mapped into console generations
- # https://en.wikipedia.org/wiki/Home_video_game_console_generations
- df['Console_Generation'] = df.apply(create_console_generation, axis=1)
- # new column: Console manufacturer (Nintendo, Sony, Microsoft etc)
- df['Console_Manufacturer'] = df.apply(create_console_manufacturer, axis=1)
- # modify ratings:
- # RP, AO, M => M,
- # EC, E, KA = E,
- # E10+, T => T
- df.loc[df['Rating'] == 'RP', "Rating"] = 'M'
- df.loc[df['Rating'] == 'AO', "Rating"] = 'M'
- df.loc[df['Rating'] == 'EC', "Rating"] = 'E'
- df.loc[df['Rating'] == 'K-A', "Rating"] = 'E'
- df.loc[df['Rating'] == 'E10+', "Rating"] = 'T'
- # check that all ratings are now either E, T or M
- all_ratings = df['Rating'].value_counts()
- # critic/user score, replace what you can from other sources (https://api-docs.igdb.com/#about)
- # during the lectures, we'll just remove values with missing review scores, and save a copy into a second dataframe
- # let's make a separate copy
- df_scores = df.copy()
- df_scores = df_scores.dropna()
- # create new columns, average review score and number of reviewers
- df_scores['Average_Score'] = ((df_scores['Critic_Score'] / 10) + df_scores['User_Score'].astype('float')) / 2
- df_scores['Total_Reviews'] = df_scores['Critic_Count'] + df_scores['User_Count']
- # remove unneeded columns
- # EU sales and global sales seem to affect so little that they are removed too
- df_scores = df_scores.drop('Critic_Score', axis = 1)
- df_scores = df_scores.drop('User_Score', axis = 1)
- df_scores = df_scores.drop('Critic_Count', axis = 1)
- df_scores = df_scores.drop('User_Count', axis = 1)
- df_scores = df_scores.drop('Developer', axis = 1)
- df_scores = df_scores.drop('EU_Sales', axis = 1)
- df_scores = df_scores.drop('Other_Sales', axis = 1)
- # new column: how many % of the console owners have this game?
- # combine the critic and user score? 50% / 50%? or should weigh based on the counts?
- df['Owner_Percentage'] = df.apply(create_owner_percentage, axis=1)
- # because of nan-values, we have to clean this up
- # str.contains will break if there's even a single nan -value
- # let's make another copy of the dataframe to just try out the data without
- # critic scores
- df_sales = df.copy()
- df_sales = df_sales.drop('Critic_Score', axis = 1)
- df_sales = df_sales.drop('Critic_Count', axis = 1)
- df_sales = df_sales.drop('User_Score', axis = 1)
- df_sales = df_sales.drop('User_Count', axis = 1)
- df_sales = df_sales.drop('Developer', axis = 1)
- df_sales = df_sales.drop('EU_Sales', axis = 1)
- df_sales = df_sales.drop('Other_Sales', axis = 1)
- # df_sales = df_sales.drop('Rating', axis = 1)
- # remove the other missing value rows
- df_sales = df_sales.dropna()
- # How about the Halo -series?
- halo_games = df_sales[df_sales['Name'].str.contains('Halo')]
- total_halo = halo_games['Global_Sales'].sum()
- # Minecraft!
- # this totals to 27 million copies
- # it should be close to 100 million copies
- # which editions are missing? (needs Googling)
- # most likely the data is incomplete due to missing sales data (VGChartz)
- minecraft_games = df_sales[df_sales['Name'].str.contains("Minecraft")]
- total_minecraft = minecraft_games['Global_Sales'].sum()
- # it seems, some very known (even legendary) games are missing, like
- # Contra (also known as Probotector) for the NES
- # VGChartz database implies sales data for this title is not available yet
- contra_games = df_sales[df_sales['Name'].str.contains("Contra")]
- # MANY OF THE PAIRPLOTS ARE COMMENTED OUT BECAUSE OF THE DATASET BEING LARGE
- # if they are all on, it will be slow to run this code
- # you can uncomment pairplots to test out them
- # it might a better idea to use more accurate plot types
- # jointplots, box plots, scatterplots etc.
- # THE PLOTS
- # this shows many things about the video game market
- # Microsoft consoles did never take off in the Japanese market
- # Japanese market, Sony and Nintendo are strong
- # Globally, Sony is most successful
- # Microsoft has its majority of sales in North America
- plt.clf()
- #sns.pairplot(df_sales, hue='Console_Manufacturer')
- plt.show()
- #plt.clf()
- #sns.pairplot(df)
- #plt.show()
- # some pair plots, can we see anything interesting
- plt.clf()
- #sns.pairplot(df_sales, hue='Platform')
- plt.show()
- # some pair plots, can we see anything interesting
- plt.clf()
- #sns.pairplot(df_scores, hue='Rating')
- plt.show()
- # hmm, M-rating seems to have many well selling games with good reviews
- # ... oh, Call of Duty, Grand Theft Auto, Halo, .... oh well.
- # no wonder there are many clones on these games? just a thought
- df_scores_M = df_scores[df_scores['Rating'] == 'M']
- # correlations.... well, some kind of correlation with the amount of review
- # and amount of sales. so, all publicity is good publicity?
- score_correlations = df_scores.corr()
- plt.clf()
- #sns.pairplot(df_sales, hue='Genre')
- plt.show()
- # based on the rating, it seems T-rated games were
- # popular in 2000s, then E -games were popular between 2000-2010,
- # and after that M-rating got higher
- # seems NA prefers M-rating games
- # and Japan prefers E + T
- plt.clf()
- sns.pairplot(df_sales, hue='Rating')
- plt.show()
- # SOME THOUGHTS FOR FURTHER DEVELOPMENT
- # a good (but difficult) extra step to improve this dataset would be adding
- # the missing games and their sales data from another database
- # see previous comments, Minecraft and Contra/Probotector
- # can we find the average selling price for each game? is there a database for these?
- # Sega Genesis (GEN) had a game library of 713 games (approximately)
- # but the dataset only has 29 games
- # can we find the development budgets of each game too?
- # in other words, can find out how much money the game made
- # after all costs?
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement