Advertisement
tuomasvaltanen

Untitled

May 5th, 2022
861
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.91 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import seaborn as sns
  4. from scipy import stats
  5. import matplotlib.pyplot as plt
  6.  
  7. # column creation function for pandas
  8. # determine the console generation of the the console
  9. # https://en.wikipedia.org/wiki/Home_video_game_console_generations
  10. # for PC, year of release is used to deteremine the "generation"
  11. def create_console_generation(row):
  12.     result = -1
  13.    
  14.     if row['Platform'] == '2600':
  15.         result = 2
  16.     elif row['Platform'] in ['NES', 'GB']:
  17.         result = 3
  18.     elif row['Platform'] in ['GEN', 'SNES']:
  19.         result = 4
  20.     elif row['Platform'] in ['PS', 'SAT', 'N64']:
  21.         result = 5
  22.     elif row['Platform'] in ['PS2', "DC", 'GC', "XB", 'PSP', 'GBA']:
  23.         result = 6
  24.     elif row['Platform'] in ['X360', "PS3", "Wii", 'PSV', 'DS']:
  25.         result = 7
  26.     elif row['Platform'] in ['WiiU', 'PS4', 'XOne', '3DS']:
  27.         result = 8
  28.    
  29.     if row['Platform'] == 'PC':
  30.         if row['Year_of_Release'] < 1987:
  31.             result = 3
  32.         elif 1987 <= row['Year_of_Release'] < 1993:
  33.             result = 4
  34.         elif 1993 <= row['Year_of_Release'] < 1998:
  35.             result = 5
  36.         elif 1998 <= row['Year_of_Release'] < 2005:
  37.             result = 6
  38.         elif 2005 <= row['Year_of_Release'] < 2012:
  39.             result = 7
  40.         else:
  41.             result = 8
  42.            
  43.     return result
  44.  
  45.  
  46. # column creation function for pandas
  47. # determine the manufacture of the console based on the console
  48. def create_console_manufacturer(row):
  49.     result = "Unknown"
  50.    
  51.     if row['Platform'] in ['NES', 'SNES', 'N64', 'DS', '3DS', 'GBA', 'GB', 'Wii', 'WiiU', 'GC']:
  52.         result = "Nintendo"
  53.     elif row['Platform'] in ['GEN', 'SAT', 'DC']:
  54.         result = "Sega"
  55.     elif row['Platform'] in ['2600']:
  56.         result = "Atari"
  57.     elif row['Platform'] in ['PC']:
  58.         result = "PC"
  59.     elif row['Platform'] in ['XB', "X360", "XOne"]:
  60.         result = "Microsoft"
  61.     elif row['Platform'] in ['PS', 'PS2', 'PS3', 'PS4', 'PSP', 'PSV']:
  62.         result = "Sony"
  63.    
  64.     return result
  65.  
  66.  
  67. # column creation function for pandas
  68. # another way to measure how popular of the game was ON THE CONSOLE
  69. # value between 0 and 1.0, which is a percentage.
  70. # in other words 0.75 = 75%
  71. # the higher the number, the more popular this game was on its console
  72. # for example, Wii Sports => more than 80% of the console owners owned this game!
  73. def create_owner_percentage(row):
  74.     result = 0.0
  75.    
  76.     if row['Platform'] == 'NES':
  77.         result = row['Global_Sales'] / 61.91
  78.        
  79.     elif row['Platform'] == 'SNES':
  80.         result = row['Global_Sales'] / 49.1
  81.        
  82.     elif row['Platform'] == 'N64':
  83.         result = row['Global_Sales'] / 32.93
  84.    
  85.     elif row['Platform'] == 'GEN':
  86.         result = row['Global_Sales'] / 30.75
  87.    
  88.     elif row['Platform'] == 'PS2':
  89.         result = row['Global_Sales'] / 155.0
  90.    
  91.     elif row['Platform'] == 'DS':
  92.         result = row['Global_Sales'] / 154.02
  93.    
  94.     elif row['Platform'] == 'GB':
  95.         result = row['Global_Sales'] / 118.69
  96.    
  97.     elif row['Platform'] == 'PS4':
  98.         result = row['Global_Sales'] / 116.9
  99.        
  100.     elif row['Platform'] == 'PS':
  101.         result = row['Global_Sales'] / 102.49
  102.    
  103.     elif row['Platform'] == 'Wii':
  104.         result = row['Global_Sales'] / 101.63
  105.    
  106.     elif row['Platform'] == 'PS3':
  107.         result = row['Global_Sales'] / 87.4
  108.    
  109.     elif row['Platform'] == 'X360':
  110.         result = row['Global_Sales'] / 84.0
  111.    
  112.     elif row['Platform'] == 'GBA':
  113.         result = row['Global_Sales'] / 81.51
  114.    
  115.     elif row['Platform'] == 'PSP':
  116.         result = row['Global_Sales'] / 81.0
  117.    
  118.     elif row['Platform'] == '3DS':
  119.         result = row['Global_Sales'] / 75.94
  120.    
  121.     elif row['Platform'] == 'XOne':
  122.         result = row['Global_Sales'] / 51.0
  123.        
  124.     elif row['Platform'] == 'Atari':
  125.         result = row['Global_Sales'] / 30.0
  126.    
  127.     elif row['Platform'] == 'XB':
  128.         result = row['Global_Sales'] / 24.0
  129.    
  130.     elif row['Platform'] == 'GC':
  131.         result = row['Global_Sales'] / 21.74
  132.        
  133.     elif row['Platform'] == 'WiiU':
  134.         result = row['Global_Sales'] / 13.56
  135.        
  136.     elif row['Platform'] == 'PSV':
  137.         result = row['Global_Sales'] / 12.5
  138.    
  139.     elif row['Platform'] == 'SAT':
  140.         result = row['Global_Sales'] / 9.26
  141.        
  142.     elif row['Platform'] == 'DC':
  143.         result = row['Global_Sales'] / 9.13
  144.    
  145.     return result
  146.  
  147.  
  148. # read the dat afile
  149. df = pd.read_csv("videogamesales.csv")
  150.  
  151. # initial correlations
  152. correlations = df.corr()
  153.  
  154. # popular game genres BY REGION
  155. NA_popular_games = df.groupby('Genre')[['NA_Sales']].sum().sort_values(by=['NA_Sales'], ascending=False)
  156. EU_popular_games = df.groupby('Genre')[['EU_Sales']].sum().sort_values(by=['EU_Sales'], ascending=False)
  157. JP_popular_games = df.groupby('Genre')[['JP_Sales']].sum().sort_values(by=['JP_Sales'], ascending=False)
  158. Other_popular_games = df.groupby('Genre')[['Other_Sales']].sum().sort_values(by=['Other_Sales'], ascending=False)
  159.  
  160. # what kind of games are in miscellaneous?
  161. misc_games = df[df['Genre'] == 'Misc']
  162.  
  163. # multiplatform games, which are the most common?
  164. multiplatform_games = df['Name'].value_counts().reset_index()
  165.  
  166. # seems that Need for Speed: Most Wanted has the most multiplatform versions
  167. most_wanted_games = df[df['Name'] == 'Need for Speed: Most Wanted']
  168.  
  169. # total sales for all platforms, Need for Speed: Most Wanted
  170. total_most_wanted = most_wanted_games['Global_Sales'].sum()
  171.  
  172. pc_games = df[df['Platform'] == 'PC']
  173.  
  174.  
  175. # who is the most active publisher
  176. most_published = df['Publisher'].value_counts()
  177.  
  178. # we are missing most Sega Genesis / Mega Drive games in data :(
  179. sega_genesis_games = df[df['Platform'] == 'GEN']
  180.  
  181.  
  182. # reduce amount of platforms (based on total sales)
  183. # based on:
  184. # https://en.wikipedia.org/wiki/List_of_best-selling_game_consoles
  185. removed_platforms = ['NG', 'SCD', 'WS', '3DO', 'TG16', 'GG', 'PCFX']
  186.  
  187. # remove low selling consoles from the list
  188. # there are fancier ways to do this
  189. # check Teams-conversation above
  190. for platform in removed_platforms:
  191.     df = df[df['Platform'] != platform]
  192.  
  193. all_platforms = df['Platform'].value_counts()
  194.  
  195. # new column: Console generation
  196. # check the custom function in the beginning of the file
  197. # PC is also mapped into console generations
  198. # https://en.wikipedia.org/wiki/Home_video_game_console_generations
  199. df['Console_Generation'] = df.apply(create_console_generation, axis=1)
  200.  
  201. # new column: Console manufacturer (Nintendo, Sony, Microsoft etc)
  202. df['Console_Manufacturer'] = df.apply(create_console_manufacturer, axis=1)
  203.  
  204. # modify ratings:
  205. # RP, AO, M => M,
  206. # EC, E, KA = E,
  207. # E10+, T => T
  208.  
  209. df.loc[df['Rating'] == 'RP', "Rating"] = 'M'
  210. df.loc[df['Rating'] == 'AO', "Rating"] = 'M'
  211. df.loc[df['Rating'] == 'EC', "Rating"] = 'E'
  212. df.loc[df['Rating'] == 'K-A', "Rating"] = 'E'
  213. df.loc[df['Rating'] == 'E10+', "Rating"] = 'T'
  214.  
  215. # check that all ratings are now either E, T or M
  216. all_ratings = df['Rating'].value_counts()
  217.  
  218. # critic/user score, replace what you can from other sources (https://api-docs.igdb.com/#about)
  219. # during the lectures, we'll just remove values with missing review scores, and save a copy into a second dataframe
  220.  
  221. # let's make a separate copy
  222. df_scores = df.copy()
  223. df_scores = df_scores.dropna()
  224.  
  225. # create new columns, average review score and number of reviewers
  226. df_scores['Average_Score'] = ((df_scores['Critic_Score'] / 10) + df_scores['User_Score'].astype('float')) / 2
  227. df_scores['Total_Reviews'] = df_scores['Critic_Count'] + df_scores['User_Count']
  228.  
  229. # remove unneeded columns
  230. # EU sales and global sales seem to affect so little that they are removed too
  231. df_scores = df_scores.drop('Critic_Score', axis = 1)
  232. df_scores = df_scores.drop('User_Score', axis = 1)
  233. df_scores = df_scores.drop('Critic_Count', axis = 1)
  234. df_scores = df_scores.drop('User_Count', axis = 1)
  235. df_scores = df_scores.drop('Developer', axis = 1)
  236. df_scores = df_scores.drop('EU_Sales', axis = 1)
  237. df_scores = df_scores.drop('Other_Sales', axis = 1)
  238.  
  239. # new column: how many % of the console owners have this game?
  240. # combine the critic and user score? 50% / 50%? or should weigh based on the counts?
  241. df['Owner_Percentage'] = df.apply(create_owner_percentage, axis=1)
  242.  
  243.  
  244. # because of nan-values, we have to clean this up
  245. # str.contains will break if there's even a single nan -value
  246. # let's make another copy of the dataframe to just try out the data without
  247. # critic scores
  248. df_sales = df.copy()
  249. df_sales = df_sales.drop('Critic_Score', axis = 1)
  250. df_sales = df_sales.drop('Critic_Count', axis = 1)
  251. df_sales = df_sales.drop('User_Score', axis = 1)
  252. df_sales = df_sales.drop('User_Count', axis = 1)
  253. df_sales = df_sales.drop('Developer', axis = 1)
  254. df_sales = df_sales.drop('EU_Sales', axis = 1)
  255. df_sales = df_sales.drop('Other_Sales', axis = 1)
  256. # df_sales = df_sales.drop('Rating', axis = 1)
  257.  
  258. # remove the other missing value rows
  259. df_sales = df_sales.dropna()
  260.  
  261. # How about the Halo -series?
  262. halo_games = df_sales[df_sales['Name'].str.contains('Halo')]
  263. total_halo = halo_games['Global_Sales'].sum()
  264.  
  265. # Minecraft!
  266. # this totals to 27 million copies
  267. # it should be close to 100 million copies
  268. # which editions are missing? (needs Googling)
  269. # most likely the data is incomplete due to missing sales data (VGChartz)
  270. minecraft_games = df_sales[df_sales['Name'].str.contains("Minecraft")]
  271. total_minecraft = minecraft_games['Global_Sales'].sum()
  272.  
  273. # it seems, some very known (even legendary) games are missing, like
  274. # Contra (also known as Probotector) for the NES
  275. # VGChartz database implies sales data for this title is not available yet
  276. contra_games = df_sales[df_sales['Name'].str.contains("Contra")]
  277.  
  278. # MANY OF THE PAIRPLOTS ARE COMMENTED OUT BECAUSE OF THE DATASET BEING LARGE
  279. # if they are all on, it will be slow to run this code
  280. # you can uncomment pairplots to test out them
  281.  
  282. # it might a better idea to use more accurate plot types
  283. # jointplots, box plots, scatterplots etc.
  284.  
  285. # THE PLOTS
  286.  
  287. # this shows many things about the video game market
  288. # Microsoft consoles did never take off in the Japanese market
  289. # Japanese market, Sony and Nintendo are strong
  290. # Globally, Sony is most successful
  291. # Microsoft has its majority of sales in North America
  292. plt.clf()
  293. #sns.pairplot(df_sales, hue='Console_Manufacturer')
  294. plt.show()
  295.  
  296. #plt.clf()
  297. #sns.pairplot(df)
  298. #plt.show()
  299.  
  300. # some pair plots, can we see anything interesting
  301. plt.clf()
  302. #sns.pairplot(df_sales, hue='Platform')
  303. plt.show()
  304.  
  305. # some pair plots, can we see anything interesting
  306. plt.clf()
  307. #sns.pairplot(df_scores, hue='Rating')
  308. plt.show()
  309.  
  310. # hmm, M-rating seems to have many well selling games with good reviews
  311. # ... oh, Call of Duty, Grand Theft Auto, Halo, .... oh well.
  312. # no wonder there are many clones on these games? just a thought
  313. df_scores_M = df_scores[df_scores['Rating'] == 'M']
  314.  
  315. # correlations.... well, some kind of correlation with the amount of review
  316. # and amount of sales. so, all publicity is good publicity?
  317. score_correlations = df_scores.corr()
  318.  
  319. plt.clf()
  320. #sns.pairplot(df_sales, hue='Genre')
  321. plt.show()
  322.  
  323. # based on the rating, it seems T-rated games were
  324. # popular in 2000s, then E -games were popular between 2000-2010,
  325. # and after that M-rating got higher
  326.  
  327. # seems NA prefers M-rating games
  328. # and Japan prefers E + T
  329. plt.clf()
  330. sns.pairplot(df_sales, hue='Rating')
  331. plt.show()
  332.  
  333. # SOME THOUGHTS FOR FURTHER DEVELOPMENT
  334.  
  335. # a good (but difficult) extra step to improve this dataset would be adding
  336. # the missing games and their sales data from another database
  337. # see previous comments, Minecraft and Contra/Probotector
  338.  
  339. # can we find the average selling price for each game? is there a database for these?
  340.  
  341. # Sega Genesis (GEN) had a game library of 713 games (approximately)
  342. # but the dataset only has 29 games
  343.  
  344. # can we find the development budgets of each game too?
  345. # in other words, can find out how much money the game made
  346. # after all costs?
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement