# Untitled

May 5th, 2022
845
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import numpy as np
2. import pandas as pd
3. import seaborn as sns
4. from scipy import stats
5. import matplotlib.pyplot as plt
6.
7. # column creation function for pandas
8. # determine the console generation of the the console
9. # https://en.wikipedia.org/wiki/Home_video_game_console_generations
10. # for PC, year of release is used to deteremine the "generation"
11. def create_console_generation(row):
12.     result = -1
13.
14.     if row['Platform'] == '2600':
15.         result = 2
16.     elif row['Platform'] in ['NES', 'GB']:
17.         result = 3
18.     elif row['Platform'] in ['GEN', 'SNES']:
19.         result = 4
20.     elif row['Platform'] in ['PS', 'SAT', 'N64']:
21.         result = 5
22.     elif row['Platform'] in ['PS2', "DC", 'GC', "XB", 'PSP', 'GBA']:
23.         result = 6
24.     elif row['Platform'] in ['X360', "PS3", "Wii", 'PSV', 'DS']:
25.         result = 7
26.     elif row['Platform'] in ['WiiU', 'PS4', 'XOne', '3DS']:
27.         result = 8
28.
29.     if row['Platform'] == 'PC':
30.         if row['Year_of_Release'] < 1987:
31.             result = 3
32.         elif 1987 <= row['Year_of_Release'] < 1993:
33.             result = 4
34.         elif 1993 <= row['Year_of_Release'] < 1998:
35.             result = 5
36.         elif 1998 <= row['Year_of_Release'] < 2005:
37.             result = 6
38.         elif 2005 <= row['Year_of_Release'] < 2012:
39.             result = 7
40.         else:
41.             result = 8
42.
43.     return result
44.
45.
46. # column creation function for pandas
47. # determine the manufacture of the console based on the console
48. def create_console_manufacturer(row):
49.     result = "Unknown"
50.
51.     if row['Platform'] in ['NES', 'SNES', 'N64', 'DS', '3DS', 'GBA', 'GB', 'Wii', 'WiiU', 'GC']:
52.         result = "Nintendo"
53.     elif row['Platform'] in ['GEN', 'SAT', 'DC']:
54.         result = "Sega"
55.     elif row['Platform'] in ['2600']:
56.         result = "Atari"
57.     elif row['Platform'] in ['PC']:
58.         result = "PC"
59.     elif row['Platform'] in ['XB', "X360", "XOne"]:
60.         result = "Microsoft"
61.     elif row['Platform'] in ['PS', 'PS2', 'PS3', 'PS4', 'PSP', 'PSV']:
62.         result = "Sony"
63.
64.     return result
65.
66.
67. # column creation function for pandas
68. # another way to measure how popular of the game was ON THE CONSOLE
69. # value between 0 and 1.0, which is a percentage.
70. # in other words 0.75 = 75%
71. # the higher the number, the more popular this game was on its console
72. # for example, Wii Sports => more than 80% of the console owners owned this game!
73. def create_owner_percentage(row):
74.     result = 0.0
75.
76.     if row['Platform'] == 'NES':
77.         result = row['Global_Sales'] / 61.91
78.
79.     elif row['Platform'] == 'SNES':
80.         result = row['Global_Sales'] / 49.1
81.
82.     elif row['Platform'] == 'N64':
83.         result = row['Global_Sales'] / 32.93
84.
85.     elif row['Platform'] == 'GEN':
86.         result = row['Global_Sales'] / 30.75
87.
88.     elif row['Platform'] == 'PS2':
89.         result = row['Global_Sales'] / 155.0
90.
91.     elif row['Platform'] == 'DS':
92.         result = row['Global_Sales'] / 154.02
93.
94.     elif row['Platform'] == 'GB':
95.         result = row['Global_Sales'] / 118.69
96.
97.     elif row['Platform'] == 'PS4':
98.         result = row['Global_Sales'] / 116.9
99.
100.     elif row['Platform'] == 'PS':
101.         result = row['Global_Sales'] / 102.49
102.
103.     elif row['Platform'] == 'Wii':
104.         result = row['Global_Sales'] / 101.63
105.
106.     elif row['Platform'] == 'PS3':
107.         result = row['Global_Sales'] / 87.4
108.
109.     elif row['Platform'] == 'X360':
110.         result = row['Global_Sales'] / 84.0
111.
112.     elif row['Platform'] == 'GBA':
113.         result = row['Global_Sales'] / 81.51
114.
115.     elif row['Platform'] == 'PSP':
116.         result = row['Global_Sales'] / 81.0
117.
118.     elif row['Platform'] == '3DS':
119.         result = row['Global_Sales'] / 75.94
120.
121.     elif row['Platform'] == 'XOne':
122.         result = row['Global_Sales'] / 51.0
123.
124.     elif row['Platform'] == 'Atari':
125.         result = row['Global_Sales'] / 30.0
126.
127.     elif row['Platform'] == 'XB':
128.         result = row['Global_Sales'] / 24.0
129.
130.     elif row['Platform'] == 'GC':
131.         result = row['Global_Sales'] / 21.74
132.
133.     elif row['Platform'] == 'WiiU':
134.         result = row['Global_Sales'] / 13.56
135.
136.     elif row['Platform'] == 'PSV':
137.         result = row['Global_Sales'] / 12.5
138.
139.     elif row['Platform'] == 'SAT':
140.         result = row['Global_Sales'] / 9.26
141.
142.     elif row['Platform'] == 'DC':
143.         result = row['Global_Sales'] / 9.13
144.
145.     return result
146.
147.
148. # read the dat afile
150.
151. # initial correlations
152. correlations = df.corr()
153.
154. # popular game genres BY REGION
155. NA_popular_games = df.groupby('Genre')[['NA_Sales']].sum().sort_values(by=['NA_Sales'], ascending=False)
156. EU_popular_games = df.groupby('Genre')[['EU_Sales']].sum().sort_values(by=['EU_Sales'], ascending=False)
157. JP_popular_games = df.groupby('Genre')[['JP_Sales']].sum().sort_values(by=['JP_Sales'], ascending=False)
158. Other_popular_games = df.groupby('Genre')[['Other_Sales']].sum().sort_values(by=['Other_Sales'], ascending=False)
159.
160. # what kind of games are in miscellaneous?
161. misc_games = df[df['Genre'] == 'Misc']
162.
163. # multiplatform games, which are the most common?
164. multiplatform_games = df['Name'].value_counts().reset_index()
165.
166. # seems that Need for Speed: Most Wanted has the most multiplatform versions
167. most_wanted_games = df[df['Name'] == 'Need for Speed: Most Wanted']
168.
169. # total sales for all platforms, Need for Speed: Most Wanted
170. total_most_wanted = most_wanted_games['Global_Sales'].sum()
171.
172. pc_games = df[df['Platform'] == 'PC']
173.
174.
175. # who is the most active publisher
176. most_published = df['Publisher'].value_counts()
177.
178. # we are missing most Sega Genesis / Mega Drive games in data :(
179. sega_genesis_games = df[df['Platform'] == 'GEN']
180.
181.
182. # reduce amount of platforms (based on total sales)
183. # based on:
184. # https://en.wikipedia.org/wiki/List_of_best-selling_game_consoles
185. removed_platforms = ['NG', 'SCD', 'WS', '3DO', 'TG16', 'GG', 'PCFX']
186.
187. # remove low selling consoles from the list
188. # there are fancier ways to do this
189. # check Teams-conversation above
190. for platform in removed_platforms:
191.     df = df[df['Platform'] != platform]
192.
193. all_platforms = df['Platform'].value_counts()
194.
195. # new column: Console generation
196. # check the custom function in the beginning of the file
197. # PC is also mapped into console generations
198. # https://en.wikipedia.org/wiki/Home_video_game_console_generations
199. df['Console_Generation'] = df.apply(create_console_generation, axis=1)
200.
201. # new column: Console manufacturer (Nintendo, Sony, Microsoft etc)
202. df['Console_Manufacturer'] = df.apply(create_console_manufacturer, axis=1)
203.
204. # modify ratings:
205. # RP, AO, M => M,
206. # EC, E, KA = E,
207. # E10+, T => T
208.
209. df.loc[df['Rating'] == 'RP', "Rating"] = 'M'
210. df.loc[df['Rating'] == 'AO', "Rating"] = 'M'
211. df.loc[df['Rating'] == 'EC', "Rating"] = 'E'
212. df.loc[df['Rating'] == 'K-A', "Rating"] = 'E'
213. df.loc[df['Rating'] == 'E10+', "Rating"] = 'T'
214.
215. # check that all ratings are now either E, T or M
216. all_ratings = df['Rating'].value_counts()
217.
218. # critic/user score, replace what you can from other sources (https://api-docs.igdb.com/#about)
219. # during the lectures, we'll just remove values with missing review scores, and save a copy into a second dataframe
220.
221. # let's make a separate copy
222. df_scores = df.copy()
223. df_scores = df_scores.dropna()
224.
225. # create new columns, average review score and number of reviewers
226. df_scores['Average_Score'] = ((df_scores['Critic_Score'] / 10) + df_scores['User_Score'].astype('float')) / 2
227. df_scores['Total_Reviews'] = df_scores['Critic_Count'] + df_scores['User_Count']
228.
229. # remove unneeded columns
230. # EU sales and global sales seem to affect so little that they are removed too
231. df_scores = df_scores.drop('Critic_Score', axis = 1)
232. df_scores = df_scores.drop('User_Score', axis = 1)
233. df_scores = df_scores.drop('Critic_Count', axis = 1)
234. df_scores = df_scores.drop('User_Count', axis = 1)
235. df_scores = df_scores.drop('Developer', axis = 1)
236. df_scores = df_scores.drop('EU_Sales', axis = 1)
237. df_scores = df_scores.drop('Other_Sales', axis = 1)
238.
239. # new column: how many % of the console owners have this game?
240. # combine the critic and user score? 50% / 50%? or should weigh based on the counts?
241. df['Owner_Percentage'] = df.apply(create_owner_percentage, axis=1)
242.
243.
244. # because of nan-values, we have to clean this up
245. # str.contains will break if there's even a single nan -value
246. # let's make another copy of the dataframe to just try out the data without
247. # critic scores
248. df_sales = df.copy()
249. df_sales = df_sales.drop('Critic_Score', axis = 1)
250. df_sales = df_sales.drop('Critic_Count', axis = 1)
251. df_sales = df_sales.drop('User_Score', axis = 1)
252. df_sales = df_sales.drop('User_Count', axis = 1)
253. df_sales = df_sales.drop('Developer', axis = 1)
254. df_sales = df_sales.drop('EU_Sales', axis = 1)
255. df_sales = df_sales.drop('Other_Sales', axis = 1)
256. # df_sales = df_sales.drop('Rating', axis = 1)
257.
258. # remove the other missing value rows
259. df_sales = df_sales.dropna()
260.
261. # How about the Halo -series?
262. halo_games = df_sales[df_sales['Name'].str.contains('Halo')]
263. total_halo = halo_games['Global_Sales'].sum()
264.
265. # Minecraft!
266. # this totals to 27 million copies
267. # it should be close to 100 million copies
268. # which editions are missing? (needs Googling)
269. # most likely the data is incomplete due to missing sales data (VGChartz)
270. minecraft_games = df_sales[df_sales['Name'].str.contains("Minecraft")]
271. total_minecraft = minecraft_games['Global_Sales'].sum()
272.
273. # it seems, some very known (even legendary) games are missing, like
274. # Contra (also known as Probotector) for the NES
275. # VGChartz database implies sales data for this title is not available yet
276. contra_games = df_sales[df_sales['Name'].str.contains("Contra")]
277.
278. # MANY OF THE PAIRPLOTS ARE COMMENTED OUT BECAUSE OF THE DATASET BEING LARGE
279. # if they are all on, it will be slow to run this code
280. # you can uncomment pairplots to test out them
281.
282. # it might a better idea to use more accurate plot types
283. # jointplots, box plots, scatterplots etc.
284.
285. # THE PLOTS
286.
287. # this shows many things about the video game market
288. # Microsoft consoles did never take off in the Japanese market
289. # Japanese market, Sony and Nintendo are strong
290. # Globally, Sony is most successful
291. # Microsoft has its majority of sales in North America
292. plt.clf()
293. #sns.pairplot(df_sales, hue='Console_Manufacturer')
294. plt.show()
295.
296. #plt.clf()
297. #sns.pairplot(df)
298. #plt.show()
299.
300. # some pair plots, can we see anything interesting
301. plt.clf()
302. #sns.pairplot(df_sales, hue='Platform')
303. plt.show()
304.
305. # some pair plots, can we see anything interesting
306. plt.clf()
307. #sns.pairplot(df_scores, hue='Rating')
308. plt.show()
309.
310. # hmm, M-rating seems to have many well selling games with good reviews
311. # ... oh, Call of Duty, Grand Theft Auto, Halo, .... oh well.
312. # no wonder there are many clones on these games? just a thought
313. df_scores_M = df_scores[df_scores['Rating'] == 'M']
314.
315. # correlations.... well, some kind of correlation with the amount of review
316. # and amount of sales. so, all publicity is good publicity?
317. score_correlations = df_scores.corr()
318.
319. plt.clf()
320. #sns.pairplot(df_sales, hue='Genre')
321. plt.show()
322.
323. # based on the rating, it seems T-rated games were
324. # popular in 2000s, then E -games were popular between 2000-2010,
325. # and after that M-rating got higher
326.
327. # seems NA prefers M-rating games
328. # and Japan prefers E + T
329. plt.clf()
330. sns.pairplot(df_sales, hue='Rating')
331. plt.show()
332.
333. # SOME THOUGHTS FOR FURTHER DEVELOPMENT
334.
335. # a good (but difficult) extra step to improve this dataset would be adding
336. # the missing games and their sales data from another database
337. # see previous comments, Minecraft and Contra/Probotector
338.
339. # can we find the average selling price for each game? is there a database for these?
340.
341. # Sega Genesis (GEN) had a game library of 713 games (approximately)
342. # but the dataset only has 29 games
343.
344. # can we find the development budgets of each game too?
345. # in other words, can find out how much money the game made
346. # after all costs?