Advertisement
tuomasvaltanen

Untitled

Apr 27th, 2021 (edited)
966
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.47 KB | None | 0 0
  1. # workshop 4, 27.4.2021
  2.  
  3. import numpy as np
  4. import pandas as pd
  5. import seaborn as sns
  6. import matplotlib.pyplot as plt
  7. from scipy import stats
  8.  
  9. # total sales of each platform, input by hand from Wikipedia
  10. def platform_sales(row):
  11.     if row['Platform'] == 'Wii':
  12.         return 101630000
  13.     if row['Platform'] == 'NES':
  14.         return 61910000
  15.     if row['Platform'] == 'G':
  16.         return 118690000
  17.     if row['Platform'] == 'DS':
  18.         return 154020000
  19.     if row['Platform'] == 'X360':
  20.         return 84000000
  21.     if row['Platform'] == 'PS3':
  22.         return 87400000
  23.     if row['Platform'] == 'PS2':
  24.         return 155000000
  25.     if row['Platform'] == 'SNES':
  26.         return 61910000
  27.     if row['Platform'] == 'GBA':
  28.         return 81510000
  29.     if row['Platform'] == '3DS':
  30.         return 75940000
  31.     if row['Platform'] == 'PS4':
  32.         return 114900000
  33.     if row['Platform'] == 'N64':
  34.         return 32930000
  35.     if row['Platform'] == 'PS':
  36.         return 102490000
  37.     if row['Platform'] == 'X':
  38.         return 24000000
  39.     if row['Platform'] == 'PC':
  40.         return np.nan
  41.     if row['Platform'] == '2600':
  42.         return 30000000
  43.     if row['Platform'] == 'PSP':
  44.         return 81000000
  45.     if row['Platform'] == 'XOne':
  46.         return 48690000
  47.     if row['Platform'] == 'WiiU':
  48.         return 13560000
  49.     if row['Platform'] == 'GC':
  50.         return 21740000
  51.     if row['Platform'] == 'GEN':
  52.         return 30750000
  53.     if row['Platform'] == 'DC':
  54.         return 9130000
  55.     if row['Platform'] == 'PSV':
  56.         return 12500000
  57.     if row['Platform'] == 'SAT':
  58.         return 9260000
  59.     if row['Platform'] == 'SCD':
  60.         return 2240000
  61.     if row['Platform'] == 'WS':
  62.         return 3500000
  63.     if row['Platform'] == 'NG':
  64.         return 1000000
  65.     if row['Platform'] == 'TG16':
  66.         return 5800000
  67.     if row['Platform'] == '3DO':
  68.         return 2000000
  69.     if row['Platform'] == 'GG':
  70.         return 10620000
  71.     if row['Platform'] == 'PCFX':
  72.         return 400000
  73.  
  74. # for multiplatform games, calculate the whole sale amounts
  75. def franchise_sales(row):
  76.     #result = games[games['Name'] == row['Name']]['Global_Sales'].sum()
  77.     #result = games[games['Name'] == row['Name']].groupby('Name')[['Global_Sales']].sum()
  78.     result = games[games['Name'] == row['Name']]['Global_Sales'].sum()
  79.  
  80.     return result
  81.  
  82.  
  83. games = pd.read_csv('videogamesales.csv')
  84.  
  85. fallout4 = games[games['Name'] == 'Fallout 4']
  86.  
  87. # need for speed most wanted has been release 12 times on different platforms
  88. needforspeed = games[games['Name'] == 'Need for Speed: Most Wanted']
  89.  
  90. # TODO
  91. # one interesting thing: what are the rankings if we combine
  92. # the sales of each game in all platforms
  93.  
  94.  
  95.  
  96. # what kind of games sell well in each region
  97.  
  98. # if use you use [[ ]] instead of [] around NA_Sales
  99. # you will get a DataFrame instead of Series
  100. # reset index will take Genre out of index to a column
  101. NA_popular_genre = games.groupby('Genre')[['NA_Sales']].sum().reset_index()
  102. JP_popular_genre = games.groupby('Genre')[['JP_Sales']].sum().reset_index()
  103.  
  104.  
  105. # NA Sales, most popular genres
  106. plt.clf()
  107. plt.figure(figsize=(15,10))
  108. sns.barplot(x='Genre', y='NA_Sales', data=NA_popular_genre)
  109. plt.xticks(rotation=45)
  110. plt.show()
  111.  
  112.  
  113. # JP Sales, most popular genres
  114. plt.clf()
  115. plt.figure(figsize=(15,10))
  116. sns.barplot(x='Genre', y='JP_Sales', data=JP_popular_genre)
  117. plt.xticks(rotation=45)
  118. plt.show()
  119.  
  120. fallout4_sales = games[games['Name'] == 'Fallout 4']['Global_Sales'].sum()
  121.  
  122. # which publisher has released the most games
  123.  
  124. # this would be interesting if only franchises/one game is counted
  125. # even if some game has been released on multiple platforms
  126.  
  127. most_games_publishers = games['Publisher'].value_counts()
  128.  
  129. # when was the last game released by THQ? it should be 2011-2013
  130. # last game, Company of Heroes 2 in 2013 (PC)
  131. thq = games[games['Publisher'] == 'THQ']
  132.  
  133. unique_platforms = games['Platform'].unique()
  134.  
  135. games['Platforms_Sold'] = games.apply(platform_sales, axis=1)
  136. games['Percentage_Owners'] = ((games['Global_Sales'] * 1000000) / games['Platforms_Sold']) * 100
  137.  
  138. correlations = games.corr()
  139.  
  140. games['Franchise_Global_Sales'] = games.apply(franchise_sales, axis=1)
  141.  
  142. #plt.clf()
  143. #sns.pairplot(games)
  144. #plt.figure()
  145.  
  146. # TODO
  147. # does platform, good critic and user scores mean good sales?
  148.  
  149. games_copy = games.copy()
  150.  
  151. # THE CORRELATIONS IN PAIRPLOT CAN BE IMPROVED WITH FOLLOWING
  152. # create a some sort of weighted critic+user score overall score, that emphasizes
  153. # the critic score more (based on critic and user count, for example)
  154. # the harder way is to get more scoring data for the games
  155. games['Overall_Score'] = ((games['Critic_Score']/10) + games['User_Score']) / 2
  156. games['Overall_Score_Count'] = games['Critic_Count'] + games['User_Count']
  157.  
  158. games.drop('Critic_Score', axis=1, inplace=True)
  159. games.drop('User_Score', axis=1, inplace=True)
  160. games.drop('Critic_Count', axis=1, inplace=True)
  161. games.drop('User_Count', axis=1, inplace=True)
  162.  
  163. rated_games = games[games['Overall_Score'].notnull()]
  164. rated_games.drop(['JP_Sales'], axis=1, inplace=True)
  165. rated_games.drop(['NA_Sales'], axis=1, inplace=True)
  166. rated_games.drop(['EU_Sales'], axis=1, inplace=True)
  167. rated_games.drop(['Other_Sales'], axis=1, inplace=True)
  168. rated_games.drop(['Rating'], axis=1, inplace=True)
  169. rated_games.drop(['Global_Sales'], axis=1, inplace=True)
  170. rated_games.drop(['Platforms_Sold'], axis=1, inplace=True)
  171.  
  172.  
  173.  
  174. rated_games = rated_games[rated_games['Overall_Score_Count'] >= 50]
  175.  
  176. rated_games.drop(['Overall_Score_Count'], axis=1, inplace=True)
  177.  
  178. some_platforms = ['Wii', 'PS2', 'NES', 'PS3', 'DS', 'PS', 'X360']
  179.  
  180. rated_games = rated_games[rated_games['Platform'].isin(some_platforms)]
  181.  
  182. plt.clf()
  183. sns.pairplot(rated_games, hue='Platform')
  184. plt.figure()
  185.  
  186. # NEW FILE - TESTING OUT DECISION TREES
  187.  
  188. import matplotlib.pyplot as plt
  189. import seaborn as sns
  190.  
  191. iris = sns.load_dataset("iris")
  192.  
  193. plt.clf()
  194. sns.pairplot(iris, hue='species')
  195. plt.figure()
  196.  
  197. from sklearn.datasets import load_iris
  198. from sklearn import tree
  199.  
  200. X, y = load_iris(return_X_y = True)
  201.  
  202. clf = tree.DecisionTreeClassifier()
  203. clf = clf.fit(X, y)
  204.  
  205. tree.plot_tree(clf)
  206.  
  207. iris = load_iris()
  208.  
  209. import graphviz
  210. dot_data = tree.export_graphviz(clf, out_file=None,
  211.                      feature_names=iris['feature_names'],  
  212.                      class_names=iris['target_names'],  
  213.                      filled=True, rounded=True,  
  214.                      special_characters=True)
  215.  
  216. graph = graphviz.Source(dot_data)  
  217. graph.render('iris')
  218.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement