"""Explore ``games.csv`` and compare two regressors for ``average_rating``.

The script runs top to bottom and:

1. loads the CSV and prints its columns and shape;
2. plots histograms of ``average_rating`` before and after cleaning;
3. drops rows with no user ratings and rows with missing values;
4. plots a correlation heatmap of the numeric columns;
5. fits a linear regression and a random forest on an 80/20 split and
   prints the mean squared error of each on the test set;
6. prints both models' predictions for one test row next to its actual value.
"""

# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the data
FILE_PATH = './games.csv'
games = pd.read_csv(FILE_PATH)

# Print the names of the columns & the shape of games
print(games.columns)
print(games.shape)

# Make a histogram of all the ratings in the average_rating column
plt.hist(games["average_rating"])
plt.show()

# Print the first row of all the games with zero scores
print(games[games['average_rating'] == 0].iloc[0])

# Print the first row of games with scores greater than 0
print(games[games['average_rating'] > 0].iloc[0])

# Remove any rows without user reviews
games = games[games['users_rated'] > 0]

# Remove any rows with missing values
games = games.dropna(axis=0)

# Make a histogram of all the average ratings (after cleaning)
plt.hist(games['average_rating'])
plt.show()

# Correlation matrix.
# Only numeric columns are correlated: calling DataFrame.corr() on a frame
# that still contains string columns raises on pandas >= 2.0, whereas older
# versions silently skipped them. Selecting the numeric columns explicitly
# behaves the same on both.
corrmat = games.select_dtypes(include="number").corr()
plt.figure(figsize=(12, 9))

# Plot using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()

# Get all the columns from the dataframe
columns = games.columns.tolist()

# Filter the columns to remove data we do not want: the target itself, the
# other rating column, and the "type"/"name"/"id" columns.
columns = [c for c in columns if c not in ["bayes_average_rating", "average_rating", "type", "name", "id"]]

# Store the variable we'll be predicting on
target = "average_rating"

# Generate training and test datasets (fixed seed for a reproducible split)
train_X, test_X, train_Y, test_Y = train_test_split(
    games[columns],
    games[target],
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

# Print shapes
print(train_X.shape)
print(test_X.shape)

# Initialize the linear regression model
LR = LinearRegression()

# Fit the model to the training data
LR.fit(train_X, train_Y)

# Generate predictions for the test set
predictions = LR.predict(test_X)

# Compute the error between our test predictions and the actual values.
# The result is printed; a bare call would discard it when run as a script.
lr_mse = mean_squared_error(test_Y, predictions)
print("Linear regression MSE:", lr_mse)

# Initialize the random forest model
RFR = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)

# Fit the model to the training data
RFR.fit(train_X, train_Y)

# Generate predictions for the test set
predictions = RFR.predict(test_X)

# Compute the error between our test predictions and the actual values
rfr_mse = mean_squared_error(test_Y, predictions)
print("Random forest MSE:", rfr_mse)

# Show the features and the actual rating of the test row predicted below
print(test_X.iloc[1])
print(test_Y.iloc[1])

# Make a prediction with both models for that single row.
# iloc[[1]] keeps a one-row DataFrame, so the models receive the same
# feature names they were fitted with.
sample_row = test_X.iloc[[1]]
rating_LR = LR.predict(sample_row)
rating_RFR = RFR.predict(sample_row)

# Print out the predictions
print(rating_LR)
print(rating_RFR)