# Untitled

# import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import  train_test_split

# Read the games dataset from a CSV path relative to the working directory
FILE_PATH = './games.csv'
games = pd.read_csv(FILE_PATH)

# Show the column labels first, then the (rows, columns) shape of the frame
for summary in (games.columns, games.shape):
    print(summary)

# Plot the distribution of the values in the average_rating column
ratings = games["average_rating"]
plt.hist(ratings)
plt.show()

# Show the first row among the games whose average rating is exactly zero
zero_rated = games[ratings == 0]
print(zero_rated.iloc[0])

# Show the first row among the games whose average rating is greater than zero
positively_rated = games[ratings > 0]
print(positively_rated.iloc[0])

# Keep only rows whose users_rated value is positive (i.e. drop games without
# user reviews), then discard every row that still contains a missing value.
has_reviews = games['users_rated'] > 0
games = games.loc[has_reviews].dropna(axis=0)

# Re-plot the average_rating distribution for the cleaned data
cleaned_ratings = games['average_rating']
plt.hist(cleaned_ratings)
plt.show()

# Correlation matrix
# Restrict to numeric columns before correlating: pandas 2.0+ no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead. The
# frame presumably still holds text columns here (e.g. "type", "name").
corrmat = games.select_dtypes(include="number").corr()
fig = plt.figure(figsize=(12, 9))

# Plot using seaborn; vmax caps the colour scale at 0.8 and square=True
# draws each cell as a square.
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()

# Name of the column the model will learn to predict
target = "average_rating"

# Feature columns: every column of the dataframe except the target itself and
# the columns we do not want to feed to the model.
excluded = {"bayes_average_rating", "average_rating", "type", "name", "id"}
columns = [label for label in games.columns.tolist() if label not in excluded]

# Split features and target into training and test sets with a fixed seed so
# the partition is reproducible.
features = games[columns]
labels = games[target]
train_X, test_X, train_Y, test_Y = train_test_split(
    features,
    labels,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

# Report the shape of each feature partition
for partition in (train_X, test_X):
    print(partition.shape)

# Import linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Initialize the model class
LR = LinearRegression()

# Fit the model to the training data
LR.fit(train_X, train_Y)

# Generate predictions for the test set
predictions = LR.predict(test_X)

# Compute the error between the actual test values and our predictions.
# scikit-learn's signature is (y_true, y_pred); the result is stored and
# printed because a bare expression is discarded when run as a script.
mse = mean_squared_error(test_Y, predictions)
print(mse)