Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Explore a games dataset and fit regression models that predict each game's
# "average_rating" from the remaining columns of ./games.csv.

# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Columns that must never be used as model features: the target itself, a
# second rating column, and the identifier/label columns.
EXCLUDED_COLUMNS = ("bayes_average_rating", "average_rating", "type", "name", "id")

# Load the data
FILE_PATH = './games.csv'
games = pd.read_csv(FILE_PATH)

# Print the column names and the shape of the games dataframe
print(games.columns)
print(games.shape)

# Make a histogram of all the ratings in the average_rating column
plt.hist(games["average_rating"])
plt.show()

# Print the first row of all the games with zero scores
print(games[games['average_rating'] == 0].iloc[0])

# Print the first row of games with scores greater than 0
print(games[games['average_rating'] > 0].iloc[0])

# Remove any rows without user reviews
games = games[games['users_rated'] > 0]

# Remove any rows with missing values
games = games.dropna(axis=0)

# Make a histogram of all the average ratings after cleaning
plt.hist(games['average_rating'])
plt.show()

# Correlation matrix.
# Only numeric/bool columns can be correlated. Selecting them explicitly keeps
# this working on pandas >= 2.0, where DataFrame.corr() raises on non-numeric
# columns (presumably "type" and "name" here) instead of silently dropping them.
numeric_games = games.select_dtypes(include=["number", "bool"])
corrmat = numeric_games.corr()
fig = plt.figure(figsize=(12, 9))

# Plot using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()

# Get all the columns from the dataframe
columns = games.columns.tolist()

# Filter the columns to remove data we do not want
columns = [c for c in columns if c not in EXCLUDED_COLUMNS]

# Store the variable we'll be predicting on
target = "average_rating"

# Generate training and test datasets (80/20 split, fixed seed for repeatability)
train_X, test_X, train_Y, test_Y = train_test_split(
    games[columns],
    games[target],
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

# Print shapes
print(train_X.shape)
print(test_X.shape)

# Initialize the linear regression model
LR = LinearRegression()

# Fit the model to the training data
LR.fit(train_X, train_Y)

# Generate predictions for the test set
predictions = LR.predict(test_X)

# Compute the error between our test predictions and the actual values.
# The result is printed because a bare expression is discarded when this runs
# as a script; arguments follow the documented (y_true, y_pred) order.
print(mean_squared_error(test_Y, predictions))

# Initialize the random forest model
RFR = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)

# Fit the model to the training data
RFR.fit(train_X, train_Y)
Advertisement
Add Comment
Please, Sign In to add comment