Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Run the script from the MovieLens 20M folder so that the relative file
# names used by the imports below resolve, then report the active directory.
setwd(dir = "D:/HVA/Big Data/individual-movie/ml-20m/")
getwd()
- library(dplyr)
- library(readr)
- library(DBI)
- library(rJava)
- library(RJDBC)
# Clean up variables left over from a previous run.
# Only names that currently exist are removed: calling rm() on a missing
# object emits one warning per name, which floods a fresh session.
rm(list = intersect(
  ls(),
  c(
    "dat", "dataset.g", "dataset", "links", "movies", "ratings", "imdb",
    "imdb_basic", "imdb_data", "imdb_ratings", "ratings_filtered",
    "ratings_per_movie", "final_set", "DF", "tmdb_5000_movies"
  )
))
# Load the MovieLens movie list and the user ratings from the working
# directory.
movies <- read_csv(file = "movies.csv")
ratings <- read_csv(file = "ratings.csv")
# Mean rating per movie.
# The columns are selected by name: the previous `ratings[, 2 - 3]` evaluated
# to `ratings[, -1]`, which merely dropped the first column and also averaged
# the timestamp column for no purpose.
ratings_per_movie <- aggregate(
  ratings[, c("movieId", "rating")],
  list(ratings$movieId),
  mean
)
ratings_filtered <- ratings_per_movie[, c("movieId", "rating")]
# Rescale from the 5-point scale to a 10-point scale, rounded to 1 decimal.
ratings_filtered$rating <- round((ratings_filtered$rating / 5) * 10, digits = 1)
colnames(ratings_filtered) <- c("movieId", "ratings_movielens")
# Load the MovieLens link table and coerce its IMDb id to integer so it can
# be used as a join key later on.
links <- read_csv(file = "links.csv")
links[["imdbId"]] <- as.integer(links[["imdbId"]])
# Attach the links and the per-movie MovieLens rating to every movie,
# keeping all rows of `movies`.
dataset <- left_join(
  left_join(movies, links, by = "movieId"),
  ratings_filtered,
  by = "movieId"
)
# Read the IMDb data: the ratings file and the title file, both
# tab-separated. Only three columns of the title file are kept.
imdb_ratings <- read_delim("data.tsv", delim = "\t")
imdb_basic <- read_delim("imdb-title-data.tsv", delim = "\t")[
  , c("tconst", "isAdult", "genres")
]
imdb <- left_join(imdb_basic, imdb_ratings, by = "tconst")
# Build the integer id from the joined table's own key by dropping the first
# two characters of tconst. Reading it from `imdb_basic` instead only works
# while the join returns exactly one row per title; if the ratings file ever
# holds a repeated tconst the lengths would no longer match.
imdb$imdbId <- as.integer(substring(imdb$tconst, 3))
# Positional rename.
# NOTE(review): assumes data.tsv contributes exactly two columns besides
# tconst, rating first and vote count second - confirm against the file.
colnames(imdb) <- c(
  "tconst", "isAdult", "genres_imdb", "ratings_imdb", "numVotes", "imdbId"
)
dataset <- left_join(dataset, imdb, by = "imdbId")
# TMDB import, currently disabled: the dataset is too small to be useful,
# but the code is kept to show that the join works.
# tmdb_5000_movies <- read_csv("tmdb_5000_movies.csv")
# tmdb_5000_movies$tmdbId <- tmdb_5000_movies$id
# dataset <- left_join(dataset, tmdb_5000_movies, by="tmdbId")
# Clean the dataset: drop the columns that are not used any further.
for (unused_column in c("tconst", "imdbId", "tmdbId", "isAdult")) {
  dataset[[unused_column]] <- NULL
}
rm(unused_column)
# is.nan() variant for data frames: runs is.nan() on every column of `x` and
# column-binds the per-column results, so the return value is a logical
# matrix with one column per input column. It only detects NaN values; it
# does not replace them. Not called anywhere in the visible part of the
# script.
is.nan.data.frame <- function(x) {
  column_flags <- lapply(x, is.nan)
  do.call(cbind, column_flags)
}
# Keep only the rows in which no column is missing.
dataset <- dataset[complete.cases(dataset), ]
# imdb_basic$genres <- strsplit(imdb_basic$genres, "\\,")
# movies$genres <- strsplit(movies$genres, "\\|")
# Turn the delimited genre strings into one character vector per row:
# the `genres` column is split on "|" and the `genres_imdb` column on ",".
dataset[["genres"]] <- strsplit(dataset[["genres"]], split = "\\|")
dataset[["genres_imdb"]] <- strsplit(dataset[["genres_imdb"]], split = "\\,")
# Connect to MySQL through JDBC and append the prepared dataset to the
# `dataset` table of the `movies` database.
drv <- JDBC(
  "com.mysql.jdbc.Driver",
  "D:/HVA/Big Data/mysql-connector-java-5.1.45/mysql-connector-java-5.1.45-bin.jar"
)
# NOTE(review): the password used to live only in the source. It can now be
# supplied through the MYSQL_PASSWORD environment variable; the old literal
# remains as the fallback so existing runs keep working. It should be rotated
# and removed from the script.
conn <- dbConnect(
  drv,
  "jdbc:mysql://localhost/movies",
  "root",
  Sys.getenv("MYSQL_PASSWORD", unset = "3kkDZOE#"),
  useSSL = FALSE
)
# `finally` closes the connection even when the write fails, so a failed
# import no longer leaves an open connection behind.
tryCatch(
  dbWriteTable(
    conn,
    name = "dataset",
    value = dataset,
    append = TRUE,
    row.names = FALSE,
    overwrite = FALSE
  ),
  finally = dbDisconnect(conn)
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement