Advertisement
Guest User

Untitled

a guest
Mar 11th, 2018
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.60 KB | None | 0 0
  # Session setup for the MovieLens / IMDb merge script.
  # NOTE(review): the working directory is hard-coded to one machine; every
  # relative file name read later in this script is resolved against it.
  setwd("D:/HVA/Big Data/individual-movie/ml-20m/")
  getwd()

  library(dplyr)
  library(readr)
  library(DBI)
  library(rJava)
  library(RJDBC)

  # Clean up objects left over from a previous run. Only names that currently
  # exist are handed to rm(), so a fresh session no longer raises one
  # "object not found" warning per missing name.
  rm(list = intersect(
    c(
      "dat", "dataset.g", "dataset", "links", "movies", "ratings",
      "imdb", "imdb_basic", "imdb_data", "imdb_ratings",
      "ratings_filtered", "ratings_per_movie", "final_set", "DF",
      "tmdb_5000_movies"
    ),
    ls()
  ))

  # Import the MovieLens movie and rating tables.
  movies <- read_csv("movies.csv")
  ratings <- read_csv("ratings.csv")

  # Mean rating per movie. The columns are addressed by name: the previous
  # positional index `2-3` evaluated to -1 (drop the first column), which
  # averaged every remaining column and only produced the right result by
  # accident of column order.
  ratings_per_movie <- aggregate(
    ratings["rating"],
    by = list(movieId = ratings$movieId),
    FUN = mean
  )
  ratings_filtered <- ratings_per_movie[, c("movieId", "rating")]

  # Rescale to a base-10 rating (value / 5 * 10), rounded to one decimal.
  ratings_filtered$rating <- round(
    (ratings_filtered$rating / 5) * 10,
    digits = 1
  )

  colnames(ratings_filtered) <- c("movieId", "ratings_movielens")

  # Load the MovieLens link table and coerce its IMDb id to integer so it can
  # be used as an integer join key further down.
  links <- read_csv("links.csv")
  links[["imdbId"]] <- as.integer(links[["imdbId"]])

  # Attach the link ids, then the rescaled mean rating, to every movie.
  dataset <- left_join(
    left_join(movies, links, by = "movieId"),
    ratings_filtered,
    by = "movieId"
  )

  # Read the IMDb exports: the ratings file and the basic title data, of
  # which only the three columns used below are kept.
  imdb_ratings <- read_delim("data.tsv", delim = "\t")
  imdb_basic <- read_delim("imdb-title-data.tsv", delim = "\t")
  imdb_basic <- imdb_basic[, c("tconst", "isAdult", "genres")]

  imdb <- left_join(imdb_basic, imdb_ratings, by = "tconst")

  # Integer id = tconst with its first two characters stripped. It is derived
  # from the joined table's own tconst column (previously imdb_basic$tconst),
  # so the values stay aligned with imdb's rows even if the join changes the
  # row count.
  imdb$imdbId <- as.integer(substring(imdb$tconst, 3))

  # Positional rename: relies on imdb having exactly these six columns in
  # this order after the join.
  colnames(imdb) <- c(
    "tconst", "isAdult", "genres_imdb", "ratings_imdb", "numVotes", "imdbId"
  )
  dataset <- left_join(dataset, imdb, by = "imdbId")

  # TMDb import, left disabled: the dataset was considered too small to use,
  # but the code is kept to show that the join works.
  # tmdb_5000_movies <- read_csv("tmdb_5000_movies.csv")
  # tmdb_5000_movies$tmdbId <- tmdb_5000_movies$id
  # dataset <- left_join(dataset, tmdb_5000_movies, by="tmdbId")

  # Drop the key/helper columns that are no longer needed; names that are
  # not present are simply ignored.
  dataset <- dataset[
    ,
    setdiff(names(dataset), c("tconst", "imdbId", "tmdbId", "isAdult")),
    drop = FALSE
  ]

  # NaN detection for data frames: applies is.nan() to every column of `x`
  # and column-binds the per-column results into one logical matrix whose
  # column names are the names of `x`.
  is.nan.data.frame <- function(x) {
    nan_flags <- lapply(x, is.nan)
    do.call(cbind, nan_flags)
  }

  # Keep only the rows that have no missing value in any column.
  dataset <- dataset[which(complete.cases(dataset)), ]

  # imdb_basic$genres <- strsplit(imdb_basic$genres, "\\,")
  # movies$genres <- strsplit(movies$genres, "\\|")

  # Split the delimited genre strings ("|" for the MovieLens column, "," for
  # the IMDb column). strsplit() returns a list, so both columns become list
  # columns holding one character vector per row.
  dataset[["genres"]] <- strsplit(dataset[["genres"]], split = "\\|")
  dataset[["genres_imdb"]] <- strsplit(dataset[["genres_imdb"]], split = "\\,")

  # Connect to MySQL over JDBC and append the merged dataset to the
  # "dataset" table.
  # NOTE(review): at this point dataset$genres and dataset$genres_imdb are
  # list columns (results of strsplit) -- confirm the JDBC driver can store
  # them before relying on this write.
  drv <- JDBC(
    "com.mysql.jdbc.Driver",
    "D:/HVA/Big Data/mysql-connector-java-5.1.45/mysql-connector-java-5.1.45-bin.jar"
  )

  # NOTE(review): the credentials were hard-coded in this script. They can now
  # be overridden through MOVIES_DB_USER / MOVIES_DB_PASSWORD; the literals
  # remain only as a fallback so existing runs keep working. Rotate the
  # password and remove the fallback.
  conn <- dbConnect(
    drv,
    "jdbc:mysql://localhost/movies",
    Sys.getenv("MOVIES_DB_USER", unset = "root"),
    Sys.getenv("MOVIES_DB_PASSWORD", unset = "3kkDZOE#"),
    useSSL = FALSE
  )

  # The connection is closed even when the write fails; previously an error
  # in dbWriteTable() skipped dbDisconnect() and leaked the connection.
  tryCatch(
    dbWriteTable(
      conn,
      name = "dataset",
      value = dataset,
      append = TRUE,
      row.names = FALSE,
      overwrite = FALSE
    ),
    finally = dbDisconnect(conn)
  )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement