Guest User

Untitled

a guest
Dec 2nd, 2018
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.39 KB | None | 0 0
  1. ---
  2. title: "fianl project"
  3. author: "Xuan Rong"
  4. date: "11/20/2018"
  5. output: html_document
  6. ---
  7.  
  8. ```{r setup, include=FALSE}
  9. knitr::opts_chunk$set(echo = TRUE)
  10. ```
  11.  
  12. ```{r}
  13. require('RPostgreSQL')
  14. drv <- dbDriver('PostgreSQL')
  15. con <- dbConnect(drv, dbname = 'movie', host = '35.196.38.197',
  16. port = 5432, user = 'postgres', password = 'pwd4APAN5310')
  17. ```
  18.  
  19. ```{r}
  20. create_table <- "
  21. create table director(
  22. director_id serial,
  23. director_name varchar(100),
  24. director_facebook_likes integer,
  25. primary key (director_id)
  26. );
  27. create table actor(
  28. actor_id serial,
  29. actor_name varchar(100),
  30. actor_facebook_likes integer,
  31. primary key(actor_id)
  32. );
  33. create table plot_keyword(
  34. keyword_id serial,
  35. keyword_name varchar(200),
  36. primary key (keyword_id)
  37. );
  38. create table genre(
  39. genres_id serial,
  40. genres_name varchar(50),
  41. primary key(genres_id)
  42. );
  43. create table movie(
  44. movie_id serial,
  45. movie_title varchar(100) not null,
  46. director_id serial,
  47. color varchar(50),
  48. language varchar(20),
  49. country varchar(50),
  50. content_rating varchar(10),
  51. duration numeric(5,2),
  52. number_user_for_review integer,
  53. num_critic_for_reviews integer,
  54. num_voted_users integer,
  55. movie_facebook_likes integer,
  56. cast_total_facebook_likes integer,
  57. imdb_score numeric(3,1),
  58. facenumber_in_poster integer,
  59. title_year numeric(4,0),
  60. aspect_ratio numeric(4,2),
  61. movie_imdb_link varchar(320),
  62. gross numeric(13,2),
  63. budget numeric(13,2),
  64. primary key (movie_id),
  65. foreign key (director_id) references director (director_id)
  66. );
  67. create table movie_keyword(
  68. movie_id serial,
  69. keyword_id serial,
  70. primary key (movie_id, keyword_id),
  71. foreign key (movie_id) references movie (movie_id),
  72. foreign key (keyword_id) references plot_keyword (keyword_id)
  73. );
  74. create table movie_genre(
  75. movie_id serial,
  76. genres_id serial,
  77. primary key (movie_id, genres_id),
  78. foreign key (movie_id) references movie (movie_id),
  79. foreign key (genres_id) references genre (genres_id)
  80. );
  81. create table movie_actor(
  82. movie_id serial,
  83. actor_id serial,
  84. primary key (movie_id, actor_id),
  85. foreign key (movie_id) references movie (movie_id),
  86. foreign key (actor_id) references actor (actor_id)
  87. );"
  88.  
  89. dbGetQuery(con, create_table)
  90. ```
  91.  
  92. ```{r}
  93. df <- read.csv(file = 'movie_metadata.csv',header = TRUE,stringsAsFactors = FALSE)
  94. attach(df)
  95. df
  96. ```
  97.  
  98. ```{r}
  99. #Reshape by actors
  100. actor_name <- data.frame(actor_1_name,actor_2_name,actor_3_name)
  101. actor_name <- data.frame(actor_name = c(t(actor_name)))
  102.  
  103. actor_facebook_likes <- data.frame(actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes)
  104. actor_facebook_likes <- c(t(actor_facebook_likes))
  105.  
  106. df1 <- data.frame(df[rep(seq_len(nrow(df)), each=3),], actor_name,actor_facebook_likes)
  107.  
  108. #Reshape by genres and keywords
  109. #install.packages('tidyr')
  110. library(tidyr)
  111. df2 <- df1 %>% separate_rows(genres, sep = '\\|')
  112. df3 <- df2 %>% separate_rows(plot_keywords, sep = '\\|')
  113.  
  114. df3
  115. ```
  116.  
  117. ```{r}
  118. #Setting
  119. #install.packages('data.table')
  120. library(data.table)
  121. setDT(df3)[, movie_id := .GRP, by = movie_title]
  122. setDT(df3)[, director_id := .GRP, by = director_name]
  123. setDT(df3)[, genres_id := .GRP, by = genres]
  124. setDT(df3)[, keyword_id := .GRP, by = plot_keywords]
  125. setDT(df3)[, actor_id := .GRP, by = actor_name]
  126. df3
  127. ```
  128.  
  129. ```{r}
  130. names(df3)
  131. ```
  132.  
  133. ```{r}
  134. actor <- subset(df3, select=c('actor_id','actor_name','actor_facebook_likes'))
  135. actor <- actor[!duplicated(actor$actor_id),]
  136. actor
  137.  
  138. dbWriteTable(con, 'actor', actor, row.names=FALSE, append=TRUE)
  139. ```
  140.  
  141. ```{r}
  142. genre <- subset(df3, select=c('genres_id','genres'))
  143. genre <- genre[!duplicated(genre),]
  144. names(genre) <- c('genres_id','genres_name')
  145.  
  146. dbWriteTable(con, 'genre', genre, row.names=FALSE, append=TRUE)
  147. ```
  148.  
  149. ```{r}
  150. plot_keyword <- subset(df3, select=c('keyword_id','plot_keywords'))
  151. plot_keyword <- plot_keyword[!duplicated(plot_keyword),]
  152. names(plot_keyword) <- c('keyword_id','keyword_name')
  153.  
  154. dbWriteTable(con, 'plot_keyword', plot_keyword, row.names=FALSE, append=TRUE)
  155. ```
  156.  
  157. ```{r}
  158. director <- subset(df3, select=c('director_id','director_name','director_facebook_likes'))
  159. director <- director[!duplicated(director_id),]
  160. director
  161.  
  162. dbWriteTable(con, 'director', director, row.names=FALSE, append=TRUE)
  163. ```
  164. ```{r}
  165. df3
  166. ```
  167.  
  168.  
  169. ```{r}
  170. movie <- subset(df3, select=c('movie_id','movie_title','director_id','color','language','country','content_rating','duration','num_user_for_reviews','num_critic_for_reviews','num_voted_users','movie_facebook_likes','cast_total_facebook_likes','imdb_score','facenumber_in_poster','title_year','aspect_ratio','movie_imdb_link','gross','budget'))
  171. movie <- movie[!duplicated(movie_id),]
  172. names(movie) <- c('movie_id','movie_title','director_id','color','language','country','content_rating','duration','number_user_for_review','num_critic_for_reviews','num_voted_users','movie_facebook_likes','cast_total_facebook_likes','imdb_score','facenumber_in_poster','title_year','aspect_ratio','movie_imdb_link','gross','budget')
  173. movie
  174.  
  175. dbWriteTable(con, 'movie', movie, row.names=FALSE, append=TRUE)
  176. ```
  177.  
  178. ```{r}
  179. movie_actor <- subset(df3, select=c('movie_id','actor_id'))
  180. movie_actor <- movie_actor[!duplicated(movie_actor),]
  181. movie_actor
  182.  
  183. dbWriteTable(con, 'movie_actor', movie_actor, row.names=FALSE, append=TRUE)
  184. ```
  185.  
  186. ```{r}
  187. movie_genre <- subset(df3, select=c('movie_id','genres_id'))
  188. movie_genre <- movie_genre[!duplicated(movie_genre),]
  189. movie_genre
  190.  
  191. dbWriteTable(con, 'movie_genre', movie_genre, row.names=FALSE, append=TRUE)
  192. ```
  193.  
  194. ```{r}
  195. movie_keyword <- subset(df3, select=c('movie_id','keyword_id'))
  196. movie_keyword <- movie_keyword[!duplicated(movie_keyword),]
  197. movie_keyword
  198.  
  199. dbWriteTable(con, 'movie_keyword', movie_keyword, row.names=FALSE, append=TRUE)
  200. ```
Add Comment
Please, Sign In to add comment