dupa_kerasowa

a guest
Jan 21st, 2019
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.66 KB | None | 0 0

library(data.table)
library(tidytext)
library(magrittr)
library(dplyr)
library(keras) # the functions used below (text_tokenizer, layer_*, keras_model) come from the keras package, not kerasR

options(scipen = 999)

train <- fread('../input/train.csv', data.table = FALSE)
test <- fread('../input/test.csv', data.table = FALSE)

max_words <- 15000 # Maximum number of words to consider as features
maxlen <- 64 # Text cutoff after n words

# Prepare to tokenize the text

texts <- train$question_text

tokenizer <- text_tokenizer(num_words = max_words) %>%
  fit_text_tokenizer(texts)

# Tokenize - i.e. convert text into a sequence of integers

sequences <- texts_to_sequences(tokenizer, texts)
word_index <- tokenizer$word_index

# Pad out texts so everything is the same length

data <- pad_sequences(sequences, maxlen = maxlen)
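
# Quick illustrative check: push a toy sentence through the same tokenizer +
# padding to see the integer encoding. The exact integers depend on the
# fitted word_index, so output will vary.
toy_seq <- texts_to_sequences(tokenizer, c("How do I learn deep learning?"))
pad_sequences(toy_seq, maxlen = maxlen) # 1 x 64 matrix, zero-padded on the left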

# Only the train questions were tokenized above, so `data` already holds
# exactly nrow(train) rows; the subset below just makes that explicit

train_matrix <- data[1:nrow(train),]

# Prepare training labels

labels <- train$target

# Prepare a validation set

set.seed(1337)

training_samples <- floor(nrow(train_matrix) * 0.90) # floor() so the index ranges below are whole numbers
validation_samples <- nrow(train_matrix) - training_samples

indices <- sample(1:nrow(train_matrix))
training_indices <- indices[1:training_samples]
validation_indices <- indices[(training_samples + 1):(training_samples + validation_samples)]

x_train <- train_matrix[training_indices,]
y_train <- labels[training_indices]

x_val <- train_matrix[validation_indices,]
y_val <- labels[validation_indices]

# Training dimensions and class balance

dim(x_train)
table(y_train)
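
# The target is heavily imbalanced, so binary accuracy alone is an optimistic
# metric. A hedged option, not wired into fit() below, is to up-weight the
# rare positive class; keras's fit() accepts a class_weight named list.
pos_weight <- sum(y_train == 0) / sum(y_train == 1)
class_weights <- list("0" = 1, "1" = pos_weight)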

# Load the fastText wiki-news vectors; the first line of a .vec file is a
# "word_count dimension" header, so it is dropped before parsing

lines <- readLines('../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec')

fastwiki_embeddings_index <- new.env(hash = TRUE, parent = emptyenv())

lines <- lines[2:length(lines)]

pb <- txtProgressBar(min = 0, max = length(lines), style = 3)
for (i in 1:length(lines)){
  line <- lines[[i]]
  values <- strsplit(line, " ")[[1]]
  word <- values[[1]]
  fastwiki_embeddings_index[[word]] <- as.double(values[-1])
  setTxtProgressBar(pb, i)
}
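
# Parsing ~1M lines with strsplit() in an R loop is slow. A hedged alternative
# sketch (not executed here, hence the if (FALSE) guard): let
# data.table::fread() split the file in one pass. Assumes the words
# themselves contain no spaces.
if (FALSE) {
  emb <- fread('../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
               skip = 1, header = FALSE, quote = "", sep = " ")
  emb_words <- emb[[1]]
  emb_vectors <- as.matrix(emb[, -1, with = FALSE])
}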

# Create our embedding matrix

fastwiki_embedding_dim <- 300
fastwiki_embedding_matrix <- array(0, c(max_words, fastwiki_embedding_dim))

for (word in names(word_index)){
  index <- word_index[[word]]
  if (index < max_words){
    # +1 because Keras reserves index 0 for padding, while R rows start at 1;
    # words without a pretrained vector are left as all zeros
    fastwiki_embedding_vector <- fastwiki_embeddings_index[[word]]
    if (!is.null(fastwiki_embedding_vector))
      fastwiki_embedding_matrix[index + 1,] <- fastwiki_embedding_vector
  }
}

gc()
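
# Rough sanity check: fraction of embedding rows actually filled. Rows left
# all-zero are out-of-vocabulary words (plus row 1, the padding index).
mean(rowSums(abs(fastwiki_embedding_matrix)) > 0)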

# Setup input

input <- layer_input(
  shape = list(NULL),
  dtype = "int32",
  name = "input"
)

# Model layers

embedding <- input %>%
  layer_embedding(input_dim = max_words, output_dim = fastwiki_embedding_dim, name = "embedding")

lstm <- embedding %>%
  layer_lstm(units = maxlen, dropout = 0.25, recurrent_dropout = 0.25,
             return_sequences = FALSE, name = "lstm")

dense <- lstm %>%
  layer_dense(units = 128, activation = "relu", name = "dense")

predictions <- dense %>%
  layer_dense(units = 1, activation = "sigmoid", name = "predictions")

# Bring model together

model <- keras_model(input, predictions)

# Freeze the embedding weights initially to prevent updates propagating back through and ruining our embedding

get_layer(model, name = "embedding") %>%
  set_weights(list(fastwiki_embedding_matrix)) %>%
  freeze_weights()
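
# A common follow-up once the frozen-embedding training below has converged
# (sketch only, not executed here, and assuming the 2019-era keras API where
# optimizer_adam() takes lr): unfreeze the embedding and fine-tune at a low
# learning rate. Freezing changes only take effect after recompiling.
if (FALSE) {
  unfreeze_weights(get_layer(model, name = "embedding"))
  model %>% compile(
    optimizer = optimizer_adam(lr = 1e-4),
    loss = "binary_crossentropy",
    metrics = "binary_accuracy"
  )
}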

# Compile

model %>% compile(
  optimizer = optimizer_adam(),
  loss = "binary_crossentropy",
  metrics = "binary_accuracy"
)

# Print architecture (plot_model isn't implemented in the R package yet)

print(model)

# Train model

history <- model %>% fit(
  x_train,
  y_train,
  batch_size = 2048,
  validation_data = list(x_val, y_val),
  epochs = 35,
  view_metrics = FALSE,
  verbose = 0
)

# Look at training results

print(history)
plot(history)
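
# The test set loaded at the top is never scored above. A hedged inference
# sketch, assuming test.csv has qid and question_text columns (the Quora
# competition format) and using an untuned 0.5 cutoff; since the competition
# metric is F1, the threshold could instead be tuned on (x_val, y_val).
test_sequences <- texts_to_sequences(tokenizer, test$question_text)
test_matrix <- pad_sequences(test_sequences, maxlen = maxlen)
preds <- predict(model, test_matrix)
submission <- data.frame(qid = test$qid, prediction = as.integer(preds > 0.5))
write.csv(submission, 'submission.csv', row.names = FALSE)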