Advertisement
Guest User

Untitled

a guest
Nov 30th, 2015
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.75 KB | None | 0 0
  1. import org.deeplearning4j.models.embeddings.WeightLookupTable;
  2. import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable;
  3. import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
  4. import org.deeplearning4j.models.word2vec.Word2Vec;
  5. import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
  6. import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
  7. import org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator;
  8. import org.deeplearning4j.text.sentenceiterator.LineSentenceIterator;
  9.  
  10. import org.deeplearning4j.text.sentenceiterator.SentencePreProcessor;
  11. import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
  12. import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
  13. import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
  14. import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
  15. import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.EndingPreProcessor;
  16.  
  17. import org.slf4j.Logger;
  18. import org.slf4j.LoggerFactory;
  19. import java.io.File;
  20. //import org.springframework.core.io.ClassPathResource;
  21.  
  22. import java.util.ArrayList;
  23. import java.util.Collection;
  24.  
  25. public class w2v {
  26.  
  27. private static Logger log = LoggerFactory.getLogger(w2v.class);
  28.  
  29.  
  30. public static void main(String[] args) throws Exception {
  31.  
  32. String filePath = "/home/xyang/workspace/resources/raw_sentences.txt";
  33.  
  34. log.info("Load & Vectorize Sentences....");
  35. // Strip white space before and after for each line
  36. SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
  37. // Split on white spaces in the line to get words
  38. TokenizerFactory t = new DefaultTokenizerFactory();
  39. t.setTokenPreProcessor(new CommonPreprocessor());
  40.  
  41. InMemoryLookupCache cache = new InMemoryLookupCache();
  42. WeightLookupTable table = new InMemoryLookupTable.Builder()
  43. .vectorLength(100)
  44. .useAdaGrad(false)
  45. .cache(cache)
  46. .lr(0.025f).build();
  47.  
  48. log.info("Building model....");
  49. Word2Vec vec = new Word2Vec.Builder()
  50. .minWordFrequency(5).iterations(1)
  51. .layerSize(100).lookupTable(table)
  52. .stopWords(new ArrayList<String>())
  53. .vocabCache(cache).seed(42)
  54. .windowSize(5).iterate(iter).tokenizerFactory(t).build();
  55.  
  56. log.info("Fitting Word2Vec model....");
  57. vec.fit();
  58.  
  59. log.info("Writing word vectors to text file....");
  60. // Write word
  61. WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt");
  62.  
  63. log.info("Closest Words:");
  64. Collection<String> lst = vec.wordsNearest("day", 10);
  65. System.out.println(lst);
  66. }
  67. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement