Advertisement
Guest User

Untitled

a guest
Mar 26th, 2019
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.65 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
  3. from sklearn.feature_extraction.text import CountVectorizer
  4. data = pd.read_csv('dataset.tsv.txt', sep='\t')
  5. data.columns = ['a','b']
  6. data = data[data.b==1]
  7. data = data.a.values
  8. vectorizer = CountVectorizer(min_df=5, max_df=0.9,
  9. stop_words='english', lowercase=True,
  10. token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
  11.  
  12. data_vectorized = vectorizer.fit_transform(data)
  13. nmf_model = NMF(n_components=5000)
  14. nmf_Z = nmf_model.fit_transform(data_vectorized)
  15. print(nmf_Z.shape) # (NO_DOCUMENTS, NO_TOPICS)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement