Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Flat script: load a tab-separated dataset, keep the rows whose second
# column equals 1, build a bag-of-words matrix from the first column, and
# factorize that matrix with NMF, printing the shape of the result.
import pandas as pd
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): read_csv infers the header from the first line by default, so
# the first line of the file is consumed as column names and then overwritten
# below -- confirm the file really starts with a header row.
data = pd.read_csv('dataset.tsv.txt', sep='\t')
data.columns = ['a', 'b']

# Keep only the rows flagged with b == 1, then reduce to the raw values of
# column 'a' (presumably the document text -- it is what gets vectorized).
data = data[data.b == 1]
data = data.a.values

# Tokens are runs of three or more ASCII letters/hyphens. The pattern is a
# raw string so that `\-` reaches the regex engine unchanged instead of being
# an invalid string escape (a SyntaxWarning on recent Python versions).
vectorizer = CountVectorizer(
    min_df=5,              # drop terms found in fewer than 5 documents
    max_df=0.9,            # drop terms found in more than 90% of documents
    stop_words='english',
    lowercase=True,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)
data_vectorized = vectorizer.fit_transform(data)

# NOTE(review): 5000 components is unusually large for a topic model and will
# be expensive to fit -- confirm this value is intended.
nmf_model = NMF(n_components=5000)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement