import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer


class LDA:
    def __init__(self, doc_set, num_topic):
        self.alpha = 0.05  # symmetric document-topic prior
        self.beta = 0.01   # symmetric topic-word prior
        self.D = doc_set.D  # number of documents
        self.W = doc_set.W  # vocabulary size
        self.z = [[] for _ in range(self.D)]  # per-token topic assignments
        self.num_topic = num_topic
        self.documents = doc_set.get_documents()
        self.vocabulary = doc_set.get_vocabulary()
        # Count tables for collapsed Gibbs sampling:
        self.nk = [0 for _ in range(self.num_topic)]  # tokens assigned to topic k
        self.nkj = np.zeros((self.D, self.num_topic), dtype=int)  # tokens in doc d assigned to topic k
        self.nwk = np.zeros((self.num_topic, self.W), dtype=int)  # tokens of word w assigned to topic k

    def train(self):
        # Initialization: give every token a uniformly random topic.
        for d in range(self.D):
            for w in self.documents[d]:
                topic = np.random.randint(0, self.num_topic)
                self.z[d].append(topic)
                self._assign(d, w, topic)
        # Collapsed Gibbs sweeps: remove each token's count, resample its
        # topic from the full conditional, then add the count back.
        for i in range(1000):
            for d, (doc, topics) in enumerate(zip(self.documents, self.z)):
                for j, (w, t) in enumerate(zip(doc, topics)):
                    self._forget(d, w, t)
                    topic = self._sample(d, w)
                    self.z[d][j] = topic
                    self._assign(d, w, topic)

    def train_doc_probs(self):
        # Smoothed document-topic proportions; each row sums to 1.
        return preprocessing.normalize(self.nkj + self.alpha, norm='l1')

    def train_topic_probs(self):
        # Smoothed topic-word distributions; each row sums to 1.
        return preprocessing.normalize(self.nwk + self.beta, norm='l1')
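
    # The update implemented by _sample below is the usual collapsed Gibbs
    # conditional for LDA with symmetric priors:
    #   p(z = k | rest) ∝ (nkj[d][k] + alpha) * (nwk[k][w] + beta) / (nk[k] + W * beta)
    # The document-side denominator (len(doc) + num_topic * alpha) is the same
    # for every k, so it cancels in the normalization and is omitted.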
    def _sample(self, d, w):
        p = np.empty(self.num_topic)
        for k in range(self.num_topic):
            p[k] = ((self.nkj[d][k] + self.alpha) * (self.nwk[k][w] + self.beta)) / \
                   (self.nk[k] + self.W * self.beta)
        p /= p.sum()  # normalize to a probability distribution
        return np.nonzero(np.random.multinomial(1, p))[0][0]

    def _assign(self, d, w, t):
        self.nk[t] += 1
        self.nkj[d, t] += 1
        self.nwk[t, w] += 1

    def _forget(self, d, w, t):
        self.nk[t] -= 1
        self.nkj[d, t] -= 1
        self.nwk[t, w] -= 1


class DocumentSet:
    def __init__(self, path):
        # One line of the input file is treated as one document.
        self._vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
        with open(path, encoding='utf-8') as f:
            vector = self._vectorizer.fit_transform(f)
        self._vocabulary = self._vectorizer.get_feature_names_out()
        self.D, self.W = vector.shape
        # Store each document as a list of vocabulary indices. Note that
        # nonzero() yields one entry per distinct word, so repeated
        # occurrences within a document collapse to a single token.
        self._documents = [[] for _ in range(self.D)]
        for d, w in np.transpose(vector.nonzero()):
            self._documents[d].append(w)

    def get_documents(self):
        return self._documents

    def get_vocabulary(self):
        return self._vocabulary


class Multinomial:
    # Small helper around a fixed probability vector (unused in the demo below).
    def __init__(self, mu):
        self.mu = mu

    def p(self, i):
        return self.mu[i]


if __name__ == '__main__':
    ds = DocumentSet('./document.txt')
    lda = LDA(ds, 3)
    lda.train()
    print(lda.train_doc_probs())
    for t, topic_probs in enumerate(lda.train_topic_probs()):
        for word, prob in sorted(zip(ds.get_vocabulary(), topic_probs),
                                 key=lambda x: x[1], reverse=True):
            print(t, word, prob)
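
# Note on the input format (inferred from DocumentSet above, not stated in the
# paste): the file is vectorized line by line, so each line of document.txt is
# one document. A minimal sketch of a toy corpus to run the demo against; the
# file name matches the main block, and the sentences are made-up example data:
#
#     with open('./document.txt', 'w', encoding='utf-8') as f:
#         f.write('apples and bananas are sweet fruit\n')
#         f.write('dogs and cats are playful pets\n')
#         f.write('smoothies blend apples bananas and other fruit\n')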