import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer


class LDA:
    def __init__(self, doc_set, num_topic):
        self.alpha = 0.05  # symmetric document-topic prior
        self.beta = 0.01   # symmetric topic-word prior
        self.D = doc_set.D  # number of documents
        self.W = doc_set.W  # vocabulary size
        self.z = [[] for _ in range(self.D)]  # per-token topic assignments
        self.num_topic = num_topic
        self.documents = doc_set.get_documents()
        self.vocabulary = doc_set.get_vocabulary()
        # Count tables for collapsed Gibbs sampling:
        self.nk = [0 for _ in range(self.num_topic)]  # tokens assigned to topic k
        self.nkj = np.zeros((self.D, self.num_topic), dtype=int)  # tokens in doc d assigned to topic k
        self.nwk = np.zeros((self.num_topic, self.W), dtype=int)  # tokens of word w assigned to topic k

    def train(self):
        # Initialization: give every token a uniformly random topic.
        for d in range(self.D):
            for w in self.documents[d]:
                topic = np.random.randint(0, self.num_topic)
                self.z[d].append(topic)
                self._assign(d, w, topic)
        # Collapsed Gibbs sweeps: remove each token's count, resample its
        # topic from the full conditional, then add the count back.
        for i in range(1000):
            for d, (doc, topics) in enumerate(zip(self.documents, self.z)):
                for j, (w, t) in enumerate(zip(doc, topics)):
                    self._forget(d, w, t)
                    topic = self._sample(d, w)
                    self.z[d][j] = topic
                    self._assign(d, w, topic)

    def train_doc_probs(self):
        # Smoothed document-topic proportions; each row sums to 1.
        return preprocessing.normalize(self.nkj + self.alpha, norm='l1')

    def train_topic_probs(self):
        # Smoothed topic-word distributions; each row sums to 1.
        return preprocessing.normalize(self.nwk + self.beta, norm='l1')
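
    # The update implemented by _sample below is the usual collapsed Gibbs
    # conditional for LDA with symmetric priors:
    #   p(z = k | rest) ∝ (nkj[d][k] + alpha) * (nwk[k][w] + beta) / (nk[k] + W * beta)
    # The document-side denominator (len(doc) + num_topic * alpha) is the same
    # for every k, so it cancels in the normalization and is omitted.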
    def _sample(self, d, w):
        p = np.empty(self.num_topic)
        for k in range(self.num_topic):
            p[k] = ((self.nkj[d][k] + self.alpha) * (self.nwk[k][w] + self.beta)) / \
                   (self.nk[k] + self.W * self.beta)
        p /= p.sum()  # normalize to a probability distribution
        return np.nonzero(np.random.multinomial(1, p))[0][0]

    def _assign(self, d, w, t):
        self.nk[t] += 1
        self.nkj[d, t] += 1
        self.nwk[t, w] += 1

    def _forget(self, d, w, t):
        self.nk[t] -= 1
        self.nkj[d, t] -= 1
        self.nwk[t, w] -= 1


class DocumentSet:
    def __init__(self, path):
        # One line of the input file is treated as one document.
        self._vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
        with open(path, encoding='utf-8') as f:
            vector = self._vectorizer.fit_transform(f)
        self._vocabulary = self._vectorizer.get_feature_names_out()
        self.D, self.W = vector.shape
        # Store each document as a list of vocabulary indices. Note that
        # nonzero() yields one entry per distinct word, so repeated
        # occurrences within a document collapse to a single token.
        self._documents = [[] for _ in range(self.D)]
        for d, w in np.transpose(vector.nonzero()):
            self._documents[d].append(w)

    def get_documents(self):
        return self._documents

    def get_vocabulary(self):
        return self._vocabulary


class Multinomial:
    # Small helper around a fixed probability vector (unused in the demo below).
    def __init__(self, mu):
        self.mu = mu

    def p(self, i):
        return self.mu[i]


if __name__ == '__main__':
    ds = DocumentSet('./document.txt')
    lda = LDA(ds, 3)
    lda.train()
    print(lda.train_doc_probs())
    for t, topic_probs in enumerate(lda.train_topic_probs()):
        for word, prob in sorted(zip(ds.get_vocabulary(), topic_probs),
                                 key=lambda x: x[1], reverse=True):
            print(t, word, prob)
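
# Note on the input format (inferred from DocumentSet above, not stated in the
# paste): the file is vectorized line by line, so each line of document.txt is
# one document. A minimal sketch of a toy corpus to run the demo against; the
# file name matches the main block, and the sentences are made-up example data:
#
#     with open('./document.txt', 'w', encoding='utf-8') as f:
#         f.write('apples and bananas are sweet fruit\n')
#         f.write('dogs and cats are playful pets\n')
#         f.write('smoothies blend apples bananas and other fruit\n')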