Guest User

Untitled

a guest
Nov 20th, 2017
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.03 KB | None | 0 0
  1. import numpy as np
  2. from sklearn import preprocessing
  3. from sklearn.feature_extraction.text import CountVectorizer
  4.  
  5. class LDA:
  6. def __init__(self, doc_set, num_topic):
  7. self.alpha = 0.05
  8. self.beta = 0.01
  9. self.D = doc_set.D
  10. self.W = doc_set.W
  11. self.z = [[] for _ in range(self.D)]
  12. self.num_topic = num_topic
  13. self.documents = doc_set.get_documents()
  14. self.vocabulary = doc_set.get_vocabulary()
  15. self.nk = [0 for _ in range(self.num_topic)]
  16. self.nkj = np.zeros((self.D, self.num_topic), dtype=int)
  17. self.nwk = np.zeros((self.num_topic, self.W), dtype=int)
  18.  
  19. def train(self):
  20. for d in range(self.D):
  21. for w in self.documents[d]:
  22. topic = np.random.random_integers(0, self.num_topic - 1)
  23. self.z[d].append(topic)
  24. self._assign(d, w, topic)
  25.  
  26. for i in range(1000):
  27. for d, (doc, topics) in enumerate(zip(self.documents, self.z)):
  28. for j, (w, t) in enumerate(zip(doc, topics)):
  29. self._forget(d, w, t)
  30. topic = self._sample(d, w)
  31. self.z[d][j] = topic
  32. self._assign(d, w, topic)
  33.  
  34. def train_doc_probs(self):
  35. return preprocessing.normalize(self.nkj + self.alpha, norm='l1')
  36.  
  37. def train_topic_probs(self):
  38. return preprocessing.normalize(self.nwk + self.beta, norm='l1')
  39.  
  40. def _sample(self, d, w):
  41. p = [0 for _ in range(self.num_topic)]
  42. for k in range(self.num_topic):
  43. p[k] = ((self.nkj[d][k] + self.alpha) * (self.nwk[k][w] + self.beta)) / \
  44. (self.nk[k] + self.W + self.beta)
  45.  
  46. p = preprocessing.normalize(p, norm='l1')[0]
  47. return np.nonzero(np.random.multinomial(1, p))[0][0]
  48.  
  49. def _assign(self, d, w, t):
  50. self.nk[t] += 1
  51. self.nkj[d, t] += 1
  52. self.nwk[t, w] += 1
  53.  
  54. def _forget(self, d, w, t):
  55. self.nk[t] -= 1
  56. self.nkj[d, t] -= 1
  57. self.nwk[t, w] -= 1
  58.  
  59.  
  60. class DocumentSet:
  61. def __init__(self, path):
  62. self._vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
  63. vector = self._vectorizer.fit_transform(
  64. l for l in open(path, encoding='utf-8').readlines()
  65. )
  66. self._vocabulary = self._vectorizer.get_feature_names()
  67. self.D, self.W = vector.shape
  68. self._documents = [[] for _ in range(self.D)]
  69. for d, w in np.transpose(vector.nonzero()):
  70. self._documents[d].append(w)
  71.  
  72. def get_documents(self):
  73. return self._documents
  74.  
  75. def get_vocabulary(self):
  76. return self._vocabulary
  77.  
  78. class Multinomial:
  79. def __init__(self, mu):
  80. self.mu = mu
  81.  
  82. def p(self, i):
  83. return self.mu[i]
  84.  
  85. if __name__ == '__main__':
  86. ds = DocumentSet('./document.txt')
  87. lda = LDA(ds, 3)
  88. lda.train()
  89. print(lda.train_doc_probs())
  90. for t, topicprobs in enumerate(lda.train_topic_probs()):
  91. for word, prob in sorted(zip(ds.get_vocabulary(), topicprobs), key=lambda x: x[1], reverse=True):
  92. print(t, word, prob)
Add Comment
Please, Sign In to add comment