Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2017
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.95 KB | None | 0 0
  1. import random
  2. import copy
  3. import numpy as np
  4. from scipy.sparse import lil_matrix
  5.  
  6. class ATM:
  7. def __init__(self, K, alpha, beta, max_iter, verbose=0):
  8. self.K=K
  9. self.alpha = alpha
  10. self.beta = beta
  11. self.max_iter = max_iter
  12. self.verbose=verbose
  13.  
  14. def fit(self,W,A,V,S):
  15. self._W = W
  16. self._A = A
  17. self._D = len(W) # number of documents
  18. self._V = V # number of vocabularies
  19. self._S = S # number of distinct authors
  20.  
  21. self.Z = self._init_Z()
  22. self.Y = self._init_Y()
  23. self.nak = self._init_nak()
  24. self.nkv = self._init_nkv()
  25. nkv_sum = self.nkv.sum(axis=1)
  26. nak_sum = self.nak.sum(axis=1)
  27.  
  28. self._max_score = -1
  29. self.max_Z = None
  30. self.max_Y = None
  31. remained_iter = self.max_iter
  32. while True:
  33. if self.verbose: print remained_iter
  34. for d in np.random.choice(self._D, self._D, replace=False):
  35. # Sample Z and Y
  36. for i in np.random.choice(len(self._W[d]), len(self._W[d]), replace=False):
  37. k = self.Z[d][i] # topic
  38. v = self._W[d][i] # word index
  39. j = self.Y[d][i] # author_index within document d
  40. a = self._A[d][j] # author
  41.  
  42. self.nak[a][k] -= 1
  43. self.nkv[k][v] -= 1
  44. nkv_sum[k] -= 1
  45. nak_sum[a] -= 1
  46.  
  47. self.Z[d][i], self.Y[d][i] = self._sample_z_and_y(d,v,nkv_sum,nak_sum)
  48.  
  49. new_a = self._A[d][self.Y[d][i]]
  50. new_k = self.Z[d][i]
  51. self.nak[new_a][new_k] += 1
  52. self.nkv[new_k][v] += 1
  53. nkv_sum[new_k] += 1
  54. nak_sum[new_a] += 1
  55.  
  56. s = self.score(nkv_sum,nak_sum)
  57. if s > self._max_score:
  58. self.max_score = s
  59. self.max_Z = copy.copy(self.Z)
  60. self.max_Y = copy.copy(self.Y)
  61. remained_iter -= 1
  62. if remained_iter <= 0: break
  63. return self
  64.  
  65. def _init_Z(self):
  66. Z = []
  67. for d in range(len(self._W)):
  68. Z.append(np.random.randint(low=0, high=self.K, size=len(self._W[d])))
  69. return Z
  70.  
  71. def _init_Y(self):
  72. Y = []
  73. for d in range(len(self._W)):
  74. Y.append(np.random.randint(low=0, high=len(self._A[d]), size=len(self._W[d])))
  75. return Y
  76.  
  77. def _init_nak(self):
  78. nak = np.zeros((self._S,self.K))
  79. for d in range(self._D):
  80. for i in range(len(self._W[d])):
  81. k = self.Z[d][i]
  82. j = self.Y[d][i]
  83. a = self._A[d][j]
  84. nak[a,k]+=1
  85. return nak
  86.  
  87. def _init_nkv(self):
  88. nkv = np.zeros((self.K,self._V))
  89. for d in range(self._D):
  90. for i in range(len(self._W[d])):
  91. k = self.Z[d][i]
  92. v = self._W[d][i]
  93. nkv[k,v]+=1
  94. return nkv
  95.  
  96. def _sample_z_and_y(self,d,v,nkv_sum,nak_sum):
  97. nkv = self.nkv[:,v] # k-dimensional vector
  98. na = len(self._A[d]) # number of authors in document d
  99. prob = []
  100. p1 = ((nkv+self.beta) / (nkv_sum+self.beta*self._V))
  101. for j in range(na):
  102. a = self._A[d][j]
  103. pa = p1 * ((self.nak[a]+self.alpha) / (nak_sum[a]+self.alpha*self.K))
  104. prob.append(pa)
  105. prob = np.array(prob).flatten()
  106. prob = prob/prob.sum()
  107. zy = np.random.multinomial(n=1, pvals=prob).argmax()
  108. z = zy%self.K
  109. y = zy/self.K
  110. return z,y
  111.  
  112. def score(self,nkv_sum,nak_sum):
  113. s = 0
  114. for d in range(self._D):
  115. for i in range(len(self._W[d])):
  116. v = self._W[d][i]
  117. k = self.Z[d][i]
  118. a = self._A[d][self.Y[d][i]]
  119. s += ((self.nkv[k,v]+self.beta) / (nkv_sum[k]+self.beta*self._V)) * ((self.nak[a,k]+self.alpha) / (nak_sum[a]+self.alpha*self.K))
  120. return s
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement