Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import collections
- import matplotlib.pyplot as plt
- #import nltk
- ITERATIONS = 100
- px = [0,1,2]
- py = [0,1,0]
- n = 50 # Number of documents
- k = 3 # Number of topics
- v = 100 # Number of distinct words
- # Dirichlet parameters
- alpha = .01
- lamb = .01
- # How much each document pertains to a perticular topic
- a = np.matrix(np.zeros((n,k)))
- # How much each word pertains to a perticular topic
- b = np.matrix(np.zeros((k,v)))
- # Generate random documents for testing
- minLength = 25
- maxLength = 50
- sentenceLengths = range(int(np.random.uniform(minLength,maxLength)))
- # Labeled dataset, each word is replaced with its topic index
- topicAssignment = [[int(np.random.uniform(0,k))
- for _ in sentenceLengths]
- for _ in range(n)]
- # Labeled dataset, each word is replaced with its bag of words index
- wordIndexes = [[int(np.random.uniform(0,v))
- for _ in sentenceLengths]
- for _ in range(n)]
- def rouletteArg(vector):
- # Uncomment the next line for greedy
- #return np.argmax(vector)
- vector /= np.sum(vector)
- val = np.random.uniform()
- #print(vector)
- for i in range(len(vector)):
- val -= vector[i]
- if val <= 0:
- return i
- return len(vector)-1
- def update(a,b,topicAssignment):
- # Count the number of times words from each topic are used
- # in the document
- for document in range(n):
- occurance = collections.Counter(topicAssignment[document])
- for topic in range(k):
- a[document,topic] = occurance[topic]
- a[document] /= np.sum(a[document] + alpha)
- # Count the number of times a word is used in a particular topic
- for document in range(n):
- doc = wordIndexes[document]
- for wordIndex in range(len(doc)):
- topic = topicAssignment[document][wordIndex]
- word = wordIndexes[document][wordIndex]
- b[topic,word] += 1
- # Normalize
- for i in range(len(b.T)):
- b.T[i]/= np.sum(b.T[i] + lamb)
- # Update the assignment of topics
- for document in range(n):
- for wordIndex in range(len(topicAssignment[document])):
- vec = np.zeros(k)
- for topic in range(k):
- word = wordIndexes[document][wordIndex]
- vec[topic] = (a[document,topic] + alpha) * (b[topic,word] + lamb)
- topicAssignment[document][wordIndex] = rouletteArg(vec)
- return a,b,topicAssignment
- a,b,topicAssignment = update(a,b,topicAssignment)
- costs = np.zeros(ITERATIONS)
- for iteration in range(ITERATIONS):
- lastA = a.copy()
- lastB = b.copy()
- a,b,topicAssignment = update(a,b,topicAssignment)
- cost = np.sum(np.abs(a-lastA)) + np.sum(np.abs(b-lastB))
- costs[iteration] = cost
- if cost <= 1e-7:
- break
- plt.figure(1)
- x = a * np.matrix(px).T
- y = a * np.matrix(py).T
- plt.plot(x,y,'bo',alpha=((1.0-iteration/ITERATIONS)*.5))
- plt.figure(2)
- x = b.T * np.matrix(px).T
- y = b.T * np.matrix(py).T
- plt.plot(x,y,'bo',alpha=((1.0-iteration/ITERATIONS)*.5))
- print(iteration,cost)
- plt.figure(1)
- fig = plt.gcf()
- fig.canvas.set_window_title('Document Distribution')
- x = a * np.matrix(px).T
- y = a * np.matrix(py).T
- plt.plot(x,y,'ro',ms=10)
- plt.plot(px,py,'k-')
- plt.figure(2)
- fig = plt.gcf()
- fig.canvas.set_window_title('Word Distribution')
- x = b.T * np.matrix(px).T
- y = b.T * np.matrix(py).T
- plt.plot(x,y,'ro',ms=10)
- plt.plot(px,py,'k-')
- plt.figure(3)
- fig = plt.gcf()
- fig.canvas.set_window_title('Cost Function')
- plt.plot(costs)
- plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement