import pickle
import numpy as np
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# matrices of probabilities (allocated in initialize_parameters)
prob_Z_D_W = []  # P(z | d, w)
prob_W_Z = []  # P(w | z)
prob_Z_D = []  # P(z | d)
prob_P_D = []  # P(d)
prob_D_W = []  # P(d | w)
prob_W_D = []  # P(w | d)
data_words_articles = []  # document-term count matrix, built in create_matrix
tokenized_articles = []  # per-article token lists, filled in process_words
length_topic = 0
length_words = 0
length_articles = 0
words = []  # vocabulary, built in create_list_most_common_words
def initialize_parameters():
    global prob_Z_D, prob_W_Z, prob_P_D, prob_Z_D_W, prob_W_D, prob_D_W
    global length_topic, length_words, length_articles
    # number of topics, given in the instruction
    length_topic = 10
    # number of words in the vocabulary (columns of the matrix)
    length_words = len(data_words_articles[0])
    # number of articles (documents), e.g. 3113
    length_articles = len(data_words_articles)
    # allocate the probability matrices
    prob_Z_D_W = np.zeros([length_topic, length_articles, length_words], dtype=float)  # P(z | d, w)
    prob_W_D = np.zeros([length_words, length_articles], dtype=float)  # P(w | d)
    prob_D_W = np.zeros([length_articles, length_words], dtype=float)  # P(d | w)
    # assign random initial values
    prob_Z_D = np.random.random(size=(length_topic, length_articles))  # P(z | d)
    prob_W_Z = np.random.random(size=(length_words, length_topic))  # P(w | z)
    # uniform probability for each document
    prob_P_D = np.full(length_articles, 1.0 / length_articles)  # P(d)
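# Note (editor's sketch): the random initial values above are not normalized, so
# P(z | d) and P(w | z) only become proper distributions after the first M step.
# A minimal alternative that starts from valid distributions, assuming the same
# global shapes, would be:
def initialize_normalized():
    global prob_Z_D, prob_W_Z
    prob_Z_D = np.random.random(size=(length_topic, length_articles))
    prob_Z_D /= prob_Z_D.sum(axis=0, keepdims=True)  # columns sum to 1 over topics
    prob_W_Z = np.random.random(size=(length_words, length_topic))
    prob_W_Z /= prob_W_Z.sum(axis=0, keepdims=True)  # columns sum to 1 over words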
def create_list_most_common_words():
    global words, length_words
    l_words = []
    # merge the per-article token lists into one flat list
    for words_l in tokenized_articles:
        l_words.extend(words_l)
    # keep only words that occur at least 3 times
    l_words = remove_elements(l_words, 3)
    # sort the vocabulary by frequency, then drop the top 20 % most common words
    counter = Counter(l_words)
    words = [word for word, word_count in counter.most_common()]
    words = words[round(len(words) * 0.2):]
    length_words = len(words)
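# A quick self-check of the slicing above (hypothetical toy data): with six
# distinct words, the slice drops round(6 * 0.2) = 1 word, the most common one.
def _demo_vocabulary_cut():
    toy = ['a'] * 3 + ['b'] * 2 + ['c', 'd', 'e', 'f']
    ranked = [w for w, _ in Counter(toy).most_common()]
    return ranked[round(len(ranked) * 0.2):]  # ['b', 'c', 'd', 'e', 'f']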
def create_matrix():
    global data_words_articles
    n_articles = len(tokenized_articles)
    # document-term matrix: rows are articles, columns are vocabulary words
    df_words = np.zeros([n_articles, length_words])
    # iterate over documents and then over the words of each article
    for index_d in range(n_articles):
        for token in tokenized_articles[index_d]:
            if token in words:
                df_words[index_d, words.index(token)] += 1
    data_words_articles = df_words
    print(df_words)
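# Editor's sketch: the same matrix built with one Counter pass per article and
# a word -> column dictionary, which avoids the O(vocabulary) membership scan
# for every token. Same inputs and output as create_matrix above.
def create_matrix_fast():
    global data_words_articles
    word_index = {w: i for i, w in enumerate(words)}
    matrix = np.zeros([len(tokenized_articles), length_words])
    for index_d, tokens in enumerate(tokenized_articles):
        for token, count in Counter(tokens).items():
            if token in word_index:
                matrix[index_d, word_index[token]] = count
    data_words_articles = matrix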
def load_data(path):
    # load the pickled data set; each entry is a tuple whose fifth element is the article text
    with open(path, "rb") as pickle_in:
        data_set = pickle.load(pickle_in)
    # articles (documents)
    documents = []
    for data in data_set:
        documents.append(data[4])
    print("Total number of documents: ", len(documents))
    return documents
def process_words():
    articles = load_data('file.pickle')
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = set(stopwords.words('english'))
    for article in articles:
        # lowercase, tokenize on word characters, then drop English stop words
        raw = article.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if i not in en_stop]
        tokenized_articles.append(stopped_tokens)
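# Illustrative example (hypothetical sentence) of the tokenization pipeline above:
# RegexpTokenizer(r'\w+') keeps alphanumeric runs and drops punctuation, and the
# stop-word filter then removes common English words.
def _demo_tokenize():
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = set(stopwords.words('english'))
    tokens = tokenizer.tokenize("The cat sat on the mat.".lower())
    # tokens == ['the', 'cat', 'sat', 'on', 'the', 'mat']
    return [t for t in tokens if t not in en_stop]  # ['cat', 'sat', 'mat']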
def remove_elements(lst, k):
    # keep only the elements that occur at least k times in lst
    counted = Counter(lst)
    return [el for el in lst if counted[el] >= k]
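# Quick usage check for remove_elements (toy data): with k = 3, only tokens
# that occur at least three times survive.
assert remove_elements(['a', 'a', 'a', 'b', 'b', 'c'], 3) == ['a', 'a', 'a']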
def generate_E_step():
    """
    E step: computes the posterior P(z | d, w) from the current P(w | z) and P(z | d).
    """
    for d in range(length_articles):
        for w in range(length_words):
            norm = 0.0
            for z in range(length_topic):
                prob_Z_D_W[z, d, w] = prob_W_Z[w, z] * prob_Z_D[z, d]
                norm = norm + prob_Z_D_W[z, d, w]
            # normalize over topics
            for z in range(length_topic):
                prob_Z_D_W[z, d, w] = prob_Z_D_W[z, d, w] / norm if norm != 0 else 0
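# Editor's sketch: the same E step expressed with NumPy broadcasting, assuming
# the global array shapes defined above. The product prob_W_Z[w, z] * prob_Z_D[z, d]
# is formed for every (z, d, w) triple at once, then normalized over the topic axis.
def generate_E_step_vectorized():
    global prob_Z_D_W
    # shape (z, d, w): P(w | z) P(z | d) for every triple
    joint = prob_Z_D[:, :, np.newaxis] * prob_W_Z.T[:, np.newaxis, :]
    norm = joint.sum(axis=0, keepdims=True)  # sum over topics
    prob_Z_D_W = np.divide(joint, norm, out=np.zeros_like(joint), where=norm != 0)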
def generate_M_step():
    """
    M step: re-estimates P(w | z) and P(z | d) from the posterior P(z | d, w).
    """
    global prob_W_Z
    # update P(w | z)
    prob_W_Z = np.zeros([length_words, length_topic], dtype=float)
    for z in range(length_topic):
        norm = 0.0
        for w in range(length_words):
            s = 0.0
            for d in range(length_articles):
                s = s + data_words_articles[d, w] * prob_Z_D_W[z, d, w]
            prob_W_Z[w, z] = s
            norm = norm + s
        for w in range(length_words):
            prob_W_Z[w, z] = prob_W_Z[w, z] / norm if norm != 0 else 0
    # update P(z | d)
    for d in range(length_articles):
        doc_length = np.sum(data_words_articles[d])
        for z in range(length_topic):
            s = 0.0
            for w in range(length_words):
                s = s + data_words_articles[d, w] * prob_Z_D_W[z, d, w]
            prob_Z_D[z, d] = s / doc_length if doc_length != 0 else 0
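# Editor's sketch: a vectorized M step, assuming the same global shapes. Both
# updates are weighted sums of the counts n(d, w) against the posterior.
def generate_M_step_vectorized():
    global prob_W_Z, prob_Z_D
    counts = np.asarray(data_words_articles)  # shape (d, w)
    weighted = counts[np.newaxis, :, :] * prob_Z_D_W  # shape (z, d, w)
    # P(w | z): sum over documents, normalize over words
    wz = weighted.sum(axis=1).T  # shape (w, z)
    norm_wz = wz.sum(axis=0, keepdims=True)
    prob_W_Z = np.divide(wz, norm_wz, out=np.zeros_like(wz), where=norm_wz != 0)
    # P(z | d): sum over words, divide by document length
    zd = weighted.sum(axis=2)  # shape (z, d)
    doc_len = counts.sum(axis=1)[np.newaxis, :]  # shape (1, d)
    prob_Z_D = np.divide(zd, doc_len, out=np.zeros_like(zd), where=doc_len != 0)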
def calculate_prob_D_W():
    """
    Computes P(w | d) from the topic mixtures, then P(d | w) via Bayes' rule.
    """
    # probability P(w | d) = sum_z P(w | z) P(z | d)
    for d in range(length_articles):
        norm = 0.0
        for w in range(length_words):
            s = 0.0
            for z in range(length_topic):
                s = s + prob_W_Z[w, z] * prob_Z_D[z, d]
            prob_W_D[w, d] = s
            norm = norm + s
        for w in range(length_words):
            prob_W_D[w, d] = prob_W_D[w, d] / norm if norm != 0 else 0
    # probability P(d | w), proportional to P(d) P(w | d), normalized over documents
    for w in range(length_words):
        norm = 0.0
        for d in range(length_articles):
            prob_D_W[d, w] = prob_P_D[d] * prob_W_D[w, d]
            norm = norm + prob_D_W[d, w]
        for d in range(length_articles):
            prob_D_W[d, w] = prob_D_W[d, w] / norm if norm != 0 else 0
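# Editor's sketch: the same computation vectorized. P(w | d) is the matrix
# product of P(w | z) and P(z | d); P(d | w) follows by weighting with P(d)
# and normalizing each column over documents.
def calculate_prob_D_W_vectorized():
    global prob_W_D, prob_D_W
    wd = prob_W_Z @ prob_Z_D  # shape (w, d)
    norm_wd = wd.sum(axis=0, keepdims=True)
    prob_W_D = np.divide(wd, norm_wd, out=np.zeros_like(wd), where=norm_wd != 0)
    dw = prob_P_D[:, np.newaxis] * prob_W_D.T  # shape (d, w)
    norm_dw = dw.sum(axis=0, keepdims=True)
    prob_D_W = np.divide(dw, norm_dw, out=np.zeros_like(dw), where=norm_dw != 0)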
def generate_log():
    """
    Computes and prints the log-likelihood of the current model.
    """
    L = 0.0
    for d in range(length_articles):
        for w in range(length_words):
            for z in range(length_topic):
                joint = prob_Z_D[z, d] * prob_W_Z[w, z]
                L = L + prob_D_W[d, w] * (np.log(joint) if joint != 0 else 0)
    print(L)
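# Editor's note: the textbook PLSA objective is the log-likelihood of the
# observed counts, L = sum over (d, w) of n(d, w) * log P(d, w), with
# P(d, w) = P(d) * sum_z P(w | z) P(z | d). A minimal sketch of that variant,
# assuming the same globals, for comparison with the loop above:
def generate_log_counts():
    joint = prob_P_D[:, np.newaxis] * (prob_W_Z @ prob_Z_D).T  # P(d, w), shape (d, w)
    mask = joint > 0
    counts = np.asarray(data_words_articles)
    print(np.sum(counts[mask] * np.log(joint[mask])))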
def print_aspects():
    """
    Prints the indices of the top topics (aspects) for each document.
    """
    for d in range(length_articles):
        # indices of the 10 largest entries of P(z | d) for document d
        ind = np.argpartition(prob_Z_D[:, d], -10)[-10:]
        for i in ind:
            print(i)
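# Editor's sketch: topics are usually easier to inspect through their most
# probable words. Assuming the globals above, this prints the top n words of
# each topic from P(w | z):
def print_top_words_per_topic(n=10):
    for z in range(length_topic):
        top = np.argsort(prob_W_Z[:, z])[-n:][::-1]
        print("topic", z, ":", [words[w] for w in top])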
# STARTS HERE
process_words()
create_list_most_common_words()
create_matrix()
initialize_parameters()
# EM iterations
for i in range(1000):
    generate_E_step()
    generate_M_step()
    calculate_prob_D_W()
    generate_log()
print_aspects()
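# Editor's note: rather than a fixed 1000 iterations, EM is often stopped once
# the log-likelihood improvement drops below a tolerance. A minimal sketch
# (hypothetical tol value), kept here for reference:
def run_em(max_iter=1000, tol=1e-4):
    prev = -np.inf
    for _ in range(max_iter):
        generate_E_step()
        generate_M_step()
        calculate_prob_D_W()
        joint = prob_P_D[:, np.newaxis] * (prob_W_Z @ prob_Z_D).T  # P(d, w)
        mask = joint > 0
        ll = np.sum(np.asarray(data_words_articles)[mask] * np.log(joint[mask]))
        if ll - prev < tol:
            break
        prev = ll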