Advertisement
Guest User

Untitled

a guest
Feb 17th, 2020
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.70 KB | None | 0 0
  1. import collections
  2. import numpy as np
  3. import pickle
  4. from collections import Counter
  5. from nltk.tokenize import RegexpTokenizer
  6. from nltk.corpus import stopwords
  7.  
# matrices of probabilities for the PLSA (probabilistic latent semantic
# analysis) model, filled in by initialize_parameters() and the EM loop
prob_prob_Z_prob_D_W = []  # P(z | d, w) — unused duplicate of prob_Z_D_W below
prob_W_Z = []  # P(w | z)
prob_Z_D = []  # P(z | d)
prob_P_D = []  # P( d )
prob_D_W = []  # P( d | w)
prob_W_D = []  # P(w | d)
prob_Z_D_W = []  # P(z | d, w) — posterior estimated in the E-step

# document-word count matrix (rows: articles, columns: vocabulary words)
data_words_articles = []
# model dimensions
# NOTE(review): the functions below assign these without a `global`
# statement, so these module-level values are never actually updated
length_topic = 0
length_words = 0
length_articles = 0

# per-article token lists (filled by process_words); later reassigned to
# the vocabulary list in create_list_most_common_words
words = []
  23.  
  24.  
  25. def initialize_parameters(data_words_articles):
  26.     # data set
  27.     data_words_articles = data_words_articles[:, :]
  28.  
  29.     # number of topic, given in instruction
  30.     length_topic = 10
  31.  
  32.     # number of words occurec in row
  33.     length_words = len(data_words_articles[0])
  34.  
  35.     # articles - 3113
  36.     length_articles = len(data_words_articles)
  37.  
  38.     # generate matrices with 0
  39.     prob_Z_D = np.zeros([length_topic, length_articles], dtype=np.float)  # P(z | d)
  40.     prob_W_Z = np.zeros([length_words, length_topic], dtype=np.float)  # P(z | d)
  41.     prob_P_D = np.zeros((length_articles,), dtype=np.float)
  42.     prob_Z_D_W = np.zeros([length_topic, length_articles, length_words], dtype=np.float)  # P(z | d)
  43.  
  44.     prob_W_D = np.zeros([length_words, length_articles], dtype=np.float)  # P(z | d)
  45.     prob_D_W = np.zeros([length_words, length_articles], dtype=np.float)  # P(z | d)
  46.  
  47.     # assign random values
  48.     prob_Z_D = np.random.random(size=(length_topic, length_articles))
  49.     prob_W_Z = np.random.random(size=(length_words, length_topic))
  50.  
  51.     # probability od document
  52.     prob_P_D = 1.0 / length_articles
  53.     for i in range(length_articles):
  54.         prob_P_D[i] = prob_P_D
  55.  
  56. def create_list_most_common_words():
  57.     l_words = []
  58.  
  59.     # merging list of words from articles
  60.     for words_l in words_la:
  61.         l_words = words_l + l_words
  62.  
  63.     # remove elements with less then 3 words
  64.     l_words = remove_elements(list_of_words, 3)
  65.  
  66.  
  67.     # takes only 20 % of most common words from list
  68.     counter = collections.Counter(l_words)
  69.     words = [word for word, word_count in counter.most_common(len(l_words))]
  70.     words = words[round(len(words) * 0.2):]
  71.  
  72.     length_words = len(words)
  73.  
  74. def create_matrix():
  75.  
  76.     # creating matrix where columns are words
  77.     df_words = np.zeros([.length_articles, .length_words])
  78.  
  79.     # iterate over documents and  then over word in article
  80.     for index_d in range(length_articles):
  81.         for index_w in range(length_words):
  82.             if words[index_d, index_w] in words:
  83.                 df_words.loc[index_d, index_w] = df_words.loc[index_d, index_w] + 1
  84.  
  85.  
  86.     data_words_articles = df_words
  87.     print(df_words)
  88.  
  89.  
  90. def load_data(path):
  91.     # pickle data set in form of tuple
  92.     pickle_in = open(path, "rb")
  93.     data_set = pickle.load(pickle_in)
  94.  
  95.     # articles (documents)
  96.     documents = []
  97.  
  98.     for data in data_set:
  99.         documents.append(data[4])
  100.  
  101.     print("Total number of documents: ", len(documents))
  102.     pickle_in.close()
  103.  
  104.     return documents
  105.  
  106. def process_words():
  107.     pickle_in = open('file.pickle', "rb")
  108.     data_set = pickle.load(pickle_in)
  109.  
  110.     documents = []
  111.  
  112.     for data in data_set:
  113.         documents.append(data[4])
  114.  
  115.     print("Total number of documents: ", len(documents))
  116.     pickle_in.close()
  117.  
  118.     articles = load_data(file_path)
  119.     tokenizer = RegexpTokenizer(r'\w+')
  120.     en_stop = set(stopwords.words('english'))
  121.  
  122.     for article in articles:
  123.         raw = article.lower()
  124.         tokens = tokenizer.tokenize(raw)
  125.  
  126.         stopped_tokens = [i for i in tokens if not i in en_stop]
  127.  
  128.         words.append(stopped_tokens)
  129.  
  130.  
  131. def remove_elements(lst, k):
  132.     counted = Counter(lst)
  133.     return [el for el in lst if counted[el] >= k]
  134.  
  135.  
  136. def generate_E_step():
  137.     """
  138.    calculates E step
  139.    """
  140.     for d in range(length_articles):
  141.         for w in range(length_words):
  142.             norm = 0.0
  143.             for z in range(length_topic):
  144.                 prob_Z_D_W[z, d, w] = prob_W_Z[w, z] * prob_Z_D[z, d]
  145.                 norm = norm + prob_Z_D_W[z, d, w]
  146.             # normalization
  147.             for z in range(length_topic):
  148.                 prob_Z_D_W[z, d, w] = prob_Z_D_W[z, d, w] / norm if norm != 0 else 0
  149.  
  150.  
  151. def generate_M_step():
  152.     """
  153.    calculates M - step
  154.    """
  155.     # update prob_W_Z
  156.     prob_W_Z = np.zeros([length_words, length_topic], dtype=np.float)
  157.     for z in range(length_topic):
  158.         norm = 0.0
  159.         for w in range(length_words):
  160.             sum = 0.0
  161.             for d in range(length_articles):
  162.                 sum = sum + data_words_articles[d, w] * prob_Z_D_W[z, d, w]
  163.             prob_W_Z[w, z] = sum
  164.             norm = norm + sum
  165.         for w in range(length_words):
  166.             prob_W_Z[w, z] = prob_W_Z[w, z] / norm if norm != 0 else 0
  167.  
  168.     # update P(z | d)
  169.     for d in range(length_articles):
  170.         for z in range(length_topic):
  171.             s = 0
  172.             for w in range(length_words):
  173.                 count = data_words_articles[d][w]
  174.                 s = s + count * prob_Z_D_W[z, d, w]
  175.             prob_Z_D[z][d] = s
  176.             prob_Z_D[z][d] = prob_Z_D[z][d] / np.sum(data_words_articles[d]) if np.sum(data_words_articles[d]) != 0 else 0
  177.  
  178.  
  179. def calculate_prob_D_W():
  180.     """
  181.    calculates probabilities
  182.    """
  183.     # probability P(w,d)
  184.     for d in range(length_articles):
  185.         norm = 0.0
  186.         for w in range(length_words):
  187.             sum = 0.0
  188.             for z in range(length_topic):
  189.                 sum = sum + prob_W_Z[w, z] * prob_Z_D[z, d]
  190.             prob_W_D[w, d] = sum
  191.             norm = norm + sum
  192.         for w in range(length_words):
  193.             prob_W_D[w, d] = prob_W_D[w, d] / norm if norm != 0 else 0
  194.  
  195.     # probability P(d, w)
  196.     for w in range(length_words):
  197.         norm = 0.0
  198.         for d in range(length_articles):
  199.             prob_D_W[d, w] = prob_P_D[d] * prob_W_D[w, d]
  200.             norm = norm + prob_D_W[d, w]
  201.  
  202.         for d in range(length_articles):
  203.             prob_D_W[d, w] = prob_D_W[d, w] / norm if norm != 0 else 0
  204.  
  205.  
  206. def generate_log():
  207.     """
  208.    generate logs-likelihood
  209.    """
  210.     L = 0.0
  211.     for d in range(length_articles):
  212.         for w in range(length_words):
  213.             for z in range(length_topic):
  214.                 L = L + prob_D_W[d, w] * (
  215.                     np.log(prob_Z_D[z, d] * prob_W_Z[w, z]) if prob_Z_D[z, d] * prob_W_Z[w, z] != 0 else 0)
  216.     print(L)
  217.  
  218.  
  219. def print_aspects():
  220.     """
  221.    prints aspects from each document
  222.    """
  223.     list_of_topics = []
  224.     for d in range(d):
  225.         ind = np.argpartition(prob_Z_D[:, d], -10)[-10:]
  226.         for i in ind:
  227.             print(ind[i])
  228.  
  229.  
  230. initialize_parameters()
  231.  
  232. # STARTS HERE
  233. # calculation
  234. for i in range(1000):
  235.     generate_E_step()
  236.     generate_M_step()
  237.     calculate_prob_D_W()
  238.     generate_log()
  239.  
  240. print_aspects()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement