Advertisement
Guest User

Untitled

a guest
May 20th, 2018
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.16 KB | None | 0 0
  1. from scipy.stats import dirichlet
  2. from scipy.special import gamma
  3. import collections
  4. from nltk.corpus import stopwords
  5. import operator
  6. import numpy as np
  7. import codecs
  8. import spacy
  9. import seaborn as sns
  10. import matplotlib.pyplot as plt
  11.  
  12.  
  13. # PART 1
  14. def likelihood(D, theta, K):
  15.     likelihood = 1
  16.     for k in range(1, K + 1):
  17.         likelihood *= theta[k - 1] ** D.count(k)
  18.     return likelihood
  19.  
  20.  
  21. def priori(theta, alpha, K):
  22.     priori = np.prod(gamma(alpha))
  23.     priori /= gamma(np.sum(alpha))
  24.     for k in range(1, K + 1):
  25.         priori *= theta[k - 1] ** (alpha[k - 1] - 1)
  26.     return priori
  27.  
  28.  
  29. def posteriori2(D, theta, alpha, K):
  30.     N = []
  31.     for k in range(1, K + 1):
  32.         N.append(D.count(k))
  33.     new_list = np.add(alpha, N)
  34.     return dirichlet.pdf(theta, new_list)
  35.  
  36.  
  37. def priori2(theta, alpha):
  38.     return dirichlet.pdf(theta, alpha)
  39.  
  40.  
  41. def posteriori(likelihood, priori):
  42.     return likelihood * priori
  43.  
  44.  
  45. # PART 2
  46. def predictive_posterior(alpha, D, j, K):
  47.     denom = 0
  48.     for k in range(1, K + 1):
  49.         denom += alpha[k - 1] + D.count(k)
  50.  
  51.     return (alpha[j - 1] + D.count(j)) / denom
  52.  
  53.  
# --- PART 1/2 driver ---------------------------------------------------
# K: number of categories (die faces); D: observed outcomes (1..K).
K = 6
D = [1, 2, 4, 5, 6, 2, 2, 1, 4, 6, 2]
# theta: categorical parameters (sum to 1); alpha: Dirichlet hyperparameters.
theta = [0.2, 0.1, 0.1, 0.1, 0.3, 0.2]
alpha = [2, 4, 0.1, 2, 1, 2]

# NOTE(review): each assignment below rebinds a function's name to its
# result, shadowing the function — none of them can be called again
# after this point in the module.
likelihood = likelihood(D, theta, K)
priori = priori(theta, alpha, K)
posteriori = posteriori(likelihood, priori)

priori2 = priori2(theta, alpha)
posteriori2 = posteriori2(D, theta, alpha, K)
  65.  
  66. '''        
  67. print('Likelihood: ' + str(likelihood))
  68. print('Priori: ' + str(priori))
  69. print('Posteriori: ' + str(posteriori))  
  70.  
  71. print('Priori2: ' + str(priori2))
  72. print('Posteriori2: ' + str(posteriori2))
  73.  
  74. for k in range(1, K+1):
  75.  print(str(k) + ' : ' + str(predictive_posterior(alpha, D, k, K)))
  76. '''
  77.  
  78.  
  79. # PART 3
  80. def load_file(filename):
  81.     return codecs.open(filename, "r", encoding="utf-8", errors="ignore")
  82.  
  83. def lemmatize_and_filter(file):
  84.     nlp = spacy.load('en')
  85.     doc = nlp(file.read())
  86.     lemmas = []
  87.     for token in doc:
  88.         lemmas.append(token.lemma_)
  89.     filtered_words = [word for word in lemmas if word not in stopwords.words('english')]
  90.     filtered_words = list(filter(lambda w: w != '-PRON-' and w != '\r\n' and w != '\r\n\r\n', filtered_words))
  91.     return filtered_words
  92.  
  93. def get_train_and_test_data(array):
  94.     train_size = int(len(array) * 0.8)
  95.     return array[:train_size], list(set(array[train_size:]))
  96.  
  97. def predictive_posterior_words(train, word, test_size):
  98.     return (1 + train.count(word)) / (test_size + len(train))
  99.  
  100.  
# --- PART 3 driver: word-prediction experiment, plotted two ways -------
sns.set(style="whitegrid")
# Two side-by-side axes: [0] with lemmatization, [1] without.
f, ax = plt.subplots(ncols=2)

#with lemmatization
print('WITH LEMMATIZATION:')
# NOTE(review): 'file' shadows the (Python 2) builtin; harmless here.
file = load_file('three_brothers.txt')
words = lemmatize_and_filter(file)
train, test = get_train_and_test_data(words)
counter = collections.Counter(train)

print('COUNTER TRAIN SET: ' + str(counter))
print('TEST SET: ' + str(test))

# Predictive probability for each distinct test word, wrapped in a
# one-element list (unpacked again below when plotting).
word_probs = {}
for word in test:
    word_probs.update({word : [predictive_posterior_words(train, word, len(test))]})
# NOTE(review): rebinds word_probs from a dict to a list of (word, [p])
# tuples, sorted by descending probability.
word_probs = sorted(word_probs.items(), key=operator.itemgetter(1), reverse=True)

# x = probabilities, y = words, for the horizontal bar plot.
x = []
y = []
# NOTE(review): loop variable 'tuple' shadows the builtin type.
for tuple in word_probs:
    y.append(tuple[0])
    x.append(tuple[1][0])

print('PREDICTIVE WORDS: ' + str(word_probs))
sns.barplot(x=x, y=y, color="b", ax=ax[0]).set_title("With lemmatization")

print()
#without lemmatization
print('WITHOUT LEMMATIZATION:')
file2 = load_file('three_brothers.txt')
# Raw space-split tokens (no lemmatization), minus English stopwords.
words2 = file2.read().split(' ')
words2 = [word for word in words2 if word not in stopwords.words('english')]
train2, test2 = get_train_and_test_data(words2)
counter2 = collections.Counter(train2)

print('COUNTER TRAIN SET: ' + str(counter2))
print('TEST SET: ' + str(test2))

# Same probability computation as above, on the unlemmatized tokens.
word_probs2 = {}
for word2 in test2:
    word_probs2.update({word2 : [predictive_posterior_words(train2, word2, len(test2))]})
word_probs2 = sorted(word_probs2.items(), key=operator.itemgetter(1), reverse=True)

x = []
y = []
for tuple in word_probs2:
    y.append(tuple[0])
    x.append(tuple[1][0])

print('PREDICTIVE WORDS: ' + str(word_probs2))
sns.set_color_codes("pastel")
sns.barplot(x=x, y=y, color="b", ax=ax[1]).set_title("Without lemmatization")
sns.despine(left=True, bottom=True)

plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement