Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scipy.stats import dirichlet
- from scipy.special import gamma
- import collections
- from nltk.corpus import stopwords
- import operator
- import numpy as np
- import codecs
- import spacy
- import seaborn as sns
- import matplotlib.pyplot as plt
# PART 1: categorical likelihood, Dirichlet prior and posterior
def likelihood(D, theta, K):
    """Return the categorical likelihood of the observations in ``D``.

    The result is the product over categories ``k = 1..K`` of
    ``theta[k - 1] ** N_k``, where ``N_k`` is how often ``k`` occurs in ``D``.

    Args:
        D: Observed outcomes labelled ``1..K``; must provide ``count``.
        theta: Category probabilities; ``theta[k - 1]`` belongs to category ``k``.
        K: Number of categories.

    Returns:
        The value of ``p(D | theta)``.
    """
    result = 1
    for index in range(K):
        # Categories are 1-based while ``theta`` is indexed from 0.
        occurrences = D.count(index + 1)
        result *= theta[index] ** occurrences
    return result
def priori(theta, alpha, K):
    """Return the Dirichlet prior density ``Dir(theta | alpha)``.

    The density is ``Gamma(sum(alpha)) / prod(Gamma(alpha_k))`` multiplied by
    the product over ``k = 1..K`` of ``theta[k - 1] ** (alpha[k - 1] - 1)``.

    Args:
        theta: Point on the probability simplex at which to evaluate.
        alpha: Concentration parameters, one per category.
        K: Number of categories.

    Returns:
        The density value; it agrees with ``scipy.stats.dirichlet.pdf``.
    """
    # Normalising constant 1 / B(alpha). The multivariate beta function
    # B(alpha) = prod(Gamma(alpha_k)) / Gamma(sum(alpha)) belongs in the
    # denominator of the density, so the Gamma of the sum goes on top.
    density = gamma(np.sum(alpha)) / np.prod(gamma(alpha))
    for index in range(K):
        density *= theta[index] ** (alpha[index] - 1)
    return density
def posteriori2(D, theta, alpha, K):
    """Return the Dirichlet posterior density of ``theta`` given ``D``.

    The posterior concentration parameters are ``alpha`` plus the
    per-category counts of ``D``; the density is evaluated with
    ``scipy.stats.dirichlet.pdf``.

    Args:
        D: Observed outcomes labelled ``1..K``; must provide ``count``.
        theta: Point on the probability simplex at which to evaluate.
        alpha: Prior concentration parameters, one per category.
        K: Number of categories.

    Returns:
        The posterior density at ``theta``.
    """
    counts = [D.count(category) for category in range(1, K + 1)]
    posterior_alpha = np.add(alpha, counts)
    return dirichlet.pdf(theta, posterior_alpha)
def priori2(theta, alpha):
    """Return the Dirichlet prior density at ``theta`` using SciPy.

    Args:
        theta: Point on the probability simplex at which to evaluate.
        alpha: Concentration parameters, one per category.

    Returns:
        The value of ``scipy.stats.dirichlet.pdf(theta, alpha)``.
    """
    density = dirichlet.pdf(x=theta, alpha=alpha)
    return density
def posteriori(likelihood, priori):
    """Return the product of a likelihood value and a prior value.

    No normalising evidence term is applied, so the result is only
    proportional to a posterior density.

    Args:
        likelihood: Likelihood value ``p(D | theta)``.
        priori: Prior value ``p(theta)``.

    Returns:
        ``likelihood * priori``.
    """
    unnormalised = likelihood * priori
    return unnormalised
# PART 2: posterior predictive probability of the next outcome
def predictive_posterior(alpha, D, j, K):
    """Return the posterior predictive probability of category ``j``.

    Computes ``(alpha_j + N_j) / sum_k (alpha_k + N_k)`` for ``k = 1..K``,
    where ``N_k`` is the number of occurrences of ``k`` in ``D``.

    Args:
        alpha: Concentration parameters, one per category.
        D: Observed outcomes labelled ``1..K``; must provide ``count``.
        j: 1-based category whose probability is requested.
        K: Number of categories.

    Returns:
        The predictive probability of observing ``j`` next.
    """
    total = sum(alpha[k - 1] + D.count(k) for k in range(1, K + 1))
    numerator = alpha[j - 1] + D.count(j)
    return numerator / total
# Example inputs: K categories, observed outcomes D (labelled 1..K), a
# candidate probability vector theta and Dirichlet concentrations alpha.
K = 6
D = [1, 2, 4, 5, 6, 2, 2, 1, 4, 6, 2]
theta = [0.2, 0.1, 0.1, 0.1, 0.3, 0.2]
alpha = [2, 4, 0.1, 2, 1, 2]

# Results get their own names so that the functions likelihood(), priori(),
# posteriori(), priori2() and posteriori2() stay callable after this point.
likelihood_value = likelihood(D, theta, K)
priori_value = priori(theta, alpha, K)
posteriori_value = posteriori(likelihood_value, priori_value)
priori2_value = priori2(theta, alpha)
posteriori2_value = posteriori2(D, theta, alpha, K)

# Disabled report of the results above, kept as an inert string literal.
'''
print('Likelihood: ' + str(likelihood_value))
print('Priori: ' + str(priori_value))
print('Posteriori: ' + str(posteriori_value))
print('Priori2: ' + str(priori2_value))
print('Posteriori2: ' + str(posteriori2_value))
for k in range(1, K+1):
    print(str(k) + ' : ' + str(predictive_posterior(alpha, D, k, K)))
'''
# PART 3: smoothed predictive word probabilities for a text file
def load_file(filename):
    """Open ``filename`` for reading as UTF-8, ignoring undecodable bytes.

    Args:
        filename: Path of the file to open.

    Returns:
        The open file object from ``codecs.open``; this function does not
        close it, so the caller has to.
    """
    handle = codecs.open(filename, mode="r", encoding="utf-8", errors="ignore")
    return handle
def lemmatize_and_filter(file):
    """Return the lemmas of a text with stop words and filler tokens removed.

    The text is read from ``file``, processed with the spaCy ``'en'``
    pipeline, and every token's lemma is kept unless it is an NLTK English
    stop word or one of the explicitly dropped tokens.

    Args:
        file: Open, readable text file object; it is read but not closed here.

    Returns:
        A list of lemma strings in document order.
    """
    nlp = spacy.load('en')
    doc = nlp(file.read())

    # Fetch the stop-word list once instead of once per lemma, and keep it in
    # a set so each membership test is a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))
    # '-PRON-' is presumably spaCy's placeholder lemma for pronouns (verify
    # against the installed spaCy version); the other two are line-break tokens.
    dropped_tokens = {'-PRON-', '\r\n', '\r\n\r\n'}

    return [
        token.lemma_
        for token in doc
        if token.lemma_ not in english_stopwords
        and token.lemma_ not in dropped_tokens
    ]
def get_train_and_test_data(array):
    """Split ``array`` into an 80% training part and the distinct test items.

    Args:
        array: Sliceable sequence of hashable items.

    Returns:
        A tuple ``(train, test)``: ``train`` is the first 80% of ``array``
        (rounded down, duplicates kept) and ``test`` is a list of the
        distinct items in the remainder, in order of first appearance.
    """
    train_size = int(len(array) * 0.8)
    held_out = array[train_size:]
    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # test list is the same on every run; a set would order the items by
    # hash, which for strings changes from one interpreter run to the next.
    return array[:train_size], list(dict.fromkeys(held_out))
def predictive_posterior_words(train, word, test_size):
    """Return the add-one smoothed probability of ``word``.

    Computes ``(1 + count of word in train) / (test_size + len(train))``.

    Args:
        train: Training words; must provide ``count`` and ``len``.
        word: Word whose probability is requested.
        test_size: Value added to the training size in the denominator.

    Returns:
        The smoothed probability estimate for ``word``.
    """
    smoothed_count = 1 + train.count(word)
    normaliser = test_size + len(train)
    return smoothed_count / normaliser
# Plot the predictive word probabilities of the held-out words for two
# preprocessing variants side by side: lemmatised and raw space-split text.
sns.set(style="whitegrid")
f, ax = plt.subplots(ncols=2)

# with lemmatization
print('WITH LEMMATIZATION:')
# The context manager closes the file once its contents have been consumed.
with load_file('three_brothers.txt') as file:
    words = lemmatize_and_filter(file)
train, test = get_train_and_test_data(words)
counter = collections.Counter(train)
print('COUNTER TRAIN SET: ' + str(counter))
print('TEST SET: ' + str(test))
word_probs = {}
for word in test:
    # Each probability is wrapped in a one-element list, which keeps the
    # printed structure of the results unchanged.
    word_probs[word] = [predictive_posterior_words(train, word, len(test))]
# Most probable words first.
word_probs = sorted(word_probs.items(), key=operator.itemgetter(1), reverse=True)
y = [token for token, _ in word_probs]
x = [probabilities[0] for _, probabilities in word_probs]
print('PREDICTIVE WORDS: ' + str(word_probs))
sns.barplot(x=x, y=y, color="b", ax=ax[0]).set_title("With lemmatization")
print()

# without lemmatization
print('WITHOUT LEMMATIZATION:')
with load_file('three_brothers.txt') as file2:
    words2 = file2.read().split(' ')
# Fetch the stop-word list once rather than once per word.
english_stopwords = set(stopwords.words('english'))
words2 = [word for word in words2 if word not in english_stopwords]
train2, test2 = get_train_and_test_data(words2)
counter2 = collections.Counter(train2)
print('COUNTER TRAIN SET: ' + str(counter2))
print('TEST SET: ' + str(test2))
word_probs2 = {}
for word2 in test2:
    word_probs2[word2] = [predictive_posterior_words(train2, word2, len(test2))]
word_probs2 = sorted(word_probs2.items(), key=operator.itemgetter(1), reverse=True)
y = [token for token, _ in word_probs2]
x = [probabilities[0] for _, probabilities in word_probs2]
print('PREDICTIVE WORDS: ' + str(word_probs2))
# Switch to the pastel palette before drawing the second panel.
sns.set_color_codes("pastel")
sns.barplot(x=x, y=y, color="b", ax=ax[1]).set_title("Without lemmatization")
sns.despine(left=True, bottom=True)
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement