Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python2
- # coding: utf-8
- #
- # authors:
- # Paweł Koniarski
- # Filip Konieczny
- from ngram import Ngram
- from math import log
- from operator import mul
class TFIDF(object):
    '''Compare articles pairwise using TF-IDF vectors built from their ngrams.

    Constructing an instance runs the whole pipeline and prints the
    similarity of every pair of the given article names.

    Attributes filled in during construction:
        articles - dict: article name -> result of Ngram(name).getNgrams()
                   (assumed to be a dict-like mapping ngram -> count;
                   TODO confirm against the Ngram class)
        ngrams   - list of the unique ngrams found across all articles
        idf      - list of IDF values, parallel to 'ngrams'
        tf       - dict: article name -> list of TF values, parallel to 'ngrams'
        tfidf    - dict: article name -> list of TF * IDF values,
                   parallel to 'ngrams'
    '''

    def __init__(self, *names):
        ''' builds the model for the articles in 'names' and prints
            the similarity of every pair of them '''
        self.__getNgrams(*names)
        self.__allNgrams()
        self.__calculateIDF()
        self.__calculateTF()
        self.__calculateTFIDF()
        self.__compare(names)

    def __getNgrams(self, *names):
        ''' gets ngrams from all the article names
            in 'names' using the class Ngram '''
        self.articles = {}
        for name in names:
            self.articles[name] = Ngram(name).getNgrams()

    def __allNgrams(self):
        ''' makes a list of all ngrams from all articles,
            each ngram listed once, in the order it is first met '''
        seen = set()
        unique = []
        for counts in self.articles.values():
            for ngram in counts:
                if ngram not in seen:
                    seen.add(ngram)
                    unique.append(ngram)
        self.ngrams = unique

    def __calculateIDF(self):
        ''' calculates IDF value for all ngrams from 'self.ngrams'
            idf = log(l/n) for every ngram
            l - number of articles,
            n - number of articles where that ngram is present '''
        article_count = len(self.articles)
        per_article = list(self.articles.values())
        self.idf = []
        for ngram in self.ngrams:
            # every ngram in self.ngrams was collected from some article,
            # so 'present' is at least 1 when __allNgrams ran just before
            present = sum(1 for counts in per_article if ngram in counts)
            self.idf.append(log(1.0 * article_count / present))

    def __calculateTF(self):
        ''' calculates TF value for every ngram in every article
            tf = n/l for every ngram
            n - number of occurrences of that ngram in the article
            l - number of all ngrams in that article

            An ngram missing from an article counts as 0 occurrences and
            an article without any ngrams gets a vector of zeros. '''
        self.tf = {}
        for name, counts in self.articles.items():
            total = sum(counts.values())
            if total == 0:
                # nothing to count in this article; avoid dividing by zero
                self.tf[name] = [0.0] * len(self.ngrams)
                continue
            # .get() instead of indexing: a plain dict would raise KeyError
            # for ngrams that only occur in other articles, and a defaultdict
            # would silently grow with zero entries
            self.tf[name] = [1.0 * counts.get(ngram, 0) / total
                             for ngram in self.ngrams]

    def __calculateTFIDF(self):
        ''' calculates TFIDF value for every ngram in every article
            tfidf = tf * idf '''
        self.tfidf = {}
        for name in self.articles:
            self.tfidf[name] = [tf * idf
                                for tf, idf in zip(self.tf[name], self.idf)]

    def __calculateSimilarity(self, first, second):
        ''' calculates similarity of two articles
            by getting a dot product of their tfidf vectors
            (it also multiplies the result by 1000 to see the numbers better) '''
        return sum(map(mul, first, second)) * 10**3

    def __compare(self, names):
        ''' goes through every pair of articles from 'names' and prints
            the similarity of texts using __calculateSimilarity method '''
        count = len(names)
        for i in range(count):
            for j in range(i + 1, count):
                first, second = names[i], names[j]
                score = self.__calculateSimilarity(self.tfidf[first],
                                                   self.tfidf[second])
                # single parenthesised argument prints the same text
                # under both Python 2 and Python 3
                print('"{0}" i "{1}": {2:.2%}'.format(first, second, score))
if __name__ == '__main__':
    # article names whose texts are compared pairwise when run as a script
    article_names = (
        'scutisorex somereni',
        'ryjówka aksamitna',
        'sorex cinereus',
        'nectogale elegans',
        'ryjówka malutka',
    )
    TFIDF(*article_names)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement