Advertisement
lewapkon

tfidf.py

Feb 22nd, 2014
312
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.31 KB | None | 0 0
  1. #!/usr/bin/python2
  2. # coding: utf-8
  3. #
  4. # authors:
  5. # Paweł Koniarski
  6. # Filip Konieczny
  7.  
  8. from ngram import Ngram
  9. from math import log
  10. from operator import mul
  11.  
  12. class TFIDF(object):
  13.  
  14.     def __init__(self, *names):
  15.         self.__getNgrams(*names)
  16.         self.__allNgrams()
  17.         self.__calculateIDF()
  18.         self.__calculateTF()
  19.         self.__calculateTFIDF()
  20.         self.__compare(names)
  21.        
  22.     def __getNgrams(self, *names):
  23.         ''' gets ngrams from all the article names
  24.            in 'names' using the class Ngram '''
  25.         self.articles = {}
  26.         for i in names:
  27.             self.articles[i] = Ngram(i).getNgrams()
  28.  
  29.     def __allNgrams(self):
  30.         ''' makes a list of all ngrams from all articles '''
  31.         l = []
  32.         for i in self.articles.values():
  33.             l.extend(i)
  34.         # makes a unique list
  35.         # (using the fact that seen.add(...) always returns false)
  36.         seen = set()
  37.         seen_add = seen.add
  38.         self.ngrams = [ i for i in l if i not in seen and not seen_add(i) ]
  39.  
  40.     def __calculateIDF(self):
  41.         ''' calculates IDF value for all ngrams from 'self.ngrams'
  42.            idf = log(l/n) for every ngram
  43.            l - number of articles,
  44.            n - number of articles where that ngram is present '''
  45.         self.idf = []
  46.         l = len(self.articles)
  47.         for i in self.ngrams:
  48.             n = 0
  49.             for j in self.articles.values():
  50.                 if i in j: n += 1
  51.             self.idf.append(log(1.0 * l/n))
  52.    
  53.     def __calculateTF(self):
  54.         ''' calculates TF value for every ngram in every article
  55.            tf = n/l for every ngram
  56.            n - number of ngrams in that article
  57.            l - number of all ngrams in that article '''
  58.         self.tf = {}
  59.         for i in self.articles:
  60.             m = self.articles[i]
  61.             t = []
  62.             l = sum(m.values())
  63.             for j in self.ngrams:
  64.                 t.append(1.0 * m[j] / l)
  65.             self.tf[i] = t
  66.  
  67.     def __calculateTFIDF(self):
  68.         ''' calculates TFIDF value for every ngram in every article
  69.            tfidf = tf * idf '''
  70.         self.tfidf = {}
  71.         for i in self.articles:
  72.             m = self.articles[i]
  73.             t = []
  74.             for j in xrange(len(self.ngrams)):
  75.                 t.append(self.tf[i][j] * self.idf[j])
  76.             self.tfidf[i] = t
  77.  
  78.     def __calculateSimilarity(self, first, second):
  79.         ''' calculates similarity of two articles
  80.            by getting a dot product of their tfidf vectors
  81.            (it also multiplies the result by 1000 to see the numbers better) '''
  82.         return sum( map( mul, first, second)) * 10**3
  83.  
  84.     def __compare(self, names):
  85.         ''' generates pairs of articles up to 'len(names)' and shows
  86.            the similarity of texts using __calculateSimilarity method '''
  87.         i = 0
  88.         l = len(names)
  89.         while i < l:
  90.             j = i + 1
  91.             while j < l:
  92.                 print '"{0}" i "{1}": {2:.2%}'.format(names[i], names[j],\
  93.                     self.__calculateSimilarity( self.tfidf[names[i]], self.tfidf[names[j]] ))
  94.                 j += 1
  95.             i += 1
  96.    
  97.  
  98. if __name__ == '__main__':
  99.     TFIDF('scutisorex somereni', 'ryjówka aksamitna', 'sorex cinereus', 'nectogale elegans', 'ryjówka malutka')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement