# tfidf.py

#!/usr/bin/python2
# coding: utf-8
#
# authors:
# Paweł Koniarski
# Filip Konieczny

from ngram import Ngram
from math import log
from operator import mul

class TFIDF(object):
    ''' Compares articles pairwise using TF-IDF vectors built from ngrams.

        Creating an instance runs the whole pipeline for the given article
        names and prints one similarity line for every pair of names.

        Attributes set while constructing:
            articles - dict: article name -> result of Ngram(name).getNgrams()
                       (assumed to be a dict-like mapping ngram -> count,
                       since the code sums its .values() - confirm against
                       ngram.Ngram)
            ngrams   - list of unique ngrams from all articles
            idf      - list of IDF values, aligned with 'ngrams'
            tf       - dict: article name -> list of TF values,
                       aligned with 'ngrams'
            tfidf    - dict: article name -> list of TF*IDF values,
                       aligned with 'ngrams' '''

    def __init__(self, *names):
        ''' names - article names to load and compare with each other '''
        self.__getNgrams(*names)
        self.__allNgrams()
        self.__calculateIDF()
        self.__calculateTF()
        self.__calculateTFIDF()
        self.__compare(names)

    def __getNgrams(self, *names):
        ''' gets ngrams from all the article names
            in 'names' using the class Ngram '''
        self.articles = {}
        for name in names:
            self.articles[name] = Ngram(name).getNgrams()

    def __allNgrams(self):
        ''' makes a list of all unique ngrams from all articles,
            keeping the order in which they are first seen '''
        seen = set()
        self.ngrams = []
        for counts in self.articles.values():
            for ngram in counts:
                if ngram not in seen:
                    seen.add(ngram)
                    self.ngrams.append(ngram)

    def __calculateIDF(self):
        ''' calculates IDF value for all ngrams from 'self.ngrams'
            idf = log(l/n) for every ngram
            l - number of articles,
            n - number of articles where that ngram is present '''
        total = len(self.articles)
        self.idf = []
        for ngram in self.ngrams:
            # every ngram was collected from some article, so present >= 1
            present = sum(1 for counts in self.articles.values()
                          if ngram in counts)
            self.idf.append(log(1.0 * total / present))

    def __calculateTF(self):
        ''' calculates TF value for every ngram in every article
            tf = n/l for every ngram
            n - number of occurrences of that ngram in the article
            l - number of all ngrams in that article
            an article without any ngrams gets tf = 0.0 everywhere '''
        self.tf = {}
        for name, counts in self.articles.items():
            total = sum(counts.values())
            if not total:
                # nothing to count: avoid dividing by zero
                self.tf[name] = [0.0] * len(self.ngrams)
                continue
            # .get() so an ngram absent from this article counts as 0
            # instead of raising KeyError on a plain dict
            self.tf[name] = [1.0 * counts.get(ngram, 0) / total
                             for ngram in self.ngrams]

    def __calculateTFIDF(self):
        ''' calculates TFIDF value for every ngram in every article
            tfidf = tf * idf '''
        self.tfidf = {}
        for name in self.articles:
            self.tfidf[name] = [tf * idf
                                for tf, idf in zip(self.tf[name], self.idf)]

    def __calculateSimilarity(self, first, second):
        ''' calculates similarity of two articles
            by getting a dot product of their tfidf vectors
            (it also multiplies the result by 1000 to see the numbers better) '''
        return sum(a * b for a, b in zip(first, second)) * 10**3

    def __compare(self, names):
        ''' prints the similarity of every pair of articles from 'names'
            (each pair once, in the order the names were given)
            using __calculateSimilarity method '''
        for index, first in enumerate(names):
            for second in names[index + 1:]:
                score = self.__calculateSimilarity(self.tfidf[first],
                                                   self.tfidf[second])
                # NOTE: the score is already scaled by 1000 and the '%'
                # format multiplies it by another 100 before printing
                print('"{0}" i "{1}": {2:.2%}'.format(first, second, score))


if __name__ == '__main__':
    # Demo run: build a TFIDF comparison for five article names.
    article_names = (
        'scutisorex somereni',
        'ryjówka aksamitna',
        'sorex cinereus',
        'nectogale elegans',
        'ryjówka malutka',
    )
    TFIDF(*article_names)