AIspit

#lab1 drva

"""

Задача 1 Problem 1 (2 / 12)
Да се промени класата за дрво на одлука да чува и информација на кое ниво во дрвото се наоѓа јазолот.
Потоа да се променат и функциите за градење и печатење на дрвото така што за секој јазол ќе се печати
и нивото. Коренот е на нулто ниво. На излез со функцијата printTree треба да се испечати даденото
тренинг множество. Прочитана инстанца од стандарден влез да се додаде на тренинг множеството
и потоа да се истренира и испечати истото.

"""

trainingData=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','yes',23,'Basic'],
        ['google','France','yes',23,'Basic'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]

class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, l=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
        self.l = l


def sporedi_broj(row, column, value):
    return row[column] >= value


def sporedi_string(row, column, value):
    return row[column] == value


# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    split_function = None
    if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
        # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
        split_function = sporedi_broj
    else:
        # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
        split_function = sporedi_string

    # Divide the rows into two sets and return them
    set_false = []
    set_true = []
    for row in rows:
        if split_function(row, column, value):
            set_true.append(row)
        else:
            set_false.append(row)
    set1 = [row for row in rows if
            split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
    set2 = [row for row in rows if
            not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
    # return (set1, set2)
    return (set_true, set_false)


# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    results = {}
    for row in rows:
        # The result is the last column
        r = row[-1]
        results.setdefault(r, 0)
        results[r] += 1

    return results


def log2(x):
    from math import log
    l2 = log(x) / log(2)
    return l2


def entropy(rows):
    results = uniquecounts(rows)
    # Now calculate the entropy
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(rows)
        ent = ent - p * log2(p)
    return ent


def buildtree(rows, l=-1, scoref=entropy):
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_column = -1
    #best_value = None
    #best_subsetf = None
    #best_subsett = None
    best_criteria = None
    best_sets = None


    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Generate the list of different values in
        # this column
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1
        # Now try dividing the rows up for each value
        # in this column
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)

            # Information gain
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                #best_column = col
                #best_value = value
                #best_subsett = set1
                #best_subsetf = set2
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the subbranches
    if best_gain > 0:
        l = l+1
        trueBranch = buildtree(best_sets[0],l, scoref)
        falseBranch = buildtree(best_sets[1],l, scoref)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch,l=l)
    else:
        return decisionnode(results=uniquecounts(rows))


def printtree(tree, indent=''):
    # Is this a leaf node?
    if tree.results != None:
        print str(tree.results)
    else:
        # Print the criteria
        print(str(tree.col) + ':' + str(tree.value) + '? ') + 'Level=' + str(tree.l)
        # Print the branches
        print(indent + 'T->'),
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->'),
        printtree(tree.fb, indent + '  ')


def classify(observation, tree):
    if tree.results != None:
        return tree.results
    else:
        vrednost = observation[tree.col]
        branch = None

        if isinstance(vrednost, int) or isinstance(vrednost, float):
            if vrednost >= tree.value:
                branch = tree.tb
            else:
                branch = tree.fb
        else:
            if vrednost == tree.value:
                branch = tree.tb
            else:
                branch = tree.fb

        return classify(observation, branch)


if __name__ == "__main__":
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen=input()

    #referrer = 'google'
    #location = 'UK'
    #readFAQ = 'no',
    #pagesVisited = 18
    #serviceChosen = 'None'


    tmp = [referrer,location,readFAQ,pagesVisited,serviceChosen]
    trainingData.append(tmp)
    t = buildtree(trainingData)

    printtree(t)
-------------------------------------------------------------------------------------------------------
#lab2 Drva
"""
Да се промени функцијата за предвидување, така што таа ќе ја печати само класата
која ја предвидува (а не речник како сега). Притоа да се проверува дали во листот
има повеќе од една класа. Ако има само една класа тогаш се предвидува истата, но
ако има повеќе од една треба да се испечати таа со најголем број на инстанци. Ако
во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
"""

trainingData=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','yes',23,'Basic'],
        ['google','France','yes',23,'Basic'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]

class decisionnode:
    #lel init konstruktor?
      def __init__(self,col=-1,value=None,results=None,tb=None,fb=None):
         self.col=col
         self.value=value
         self.results=results
         self.tb=tb
         self.fb=fb

def sporedi_broj(row,column,value):
  return row[column]>=value

def sporedi_string(row,column,value):
  return row[column]==value

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows,column,value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    split_function=None #Flag za proverka
    if isinstance(value,int) or isinstance(value,float):
       # ako vrednosta so koja sporeduvame e od tip int ili float
       split_function=sporedi_broj
    else:
       # ako vrednosta so koja sporeduvame e od drug tip (string)
       split_function=sporedi_string

    # Divide the rows into two sets and return them
    set1=[row for row in rows if split_function(row,column,value)]  # za sekoj row od rows za koj split_function vrakja true
    set2=[row for row in rows if not split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja false
    return (set1,set2)

# Create counts of possible results
#(the last column (vertical result) of each row is the result)

def uniquecounts(rows):
  results={}
  for row in rows:
     # The result is the last column
     r=row[len(row)-1]
     if r not in results:
            results[r]=0
     results[r]+=1
  return results

# Entropy is the sum of p(x)log(p(x)) across all
# the different possible results
def entropy(rows):
      from math import log
      log2=lambda x:log(x)/log(2)
      results=uniquecounts(rows)
      # Now calculate the entropy
      ent=0.0
      for r in results.keys():
            p=float(results[r])/len(rows)
            ent=ent-p*log2(p)
      return ent

def buildtree(rows,scoref=entropy):
      if len(rows)==0: return decisionnode()
      current_score=scoref(rows)

      # Set up some variables to track the best criteria
      best_gain=0.0
      best_criteria=None
      best_sets=None

      column_count=len(rows[0])-1
      for col in range(0,column_count):
            # Generate the list of different values in
            # this column
            column_values={}
            for row in rows:
                  column_values[row[col]]=1

            # Now try dividing the rows up for each value
            # in this column
            for value in column_values.keys():
                  (set1,set2)=divideset(rows,col,value)

                  # Information gain
                  p=float(len(set1))/len(rows)
                  gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
                  if gain>best_gain and len(set1)>0 and len(set2)>0:
                        best_gain=gain
                        best_criteria=(col,value)
                        best_sets=(set1,set2)

      # Create the subbranches
      if best_gain>0:
            trueBranch=buildtree(best_sets[0])
            falseBranch=buildtree(best_sets[1])
            return decisionnode(col=best_criteria[0],value=best_criteria[1],tb=trueBranch, fb=falseBranch)
      else:
            return decisionnode(results=uniquecounts(rows))


def classify(observation,tree):
    if tree.results!=None:
        recnik=tree.results;
        lista=[]
        for k in recnik.keys():
            torka=(k,recnik[k])
            lista.append(torka)
        brKlasi=len(torka)
        if brKlasi==1:
            return lista[0][0]
        lista.sort()
        return lista[0][0]
    else:
        vrednost=observation[tree.col]
        branch=None

        if isinstance(vrednost,int) or isinstance(vrednost,float):
            if vrednost>=tree.value: branch=tree.tb
            else: branch=tree.fb
        else:
           if vrednost==tree.value: branch=tree.tb
           else: branch=tree.fb

    return classify(observation,branch)

if __name__ == "__main__":
    # referrer='slashdot'
    # location='UK'
    # readFAQ='no'
    # pagesVisited=21
    # serviceChosen='Unknown'

    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen=input()

    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
    t=buildtree(trainingData)
    print classify(testCase,t)
-------------------------------------------------------------------------------------------------------
#januari 2017
"""
Да се промени алгоритмот за дрво на одлука така што ќе се изградат 2 дрва на одлука.
Едното дрво на одлука ќе ја користи првата половина од податочното множество, а другото дрво,
втората половина.
Доколку двете дрва на одлука на тест примерот го дадат истиот резултат, да се испечати тој резултат.
Доколку дадат различен резултат, да се испечати KONTRADIKCIJA.
Доколку некое од дрвата има само една класа тогаш се предвидува истата,
но ако има повеќе од една треба да се избере таа со најголем број на инстанци.
Ако во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
"""
trainingData=[
[6.3,2.9,5.6,1.8,'I. virginica'],
[6.5,3.0,5.8,2.2,'I. virginica'],
[7.6,3.0,6.6,2.1,'I. virginica'],
[4.9,2.5,4.5,1.7,'I. virginica'],
[7.3,2.9,6.3,1.8,'I. virginica'],
[6.7,2.5,5.8,1.8,'I. virginica'],
[7.2,3.6,6.1,2.5,'I. virginica'],
[6.5,3.2,5.1,2.0,'I. virginica'],
[6.4,2.7,5.3,1.9,'I. virginica'],
[6.8,3.0,5.5,2.1,'I. virginica'],
[5.7,2.5,5.0,2.0,'I. virginica'],
[5.8,2.8,5.1,2.4,'I. virginica'],
[6.4,3.2,5.3,2.3,'I. virginica'],
[6.5,3.0,5.5,1.8,'I. virginica'],
[7.7,3.8,6.7,2.2,'I. virginica'],
[7.7,2.6,6.9,2.3,'I. virginica'],
[6.0,2.2,5.0,1.5,'I. virginica'],
[6.9,3.2,5.7,2.3,'I. virginica'],
[5.6,2.8,4.9,2.0,'I. virginica'],
[7.7,2.8,6.7,2.0,'I. virginica'],
[6.3,2.7,4.9,1.8,'I. virginica'],
[6.7,3.3,5.7,2.1,'I. virginica'],
[7.2,3.2,6.0,1.8,'I. virginica'],
[6.2,2.8,4.8,1.8,'I. virginica'],
[6.1,3.0,4.9,1.8,'I. virginica'],
[6.4,2.8,5.6,2.1,'I. virginica'],
[7.2,3.0,5.8,1.6,'I. virginica'],
[7.4,2.8,6.1,1.9,'I. virginica'],
[7.9,3.8,6.4,2.0,'I. virginica'],
[6.4,2.8,5.6,2.2,'I. virginica'],
[6.3,2.8,5.1,1.5,'I. virginica'],
[6.1,2.6,5.6,1.4,'I. virginica'],
[7.7,3.0,6.1,2.3,'I. virginica'],
[6.3,3.4,5.6,2.4,'I. virginica'],
[5.1,3.5,1.4,0.2,'I. setosa'],
[4.9,3.0,1.4,0.2,'I. setosa'],
[4.7,3.2,1.3,0.2,'I. setosa'],
[4.6,3.1,1.5,0.2,'I. setosa'],
[5.0,3.6,1.4,0.2,'I. setosa'],
[5.4,3.9,1.7,0.4,'I. setosa'],
[4.6,3.4,1.4,0.3,'I. setosa'],
[5.0,3.4,1.5,0.2,'I. setosa'],
[4.4,2.9,1.4,0.2,'I. setosa'],
[4.9,3.1,1.5,0.1,'I. setosa'],
[5.4,3.7,1.5,0.2,'I. setosa'],
[4.8,3.4,1.6,0.2,'I. setosa'],
[4.8,3.0,1.4,0.1,'I. setosa'],
[4.3,3.0,1.1,0.1,'I. setosa'],
[5.8,4.0,1.2,0.2,'I. setosa'],
[5.7,4.4,1.5,0.4,'I. setosa'],
[5.4,3.9,1.3,0.4,'I. setosa'],
[5.1,3.5,1.4,0.3,'I. setosa'],
[5.7,3.8,1.7,0.3,'I. setosa'],
[5.1,3.8,1.5,0.3,'I. setosa'],
[5.4,3.4,1.7,0.2,'I. setosa'],
[5.1,3.7,1.5,0.4,'I. setosa'],
[4.6,3.6,1.0,0.2,'I. setosa'],
[5.1,3.3,1.7,0.5,'I. setosa'],
[4.8,3.4,1.9,0.2,'I. setosa'],
[5.0,3.0,1.6,0.2,'I. setosa'],
[5.0,3.4,1.6,0.4,'I. setosa'],
[5.2,3.5,1.5,0.2,'I. setosa'],
[5.2,3.4,1.4,0.2,'I. setosa'],
[5.5,2.3,4.0,1.3,'I. versicolor'],
[6.5,2.8,4.6,1.5,'I. versicolor'],
[5.7,2.8,4.5,1.3,'I. versicolor'],
[6.3,3.3,4.7,1.6,'I. versicolor'],
[4.9,2.4,3.3,1.0,'I. versicolor'],
[6.6,2.9,4.6,1.3,'I. versicolor'],
[5.2,2.7,3.9,1.4,'I. versicolor'],
[5.0,2.0,3.5,1.0,'I. versicolor'],
[5.9,3.0,4.2,1.5,'I. versicolor'],
[6.0,2.2,4.0,1.0,'I. versicolor'],
[6.1,2.9,4.7,1.4,'I. versicolor'],
[5.6,2.9,3.6,1.3,'I. versicolor'],
[6.7,3.1,4.4,1.4,'I. versicolor'],
[5.6,3.0,4.5,1.5,'I. versicolor'],
[5.8,2.7,4.1,1.0,'I. versicolor'],
[6.2,2.2,4.5,1.5,'I. versicolor'],
[5.6,2.5,3.9,1.1,'I. versicolor'],
[5.9,3.2,4.8,1.8,'I. versicolor'],
[6.1,2.8,4.0,1.3,'I. versicolor'],
[6.3,2.5,4.9,1.5,'I. versicolor'],
[6.1,2.8,4.7,1.2,'I. versicolor'],
[6.4,2.9,4.3,1.3,'I. versicolor'],
[6.6,3.0,4.4,1.4,'I. versicolor'],
[6.8,2.8,4.8,1.4,'I. versicolor'],
[6.7,3.0,5.0,1.7,'I. versicolor'],
[6.0,2.9,4.5,1.5,'I. versicolor'],
[5.7,2.6,3.5,1.0,'I. versicolor'],
[5.5,2.4,3.8,1.1,'I. versicolor'],
[5.5,2.4,3.7,1.0,'I. versicolor'],
[5.8,2.7,3.9,1.2,'I. versicolor'],
[6.0,2.7,5.1,1.6,'I. versicolor'],
[5.4,3.0,4.5,1.5,'I. versicolor'],
[6.0,3.4,4.5,1.6,'I. versicolor'],
[6.7,3.1,4.7,1.5,'I. versicolor'],
[6.3,2.3,4.4,1.3,'I. versicolor'],
[5.6,3.0,4.1,1.3,'I. versicolor'],
[5.5,2.5,4.0,1.3,'I. versicolor'],
[5.5,2.6,4.4,1.2,'I. versicolor'],
[6.1,3.0,4.6,1.4,'I. versicolor'],
[5.8,2.6,4.0,1.2,'I. versicolor'],
[5.0,2.3,3.3,1.0,'I. versicolor'],
[5.6,2.7,4.2,1.3,'I. versicolor'],
[5.7,3.0,4.2,1.2,'I. versicolor'],
[5.7,2.9,4.2,1.3,'I. versicolor'],
[6.2,2.9,4.3,1.3,'I. versicolor'],
[5.1,2.5,3.0,1.1,'I. versicolor'],
[5.7,2.8,4.1,1.3,'I. versicolor'],
[6.4,3.1,5.5,1.8,'I. virginica'],
[6.0,3.0,4.8,1.8,'I. virginica'],
[6.9,3.1,5.4,2.1,'I. virginica'],
[6.7,3.1,5.6,2.4,'I. virginica'],
[6.9,3.1,5.1,2.3,'I. virginica'],
[5.8,2.7,5.1,1.9,'I. virginica'],
[6.8,3.2,5.9,2.3,'I. virginica'],
[6.7,3.3,5.7,2.5,'I. virginica'],
[6.7,3.0,5.2,2.3,'I. virginica'],
[6.3,2.5,5.0,1.9,'I. virginica'],
[6.5,3.0,5.2,2.0,'I. virginica'],
[6.2,3.4,5.4,2.3,'I. virginica'],
[4.7,3.2,1.6,0.2,'I. setosa'],
[4.8,3.1,1.6,0.2,'I. setosa'],
[5.4,3.4,1.5,0.4,'I. setosa'],
[5.2,4.1,1.5,0.1,'I. setosa'],
[5.5,4.2,1.4,0.2,'I. setosa'],
[4.9,3.1,1.5,0.2,'I. setosa'],
[5.0,3.2,1.2,0.2,'I. setosa'],
[5.5,3.5,1.3,0.2,'I. setosa'],
[4.9,3.6,1.4,0.1,'I. setosa'],
[4.4,3.0,1.3,0.2,'I. setosa'],
[5.1,3.4,1.5,0.2,'I. setosa'],
[5.0,3.5,1.3,0.3,'I. setosa'],
[4.5,2.3,1.3,0.3,'I. setosa'],
[4.4,3.2,1.3,0.2,'I. setosa'],
[5.0,3.5,1.6,0.6,'I. setosa'],
[5.1,3.8,1.9,0.4,'I. setosa'],
[4.8,3.0,1.4,0.3,'I. setosa'],
[5.1,3.8,1.6,0.2,'I. setosa'],
[5.9,3.0,5.1,1.8,'I. virginica']
]

class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb


def sporedi_broj(row, column, value):
    return row[column] >= value


def sporedi_string(row, column, value):
    return row[column] == value


# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    split_function = None
    if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
        # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
        split_function = sporedi_broj
    else:
        # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
        split_function = sporedi_string

    # Divide the rows into two sets and return them
    set_false = []
    set_true = []
    for row in rows:
        if split_function(row, column, value):
            set_true.append(row)
        else:
            set_false.append(row)
    set1 = [row for row in rows if
            split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
    set2 = [row for row in rows if
            not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
    # return (set1, set2)
    return (set_true, set_false)

def uniquecounts(rows):
    results = {}
    for row in rows:
        # The result is the last column
        r = row[-1]
        results.setdefault(r, 0)
        results[r] += 1

    return results

def log2(x):
    from math import log
    l2 = log(x) / log(2)
    return l2

def entropy(rows):
    results = uniquecounts(rows)
    # Now calculate the entropy
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(rows)
        ent = ent - p * log2(p)
    return ent
def buildtree(rows, scoref=entropy):
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_column = -1
    best_value = None
    best_subsetf = None
    best_subsett = None

    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Generate the list of different values in
        # this column
        column_values = set()
        for row in rows:
            column_values.add(row[col])
        # Now try dividing the rows up for each value
        # in this column
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)

            # Information gain
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_column = col
                best_value = value
                best_subsett = set1
                best_subsetf = set2
                # best_criteria = (col, value)
                # best_sets = (set1, set2)

    # Create the subbranches
    if best_gain > 0:
        trueBranch = buildtree(best_subsett, scoref)
        falseBranch = buildtree(best_subsetf, scoref)
        return decisionnode(col=best_column, value=best_value,
                            tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))

def classify(observation, tree):
        if tree.results != None:
            return tree.results
        else:
            vrednost = observation[tree.col]
            branch = None

            if isinstance(vrednost, int) or isinstance(vrednost, float):
                if vrednost >= tree.value:
                    branch = tree.tb
                else:
                    branch = tree.fb
            else:
                if vrednost == tree.value:
                    branch = tree.tb
                else:
                    branch = tree.fb

            return classify(observation, branch)
if __name__ == "__main__":


    att1=input()
    att2=input()
    att3=input()
    att4=input()
    planttype=input()
    testCase=[att1,att2,att3,att4,planttype]
    mn1=[] #kreiranje prvo mnozestvo
    mn2=[] #kreiranje vtoro mnozestvo
    vkupno=len(trainingData) #vkupno redovi
    for i in range(0,vkupno/2): #polnenje na prvo trening mnozhestvo
        mn1.append(trainingData[i])
    for i in range(vkupno/2,vkupno): # polnenje na vtoro trening mnozhestvo
        mn2.append(trainingData[i])
    drvo1=buildtree(mn1) #kreiranje prvo drvo od prvo trening mnozhestvo
    drvo2=buildtree(mn2) #kreiranje vtoro drvo od vtoro trening mnozhestvo
    kl1=classify(testCase,drvo1) #prva klasifikacija
    kl2=classify(testCase,drvo2) #vtora klasifikacija
    if (kl1.keys()[0]==kl2.keys()[0]): #proverka dali se isti
        print kl1.keys()[0]
    if(kl1.values()[0]>kl2.values()[0]):
        print kl1.keys()[0]
        print 'KONTRADIKCIJA'
    if(kl2.values()[0]>kl1.values()[0]):
        print kl1.keys()[0]
        print 'KONTRADIKCIJA'
-------------------------------------------------------------------------------------------------------
#januari 2018 - Drva
Да се промени функцијата за предвидување, така што при изминувањето ќе печати информации за:
-со која колона и вредност се споредува
-за која е тековната вредност на тест примерокот за бараната колона
-нивото на тековниот јазол во дрвото
-која е следната гранка што ќе се изминува низ дрвото (True branch или False branch)
-преостанатиот дел од дрвото што треба да се измине
-празна линија

Потоа да се испечати истренираното дрво, да се вчита непознат тренинг примерок од стандардниот влез
и истиот да се класифицира со новата функција за предвидување.

trainingData=[['twitter','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['google','France','no',26,'Basic'],
        ['google','Macedonia','yes',13,'None'],
        ['pinterest','USA','yes',24,'Basic'],
        ['bing','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['facebook','New Zealand','no',12,'None'],
        ['facebook','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['twitter','France','yes',19,'None'],
        ['pinterest','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['bing','UK','yes',19,'Premium'],
        ['bing','Macedonia','no',10,'None'],
        ['facebook','Macedonia','no',16,'Basic'],
        ['bing','UK','no',19,'Basic'],
        ['pinterest','Germany','no',2,'None'],
        ['pinterest','USA','yes',12,'Basic'],
        ['twitter','UK','no',21,'None'],
        ['twitter','UK','yes',26,'Premium'],
        ['google','UK','yes',18,'Basic'],
        ['bing','France','yes',19,'Basic']]

test_cases=[['google','MK','no',24,'Unknown'],
            ['google','MK','no',15,'Unknown'],
            ['pinterest','UK','yes',21,'Unknown'],
            ['pinterest','UK','no',25,'Unknown']]

# trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]

class decisionnode:
      def __init__(self,col=-1,value=None,results=None,tb=None,fb=None,level=0):
         self.level=level
         self.col=col
         self.value=value
         self.results=results
         self.tb=tb
         self.fb=fb
         self.level=level

def sporedi_broj(row,column,value):
  return row[column]>=value

def sporedi_string(row,column,value):
  return row[column]==value

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows,column,value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    split_function=None
    if isinstance(value,int) or isinstance(value,float): # ako vrednosta so koja sporeduvame e od tip int ili float
       #split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
       split_function=sporedi_broj
    else:
       # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
       split_function=sporedi_string

    # Divide the rows into two sets and return them
    # set1=[row for row in rows if split_function(row)]  # za sekoj row od rows za koj split_function vrakja true
    # set2=[row for row in rows if not split_function(row)] # za sekoj row od rows za koj split_function vrakja false
    set1=[row for row in rows if split_function(row,column,value)]  # za sekoj row od rows za koj split_function vrakja true
    set2=[row for row in rows if not split_function(row,column,value)] # za sekoj row od rows za koj split_function vrakja false

    return (set1,set2)

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset2(rows,column,value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    split_function=None
    if isinstance(value,int) or isinstance(value,float): # ako vrednosta so koja sporeduvame e od tip int ili float
       #split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
       split_function=sporedi_broj
    else:
       # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
       split_function=sporedi_string

    # Divide the rows into two sets and return them
    # set1=[row for row in rows if split_function(row)]  # za sekoj row od rows za koj split_function vrakja true
    # set2=[row for row in rows if not split_function(row)] # za sekoj row od rows za koj split_function vrakja false
    set1=[]
    set2=[]
    for row in rows:
      if split_function(row,column,value):
        set1.append(row)
      else:
        set2.append(row)
    return (set1,set2)


# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
  results={}
  for row in rows:
     # The result is the last column
     r=row[len(row)-1]
     if r not in results: results[r]=0
     results[r]+=1
  return results


# Entropy is the sum of p(x)log(p(x)) across all
# the different possible results
def entropy(rows):
      from math import log
      log2=lambda x:log(x)/log(2)
      results=uniquecounts(rows)
      # Now calculate the entropy
      ent=0.0
      for r in results.keys():
            p=float(results[r])/len(rows)
            ent=ent-p*log2(p)
      return ent

def buildtree(rows,scoref=entropy,level=0):
      if len(rows)==0: return decisionnode()
      current_score=scoref(rows)

      # Set up some variables to track the best criteria
      best_gain=0.0
      best_criteria=None
      best_sets=None

      column_count=len(rows[0])-1
      for col in range(0,column_count):
            # Generate the list of different values in
            # this column
            column_values={}
            for row in rows:
                  column_values[row[col]]=1
                  # print row[col]
            # print
            # print column_values
            # Now try dividing the rows up for each value
            # in this column
            for value in column_values.keys():
                  (set1,set2)=divideset(rows,col,value)

                  # Information gain
                  p=float(len(set1))/len(rows)
                  gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
                  # print set1, set2, gain
                  if gain>best_gain and len(set1)>0 and len(set2)>0:
                        best_gain=gain
                        best_criteria=(col,value)
                        best_sets=(set1,set2)

      # Create the subbranches
      if best_gain>0:
            trueBranch=buildtree(best_sets[0],level=level+1)
            falseBranch=buildtree(best_sets[1], level=level+1)
            return decisionnode(col=best_criteria[0],value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch, level=level)
      else:
            return decisionnode(results=uniquecounts(rows))

def printtree(tree,indent=''):
      # Is this a leaf node?
      if tree.results!=None:
            print str(tree.results)
      else:
            # Print the criteria
            print str(tree.col)+':'+str(tree.value)+'?' + ' Level='+str(tree.level)
            # Print the branches
            print indent+'T->',
            printtree(tree.tb,indent+'  ')
            print indent+'F->',
            printtree(tree.fb,indent+'  ')

def classify(observation,tree):
    if tree.results!=None:
        results=[(value,key) for key,value in tree.results.items()]
        results.sort()
        return results[0][1]
    else:
        vrednost=observation[tree.col]
        branch=None

        if isinstance(vrednost,int) or isinstance(vrednost,float):
            if vrednost>=tree.value: branch=tree.tb
            else: branch=tree.fb
        else:
           if vrednost==tree.value: branch=tree.tb
           else: branch=tree.fb

        return classify(observation,branch)


def classify2(observation,tree):
    if tree.results!=None:
        results=[(value,key) for key,value in tree.results.items()]
        results.sort()
        return results[0][1]
    else:
        vrednost=observation[tree.col]
        branch=None

        if isinstance(vrednost,int) or isinstance(vrednost,float):
            if vrednost>=tree.value: branch=tree.tb
            else: branch=tree.fb
        else:
           if vrednost==tree.value: branch=tree.tb
           else: branch=tree.fb

        return classify2(observation,branch)

def classify3(observation,tree):
    if tree.results!=None:
        results=[(value,key) for key,value in tree.results.items()]
        results.sort()
        return results[0][1]
    else:
        vrednost=observation[tree.col]
        branch=None
        granka='True branch'
        if isinstance(vrednost,int) or isinstance(vrednost,float):
            if vrednost>=tree.value:
                branch=tree.tb
            else:
                branch=tree.fb
                granka='False branch'
        else:
           if vrednost==tree.value:
               branch=tree.tb
           else:
               branch=tree.fb
               granka='False branch'
        print 'Sporeduvam kolona i vrednost', (tree.col, tree.value)
        print 'Tekovna vrednost:', vrednost
        print 'Sledna granka:',granka
        print 'Preostanata granka za izminuvanje:'
        printtree(branch)
        print
        return classify3(observation,branch)

if __name__ == "__main__":
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen='Unknown'


    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]

    t=buildtree(trainingData)
    printtree(t)
    print classify3(testCase,t)

--------------------------------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


#Дадено е податочно множество од риби кое ги содржи следните видови:

#    Code Finnish  Swedish    English        Latin

   #  1   Lahna    Braxen     Bream          Abramis brama
 #    2   Siika    Iiden      Whitewish      Leusiscus idus
   #  3   Saerki   Moerten    Roach          Leuciscus rutilus
 #    4   Parkki   Bjoerknan  ?              Abramis bjrkna
  #   5   Norssi   Norssen    Smelt          Osmerus eperlanus
   #  6   Hauki    Jaedda     Pike           Esox lucius
   #  7   Ahven    Abborre    Perch          Perca fluviatilis
#Дадени се следните атрибути:

  #  0  Weight      Weight of the fish (in grams)
  #  1  Length1     Length from the nose to the beginning of the tail (in cm)
  #  2  Length2     Length from the nose to the notch of the tail (in cm)
  #  3  Length3     Length from the nose to the end of the tail (in cm)
   # 4  Height%     Maximal height as % of Length3
   # 5  Width%      Maximal width as % of Length3
#Класата е дадена во последната колона.

#Да се направи модел за класификација за даденото податочно множество. За тренинг да се земат
#само првите 5 примероци од секоја од класите во множеството. Притоа ова да се направи во програмата,
# а не со рачно копирање! Да се класифицира елементот од податочното множество даден на влез и да се
# испечати предвидувањето. Елементот е даден со индексот од податочното множество.


data = [[242.0, 23.2, 25.4, 30.0, 38.4, 13.4, 1],
        [290.0, 24.0, 26.3, 31.2, 40.0, 13.8, 1],
        [340.0, 23.9, 26.5, 31.1, 39.8, 15.1, 1],
        [363.0, 26.3, 29.0, 33.5, 38.0, 13.3, 1],
        [430.0, 26.5, 29.0, 34.0, 36.6, 15.1, 1],
        [450.0, 26.8, 29.7, 34.7, 39.2, 14.2, 1],
        [500.0, 26.8, 29.7, 34.5, 41.1, 15.3, 1],
        [390.0, 27.6, 30.0, 35.0, 36.2, 13.4, 1],
        [450.0, 27.6, 30.0, 35.1, 39.9, 13.8, 1],
        [500.0, 28.5, 30.7, 36.2, 39.3, 13.7, 1],
        [475.0, 28.4, 31.0, 36.2, 39.4, 14.1, 1],
        [500.0, 28.7, 31.0, 36.2, 39.7, 13.3, 1],
        [500.0, 29.1, 31.5, 36.4, 37.8, 12.0, 1],
        [500.0, 29.5, 32.0, 37.3, 37.3, 13.6, 1],
        [600.0, 29.4, 32.0, 37.2, 40.2, 13.9, 1],
        [600.0, 29.4, 32.0, 37.2, 41.5, 15.0, 1],
        [700.0, 30.4, 33.0, 38.3, 38.8, 13.8, 1],
        [700.0, 30.4, 33.0, 38.5, 38.8, 13.5, 1],
        [610.0, 30.9, 33.5, 38.6, 40.5, 13.3, 1],
        [650.0, 31.0, 33.5, 38.7, 37.4, 14.8, 1],
        [575.0, 31.3, 34.0, 39.5, 38.3, 14.1, 1],
        [685.0, 31.4, 34.0, 39.2, 40.8, 13.7, 1],
        [620.0, 31.5, 34.5, 39.7, 39.1, 13.3, 1],
        [680.0, 31.8, 35.0, 40.6, 38.1, 15.1, 1],
        [700.0, 31.9, 35.0, 40.5, 40.1, 13.8, 1],
        [725.0, 31.8, 35.0, 40.9, 40.0, 14.8, 1],
        [720.0, 32.0, 35.0, 40.6, 40.3, 15.0, 1],
        [714.0, 32.7, 36.0, 41.5, 39.8, 14.1, 1],
        [850.0, 32.8, 36.0, 41.6, 40.6, 14.9, 1],
        [1000.0, 33.5, 37.0, 42.6, 44.5, 15.5, 1],
        [920.0, 35.0, 38.5, 44.1, 40.9, 14.3, 1],
        [955.0, 35.0, 38.5, 44.0, 41.1, 14.3, 1],
        [925.0, 36.2, 39.5, 45.3, 41.4, 14.9, 1],
        [975.0, 37.4, 41.0, 45.9, 40.6, 14.7, 1],
        [950.0, 38.0, 41.0, 46.5, 37.9, 13.7, 1],
        [270.0, 23.6, 26.0, 28.7, 29.2, 14.8, 2],
        [270.0, 24.1, 26.5, 29.3, 27.8, 14.5, 2],
        [306.0, 25.6, 28.0, 30.8, 28.5, 15.2, 2],
        [540.0, 28.5, 31.0, 34.0, 31.6, 19.3, 2],
        [800.0, 33.7, 36.4, 39.6, 29.7, 16.6, 2],
        [1000.0, 37.3, 40.0, 43.5, 28.4, 15.0, 2],
        [40.0, 12.9, 14.1, 16.2, 25.6, 14.0, 3],
        [69.0, 16.5, 18.2, 20.3, 26.1, 13.9, 3],
        [78.0, 17.5, 18.8, 21.2, 26.3, 13.7, 3],
        [87.0, 18.2, 19.8, 22.2, 25.3, 14.3, 3],
        [120.0, 18.6, 20.0, 22.2, 28.0, 16.1, 3],
        [0.0, 19.0, 20.5, 22.8, 28.4, 14.7, 3],
        [110.0, 19.1, 20.8, 23.1, 26.7, 14.7, 3],
        [120.0, 19.4, 21.0, 23.7, 25.8, 13.9, 3],
        [150.0, 20.4, 22.0, 24.7, 23.5, 15.2, 3],
        [145.0, 20.5, 22.0, 24.3, 27.3, 14.6, 3],
        [160.0, 20.5, 22.5, 25.3, 27.8, 15.1, 3],
        [140.0, 21.0, 22.5, 25.0, 26.2, 13.3, 3],
        [160.0, 21.1, 22.5, 25.0, 25.6, 15.2, 3],
        [169.0, 22.0, 24.0, 27.2, 27.7, 14.1, 3],
        [161.0, 22.0, 23.4, 26.7, 25.9, 13.6, 3],
        [200.0, 22.1, 23.5, 26.8, 27.6, 15.4, 3],
        [180.0, 23.6, 25.2, 27.9, 25.4, 14.0, 3],
        [290.0, 24.0, 26.0, 29.2, 30.4, 15.4, 3],
        [272.0, 25.0, 27.0, 30.6, 28.0, 15.6, 3],
        [390.0, 29.5, 31.7, 35.0, 27.1, 15.3, 3],
        [55.0, 13.5, 14.7, 16.5, 41.5, 14.1, 4],
        [60.0, 14.3, 15.5, 17.4, 37.8, 13.3, 4],
        [90.0, 16.3, 17.7, 19.8, 37.4, 13.5, 4],
        [120.0, 17.5, 19.0, 21.3, 39.4, 13.7, 4],
        [150.0, 18.4, 20.0, 22.4, 39.7, 14.7, 4],
        [140.0, 19.0, 20.7, 23.2, 36.8, 14.2, 4],
        [170.0, 19.0, 20.7, 23.2, 40.5, 14.7, 4],
        [145.0, 19.8, 21.5, 24.1, 40.4, 13.1, 4],
        [200.0, 21.2, 23.0, 25.8, 40.1, 14.2, 4],
        [273.0, 23.0, 25.0, 28.0, 39.6, 14.8, 4],
        [300.0, 24.0, 26.0, 29.0, 39.2, 14.6, 4],
        [6.7, 9.3, 9.8, 10.8, 16.1, 9.7, 5],
        [7.5, 10.0, 10.5, 11.6, 17.0, 10.0, 5],
        [7.0, 10.1, 10.6, 11.6, 14.9, 9.9, 5],
        [9.7, 10.4, 11.0, 12.0, 18.3, 11.5, 5],
        [9.8, 10.7, 11.2, 12.4, 16.8, 10.3, 5],
        [8.7, 10.8, 11.3, 12.6, 15.7, 10.2, 5],
        [10.0, 11.3, 11.8, 13.1, 16.9, 9.8, 5],
        [9.9, 11.3, 11.8, 13.1, 16.9, 8.9, 5],
        [9.8, 11.4, 12.0, 13.2, 16.7, 8.7, 5],
        [12.2, 11.5, 12.2, 13.4, 15.6, 10.4, 5],
        [13.4, 11.7, 12.4, 13.5, 18.0, 9.4, 5],
        [12.2, 12.1, 13.0, 13.8, 16.5, 9.1, 5],
        [19.7, 13.2, 14.3, 15.2, 18.9, 13.6, 5],
        [19.9, 13.8, 15.0, 16.2, 18.1, 11.6, 5],
        [200.0, 30.0, 32.3, 34.8, 16.0, 9.7, 6],
        [300.0, 31.7, 34.0, 37.8, 15.1, 11.0, 6],
        [300.0, 32.7, 35.0, 38.8, 15.3, 11.3, 6],
        [300.0, 34.8, 37.3, 39.8, 15.8, 10.1, 6],
        [430.0, 35.5, 38.0, 40.5, 18.0, 11.3, 6],
        [345.0, 36.0, 38.5, 41.0, 15.6, 9.7, 6],
        [456.0, 40.0, 42.5, 45.5, 16.0, 9.5, 6],
        [510.0, 40.0, 42.5, 45.5, 15.0, 9.8, 6],
        [540.0, 40.1, 43.0, 45.8, 17.0, 11.2, 6],
        [500.0, 42.0, 45.0, 48.0, 14.5, 10.2, 6],
        [567.0, 43.2, 46.0, 48.7, 16.0, 10.0, 6],
        [770.0, 44.8, 48.0, 51.2, 15.0, 10.5, 6],
        [950.0, 48.3, 51.7, 55.1, 16.2, 11.2, 6],
        [1250.0, 52.0, 56.0, 59.7, 17.9, 11.7, 6],
        [1600.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
        [1550.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
        [1650.0, 59.0, 63.4, 68.0, 15.9, 11.0, 6],
        [5.9, 7.5, 8.4, 8.8, 24.0, 16.0, 7],
        [32.0, 12.5, 13.7, 14.7, 24.0, 13.6, 7],
        [40.0, 13.8, 15.0, 16.0, 23.9, 15.2, 7],
        [51.5, 15.0, 16.2, 17.2, 26.7, 15.3, 7],
        [70.0, 15.7, 17.4, 18.5, 24.8, 15.9, 7],
        [100.0, 16.2, 18.0, 19.2, 27.2, 17.3, 7],
        [78.0, 16.8, 18.7, 19.4, 26.8, 16.1, 7],
        [80.0, 17.2, 19.0, 20.2, 27.9, 15.1, 7],
        [85.0, 17.8, 19.6, 20.8, 24.7, 14.6, 7],
        [85.0, 18.2, 20.0, 21.0, 24.2, 13.2, 7],
        [110.0, 19.0, 21.0, 22.5, 25.3, 15.8, 7],
        [115.0, 19.0, 21.0, 22.5, 26.3, 14.7, 7],
        [125.0, 19.0, 21.0, 22.5, 25.3, 16.3, 7],
        [130.0, 19.3, 21.3, 22.8, 28.0, 15.5, 7],
        [120.0, 20.0, 22.0, 23.5, 26.0, 14.5, 7],
        [120.0, 20.0, 22.0, 23.5, 24.0, 15.0, 7],
        [130.0, 20.0, 22.0, 23.5, 26.0, 15.0, 7],
        [135.0, 20.0, 22.0, 23.5, 25.0, 15.0, 7],
        [110.0, 20.0, 22.0, 23.5, 23.5, 17.0, 7],
        [130.0, 20.5, 22.5, 24.0, 24.4, 15.1, 7],
        [150.0, 20.5, 22.5, 24.0, 28.3, 15.1, 7],
        [145.0, 20.7, 22.7, 24.2, 24.6, 15.0, 7],
        [150.0, 21.0, 23.0, 24.5, 21.3, 14.8, 7],
        [170.0, 21.5, 23.5, 25.0, 25.1, 14.9, 7],
        [225.0, 22.0, 24.0, 25.5, 28.6, 14.6, 7],
        [145.0, 22.0, 24.0, 25.5, 25.0, 15.0, 7],
        [188.0, 22.6, 24.6, 26.2, 25.7, 15.9, 7],
        [180.0, 23.0, 25.0, 26.5, 24.3, 13.9, 7],
        [197.0, 23.5, 25.6, 27.0, 24.3, 15.7, 7],
        [218.0, 25.0, 26.5, 28.0, 25.6, 14.8, 7],
        [300.0, 25.2, 27.3, 28.7, 29.0, 17.9, 7],
        [260.0, 25.4, 27.5, 28.9, 24.8, 15.0, 7],
        [265.0, 25.4, 27.5, 28.9, 24.4, 15.0, 7],
        [250.0, 25.4, 27.5, 28.9, 25.2, 15.8, 7],
        [250.0, 25.9, 28.0, 29.4, 26.6, 14.3, 7],
        [300.0, 26.9, 28.7, 30.1, 25.2, 15.4, 7],
        [320.0, 27.8, 30.0, 31.6, 24.1, 15.1, 7],
        [514.0, 30.5, 32.8, 34.0, 29.5, 17.7, 7],
        [556.0, 32.0, 34.5, 36.5, 28.1, 17.5, 7],
        [840.0, 32.5, 35.0, 37.3, 30.8, 20.9, 7],
        [685.0, 34.0, 36.5, 39.0, 27.9, 17.6, 7],
        [700.0, 34.0, 36.0, 38.3, 27.7, 17.6, 7],
        [700.0, 34.5, 37.0, 39.4, 27.5, 15.9, 7],
        [690.0, 34.6, 37.0, 39.3, 26.9, 16.2, 7],
        [900.0, 36.5, 39.0, 41.4, 26.9, 18.1, 7],
        [650.0, 36.5, 39.0, 41.4, 26.9, 14.5, 7],
        [820.0, 36.6, 39.0, 41.3, 30.1, 17.8, 7],
        [850.0, 36.9, 40.0, 42.3, 28.2, 16.8, 7],
        [900.0, 37.0, 40.0, 42.5, 27.6, 17.0, 7],
        [1015.0, 37.0, 40.0, 42.4, 29.2, 17.6, 7],
        [820.0, 37.1, 40.0, 42.5, 26.2, 15.6, 7],
        [1100.0, 39.0, 42.0, 44.6, 28.7, 15.4, 7],
        [1000.0, 39.8, 43.0, 45.2, 26.4, 16.1, 7],
        [1100.0, 40.1, 43.0, 45.5, 27.5, 16.3, 7],
        [1000.0, 40.2, 43.5, 46.0, 27.4, 17.7, 7],
        [1000.0, 41.1, 44.0, 46.6, 26.8, 16.3, 7]]


class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb


def sporedi_broj(row, column, value):
    return row[column] >= value


def sporedi_string(row, column, value):
    return row[column] == value


# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    split_function = None
    if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
        # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
        split_function = sporedi_broj
    else:
        # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
        split_function = sporedi_string

    # Divide the rows into two sets and return them
    set_false = []
    set_true = []
    for row in rows:
        if split_function(row, column, value):
            set_true.append(row)
        else:
            set_false.append(row)
    set1 = [row for row in rows if
            split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
    set2 = [row for row in rows if
            not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
    # return (set1, set2)
    return (set_true, set_false)


# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    results = {}
    for row in rows:
        # The result is the last column
        r = row[-1]
        results.setdefault(r, 0)
        results[r] += 1

    return results


# Probability that a randomly placed item will
# be in the wrong category

def log2(x):
    from math import log
    l2 = log(x) / log(2)
    return l2


# Entropy is the sum of p(x)log(p(x)) across all
# the different possible results
def entropy(rows):
    results = uniquecounts(rows)
    # Now calculate the entropy
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(rows)
        ent = ent - p * log2(p)
    return ent


def buildtree(rows, scoref=entropy):
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_column = -1
    best_value = None
    best_subsetf = None
    best_subsett = None

    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Generate the list of different values in
        # this column
        column_values = set()
        for row in rows:
            column_values.add(row[col])
        # Now try dividing the rows up for each value
        # in this column
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)

            # Information gain
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_column = col
                best_value = value
                best_subsett = set1
                best_subsetf = set2
                # best_criteria = (col, value)
                # best_sets = (set1, set2)

    # Create the subbranches
    if best_gain > 0:
        trueBranch = buildtree(best_subsett, scoref)
        falseBranch = buildtree(best_subsetf, scoref)
        return decisionnode(col=best_column, value=best_value,
                            tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))


def printtree(tree, indent=''):
    # Is this a leaf node?
    if tree.results != None:
        print(indent + str(sorted(tree.results.items())))
    else:
        # Print the criteria
        print(indent + str(tree.col) + ':' + str(tree.value) + '? ')
        # Print the branches
        print(indent + 'T->')
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->')
        printtree(tree.fb, indent + '  ')


def classify(observation, tree):
    if tree.results != None:
        return tree.results
    else:
        vrednost = observation[tree.col]
        branch = None

        if isinstance(vrednost, int) or isinstance(vrednost, float):
            if vrednost >= tree.value:
                branch = tree.tb
            else:
                branch = tree.fb
        else:
            if vrednost == tree.value:
                branch = tree.tb
            else:
                branch = tree.fb

        return classify(observation, branch)


def new_data_set_fu(data):
    classes = {}
    for i in data:
        # print(i)
        classes.setdefault(i[-1])

    classes_data = list(classes)

    flag = 0
    tmp = 0
    data_set = []
    for i in data:

        if flag == 5:
            flag = 0
            tmp += 1
            if tmp == classes_data[-1]:
                break
        elif classes_data[tmp] == i[-1]:
            data_set.append(i)
            # print(i)
            flag += 1

    return data_set

if __name__ == '__main__':

    my_index = 5

    data_set = new_data_set_fu(data)
    t = buildtree(data_set)

    #printtree(t)

    c = classify(data[my_index],t)

    print(c)
----------------------------------------------------------------------------------------------------
Drva ispitna
"""

Дрва на одлука (100 поени) Problem 2 (0 / 16)
Да се промени алгоритмот за дрво на одлука така што ќе се изградат две дрва на одлука. Секое од дрвата го користи половина од податочното множество.

Да се промени начинот на печатење на дрвото така што покрај секој јазол, ќе се испечати и неговото ниво.

Двете дрва да се испечатат и потоа да се испечати резултатот од класификацијата.

Доколку некое од дрвата има само една класа тогаш се предвидува истата, но ако има повеќе од една треба да се избере таа со најголем број на инстанци. Ако во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.

Доколку двете дрва ја предвидат истата класа да се испечати класата. Во спротивно да се испечати KONTRADIKCIJA.


"""

trainingData = [
    [6.3, 2.9, 5.6, 1.8, 'I. virginica'],
    [6.5, 3.0, 5.8, 2.2, 'I. virginica'],
    [7.6, 3.0, 6.6, 2.1, 'I. virginica'],
    [4.9, 2.5, 4.5, 1.7, 'I. virginica'],
    [7.3, 2.9, 6.3, 1.8, 'I. virginica'],
    [6.7, 2.5, 5.8, 1.8, 'I. virginica'],
    [7.2, 3.6, 6.1, 2.5, 'I. virginica'],
    [6.5, 3.2, 5.1, 2.0, 'I. virginica'],
    [6.4, 2.7, 5.3, 1.9, 'I. virginica'],
    [6.8, 3.0, 5.5, 2.1, 'I. virginica'],
    [5.7, 2.5, 5.0, 2.0, 'I. virginica'],
    [5.8, 2.8, 5.1, 2.4, 'I. virginica'],
    [6.4, 3.2, 5.3, 2.3, 'I. virginica'],
    [6.5, 3.0, 5.5, 1.8, 'I. virginica'],
    [7.7, 3.8, 6.7, 2.2, 'I. virginica'],
    [7.7, 2.6, 6.9, 2.3, 'I. virginica'],
    [6.0, 2.2, 5.0, 1.5, 'I. virginica'],
    [6.9, 3.2, 5.7, 2.3, 'I. virginica'],
    [5.6, 2.8, 4.9, 2.0, 'I. virginica'],
    [7.7, 2.8, 6.7, 2.0, 'I. virginica'],
    [6.3, 2.7, 4.9, 1.8, 'I. virginica'],
    [6.7, 3.3, 5.7, 2.1, 'I. virginica'],
    [7.2, 3.2, 6.0, 1.8, 'I. virginica'],
    [6.2, 2.8, 4.8, 1.8, 'I. virginica'],
    [6.1, 3.0, 4.9, 1.8, 'I. virginica'],
    [6.4, 2.8, 5.6, 2.1, 'I. virginica'],
    [7.2, 3.0, 5.8, 1.6, 'I. virginica'],
    [7.4, 2.8, 6.1, 1.9, 'I. virginica'],
    [7.9, 3.8, 6.4, 2.0, 'I. virginica'],
    [6.4, 2.8, 5.6, 2.2, 'I. virginica'],
    [6.3, 2.8, 5.1, 1.5, 'I. virginica'],
    [6.1, 2.6, 5.6, 1.4, 'I. virginica'],
    [7.7, 3.0, 6.1, 2.3, 'I. virginica'],
    [6.3, 3.4, 5.6, 2.4, 'I. virginica'],
    [5.1, 3.5, 1.4, 0.2, 'I. setosa'],
    [4.9, 3.0, 1.4, 0.2, 'I. setosa'],
    [4.7, 3.2, 1.3, 0.2, 'I. setosa'],
    [4.6, 3.1, 1.5, 0.2, 'I. setosa'],
    [5.0, 3.6, 1.4, 0.2, 'I. setosa'],
    [5.4, 3.9, 1.7, 0.4, 'I. setosa'],
    [4.6, 3.4, 1.4, 0.3, 'I. setosa'],
    [5.0, 3.4, 1.5, 0.2, 'I. setosa'],
    [4.4, 2.9, 1.4, 0.2, 'I. setosa'],
    [4.9, 3.1, 1.5, 0.1, 'I. setosa'],
    [5.4, 3.7, 1.5, 0.2, 'I. setosa'],
    [4.8, 3.4, 1.6, 0.2, 'I. setosa'],
    [4.8, 3.0, 1.4, 0.1, 'I. setosa'],
    [4.3, 3.0, 1.1, 0.1, 'I. setosa'],
    [5.8, 4.0, 1.2, 0.2, 'I. setosa'],
    [5.7, 4.4, 1.5, 0.4, 'I. setosa'],
    [5.4, 3.9, 1.3, 0.4, 'I. setosa'],
    [5.1, 3.5, 1.4, 0.3, 'I. setosa'],
    [5.7, 3.8, 1.7, 0.3, 'I. setosa'],
    [5.1, 3.8, 1.5, 0.3, 'I. setosa'],
    [5.4, 3.4, 1.7, 0.2, 'I. setosa'],
    [5.1, 3.7, 1.5, 0.4, 'I. setosa'],
    [4.6, 3.6, 1.0, 0.2, 'I. setosa'],
    [5.1, 3.3, 1.7, 0.5, 'I. setosa'],
    [4.8, 3.4, 1.9, 0.2, 'I. setosa'],
    [5.0, 3.0, 1.6, 0.2, 'I. setosa'],
    [5.0, 3.4, 1.6, 0.4, 'I. setosa'],
    [5.2, 3.5, 1.5, 0.2, 'I. setosa'],
    [5.2, 3.4, 1.4, 0.2, 'I. setosa'],
    [5.5, 2.3, 4.0, 1.3, 'I. versicolor'],
    [6.5, 2.8, 4.6, 1.5, 'I. versicolor'],
    [5.7, 2.8, 4.5, 1.3, 'I. versicolor'],
    [6.3, 3.3, 4.7, 1.6, 'I. versicolor'],
    [4.9, 2.4, 3.3, 1.0, 'I. versicolor'],
    [6.6, 2.9, 4.6, 1.3, 'I. versicolor'],
    [5.2, 2.7, 3.9, 1.4, 'I. versicolor'],
    [5.0, 2.0, 3.5, 1.0, 'I. versicolor'],
    [5.9, 3.0, 4.2, 1.5, 'I. versicolor'],
    [6.0, 2.2, 4.0, 1.0, 'I. versicolor'],
    [6.1, 2.9, 4.7, 1.4, 'I. versicolor'],
    [5.6, 2.9, 3.6, 1.3, 'I. versicolor'],
    [6.7, 3.1, 4.4, 1.4, 'I. versicolor'],
    [5.6, 3.0, 4.5, 1.5, 'I. versicolor'],
    [5.8, 2.7, 4.1, 1.0, 'I. versicolor'],
    [6.2, 2.2, 4.5, 1.5, 'I. versicolor'],
    [5.6, 2.5, 3.9, 1.1, 'I. versicolor'],
    [5.9, 3.2, 4.8, 1.8, 'I. versicolor'],
    [6.1, 2.8, 4.0, 1.3, 'I. versicolor'],
    [6.3, 2.5, 4.9, 1.5, 'I. versicolor'],
    [6.1, 2.8, 4.7, 1.2, 'I. versicolor'],
    [6.4, 2.9, 4.3, 1.3, 'I. versicolor'],
    [6.6, 3.0, 4.4, 1.4, 'I. versicolor'],
    [6.8, 2.8, 4.8, 1.4, 'I. versicolor'],
    [6.7, 3.0, 5.0, 1.7, 'I. versicolor'],
    [6.0, 2.9, 4.5, 1.5, 'I. versicolor'],
    [5.7, 2.6, 3.5, 1.0, 'I. versicolor'],
    [5.5, 2.4, 3.8, 1.1, 'I. versicolor'],
    [5.5, 2.4, 3.7, 1.0, 'I. versicolor'],
    [5.8, 2.7, 3.9, 1.2, 'I. versicolor'],
    [6.0, 2.7, 5.1, 1.6, 'I. versicolor'],
    [5.4, 3.0, 4.5, 1.5, 'I. versicolor'],
    [6.0, 3.4, 4.5, 1.6, 'I. versicolor'],
    [6.7, 3.1, 4.7, 1.5, 'I. versicolor'],
    [6.3, 2.3, 4.4, 1.3, 'I. versicolor'],
    [5.6, 3.0, 4.1, 1.3, 'I. versicolor'],
    [5.5, 2.5, 4.0, 1.3, 'I. versicolor'],
    [5.5, 2.6, 4.4, 1.2, 'I. versicolor'],
    [6.1, 3.0, 4.6, 1.4, 'I. versicolor'],
    [5.8, 2.6, 4.0, 1.2, 'I. versicolor'],
    [5.0, 2.3, 3.3, 1.0, 'I. versicolor'],
    [5.6, 2.7, 4.2, 1.3, 'I. versicolor'],
    [5.7, 3.0, 4.2, 1.2, 'I. versicolor'],
    [5.7, 2.9, 4.2, 1.3, 'I. versicolor'],
    [6.2, 2.9, 4.3, 1.3, 'I. versicolor'],
    [5.1, 2.5, 3.0, 1.1, 'I. versicolor'],
    [5.7, 2.8, 4.1, 1.3, 'I. versicolor'],
    [6.4, 3.1, 5.5, 1.8, 'I. virginica'],
    [6.0, 3.0, 4.8, 1.8, 'I. virginica'],
    [6.9, 3.1, 5.4, 2.1, 'I. virginica'],
    [6.7, 3.1, 5.6, 2.4, 'I. virginica'],
    [6.9, 3.1, 5.1, 2.3, 'I. virginica'],
    [5.8, 2.7, 5.1, 1.9, 'I. virginica'],
    [6.8, 3.2, 5.9, 2.3, 'I. virginica'],
    [6.7, 3.3, 5.7, 2.5, 'I. virginica'],
    [6.7, 3.0, 5.2, 2.3, 'I. virginica'],
    [6.3, 2.5, 5.0, 1.9, 'I. virginica'],
    [6.5, 3.0, 5.2, 2.0, 'I. virginica'],
    [6.2, 3.4, 5.4, 2.3, 'I. virginica'],
    [4.7, 3.2, 1.6, 0.2, 'I. setosa'],
    [4.8, 3.1, 1.6, 0.2, 'I. setosa'],
    [5.4, 3.4, 1.5, 0.4, 'I. setosa'],
    [5.2, 4.1, 1.5, 0.1, 'I. setosa'],
    [5.5, 4.2, 1.4, 0.2, 'I. setosa'],
    [4.9, 3.1, 1.5, 0.2, 'I. setosa'],
    [5.0, 3.2, 1.2, 0.2, 'I. setosa'],
    [5.5, 3.5, 1.3, 0.2, 'I. setosa'],
    [4.9, 3.6, 1.4, 0.1, 'I. setosa'],
    [4.4, 3.0, 1.3, 0.2, 'I. setosa'],
    [5.1, 3.4, 1.5, 0.2, 'I. setosa'],
    [5.0, 3.5, 1.3, 0.3, 'I. setosa'],
    [4.5, 2.3, 1.3, 0.3, 'I. setosa'],
    [4.4, 3.2, 1.3, 0.2, 'I. setosa'],
    [5.0, 3.5, 1.6, 0.6, 'I. setosa'],
    [5.1, 3.8, 1.9, 0.4, 'I. setosa'],
    [4.8, 3.0, 1.4, 0.3, 'I. setosa'],
    [5.1, 3.8, 1.6, 0.2, 'I. setosa'],
    [5.9, 3.0, 5.1, 1.8, 'I. virginica']
]


class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None,l=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
        self.l = l


def sporedi_broj(row, column, value):
    return row[column] >= value


def sporedi_string(row, column, value):
    return row[column] == value


# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    # Make a function that tells us if a row is in
    # the first group (true) or the second group (false)
    split_function = None
    if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
        # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
        split_function = sporedi_broj
    else:
        # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
        split_function = sporedi_string

    # Divide the rows into two sets and return them
    set_false = []
    set_true = []
    for row in rows:
        if split_function(row, column, value):
            set_true.append(row)
        else:
            set_false.append(row)
    set1 = [row for row in rows if
            split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
    set2 = [row for row in rows if
            not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
    # return (set1, set2)
    return (set_true, set_false)


#st, sf = divideset(my_data, 3, 20)
#print(sf)
#print(st)


# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    results = {}
    for row in rows:
        # The result is the last column
        r = row[-1]
        results.setdefault(r, 0)
        results[r] += 1

    return results


#print(uniquecounts(my_data))
#print(uniquecounts(st))
#print(uniquecounts(sf))


# Probability that a randomly placed item will
# be in the wrong category

def log2(x):
    from math import log
    l2 = log(x) / log(2)
    return l2


# Entropy is the sum of p(x)log(p(x)) across all
# the different possible results
def entropy(rows):
    results = uniquecounts(rows)
    # Now calculate the entropy
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(rows)
        ent = ent - p * log2(p)
    return ent


#print(entropy(my_data), entropy(st), entropy(sf))


def buildtree(rows,l=-1, scoref=entropy):
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_column = -1
    best_value = None
    best_subsetf = None
    best_subsett = None

    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Generate the list of different values in
        # this column
        column_values = set()
        for row in rows:
            column_values.add(row[col])
        # Now try dividing the rows up for each value
        # in this column
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)

            # Information gain
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_column = col
                best_value = value
                best_subsett = set1
                best_subsetf = set2
                # best_criteria = (col, value)
                # best_sets = (set1, set2)

    # Create the subbranches
    if best_gain > 0:
        l = l + 1
        trueBranch = buildtree(best_subsett,l, scoref)
        falseBranch = buildtree(best_subsetf,l, scoref)
        return decisionnode(col=best_column, value=best_value,
                            tb=trueBranch, fb=falseBranch,l = l)
    else:
        return decisionnode(results=uniquecounts(rows))


def printtree(tree, indent=''):
    # Is this a leaf node?
    if tree.results != None:
        print str(sorted(tree.results.items()))
    else:
        # Print the criteria
        print( str(tree.col) + ':' + str(tree.value) + '? '+'Level='+str(tree.l))
        # Print the branches
        print(indent + 'T->'),
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->'),
        printtree(tree.fb, indent + '  ')


def classify(observation, tree):
    if tree.results != None:

         maxi=0
        # print tree.results
        #for k in tree.results:
        #    if tree.results[k]>=maxi:
        #        maxi=tree.results[k]
        #for k in tree.results:
        #    if tree.results[k] == maxi:
        #        lista.append(k)
        #lista.sort()
        #return lista[0]
        return tree.results
    else:
        vrednost = observation[tree.col]
        branch = None

        if isinstance(vrednost, int) or isinstance(vrednost, float):
            if vrednost >= tree.value:
                branch = tree.tb
            else:
                branch = tree.fb
        else:
            if vrednost == tree.value:
                branch = tree.tb
            else:
                branch = tree.fb

        return classify(observation, branch)


if __name__ == '__main__':

    arg1 = 1
    arg2 = 2.2
    arg3 = 4.0
    arg4 = 1.1
    cl = 'I. virginica'

    tmp = [arg1,arg2,arg3,arg4,cl]

    p1 = []
    p2  = []
    leng = len(trainingData)

    for i in range(0,leng/2):
        p1.append(trainingData[i])

    for i in range(leng/2,len(trainingData)):
        p2.append(trainingData[i])

    d1 = buildtree(p1)
    d2 = buildtree(p2)

    print 'DRVO 1\n',printtree(d1)
    print 'DRVO 2\n', printtree(d2)

    k1 = classify(tmp,d1)
    k2 = classify(tmp,d2)

    print k1
    print k2

    if k1.keys()[0] == k2.keys()[0]:
        print k1.keys()[0]
    if(k1.values()[0]>k2.values()[0]):
        print k1.keys()[0]
        print 'KONTRADIKCIJA'
    if(k2.values()[0]>k1.values()[0]):
        print k2.keys()[0]
        print 'KONTRADIKCIJA'
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
### Klasifikacija ###
"""Задача 1 Problem 1 (2 / 3)
Дадено е тренинг множество од неколку документи. Притоа се знае секој документ од која класа е
(science или sport). Mножеството е претставено како листи од торки, така што во секоја торка
прв елемент е текстот на документот како стринг, а втор елемент е класата како стринг.
Да се истренира модел врз основа на тренинг множеството и потоа за секој документ
прочитан од стандарден влез да се испечати неговата класа.
"""
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import math

train_data = [
    ("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
     "science"),
    ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
     "science"),
    ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
     "science"),
    ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
     "science"),
    ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
     "science"),
    ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
     "science"),
    ("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
     "science"),
    ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
     , "sport"),
    ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
     , "sport"),
    ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
     , "sport"),
    ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
     , "sport")
    , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
       , "sport"),
    ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
     , "sport"),
    ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
     , "sport"),
    ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
     , "sport"),
]


def getwords(doc):
    splitter = re.compile('\\W*')
    words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]

    return dict([(word, 1) for word in words])


class documentClassifier:
    def __init__(self, getfeatures, filename=None):
        self.featureCountsPerCategory = {}
        self.categoryCounts = {}
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
            currentFeature]:
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        return 0

    def getTotal(self):
        return sum(self.categoryCounts.values())

    def categories(self):
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        features = self.getfeatures(item)
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        if self.getCategoryCount(currentCategory) == 0: return 0
        return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
            currentCategory)

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        basicprob = prf(currentFeature, currentCategory)
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
                      self.categories()])
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


class naivebayes(documentClassifier):
    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        if currentCategory not in self.thresholds: return 1.0
        return self.thresholds[currentCategory]

    def calculateDocumentProbabilityInClass(self, item, currentCategory):
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory, self.getFeaturePerCategoryProbability)

        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        catprob = self.getCategoryCount(currentCategory) / self.getTotal()
        calculateDocumentProbabilityInClass = self.calculateDocumentProbabilityInClass(item, currentCategory)

        return calculateDocumentProbabilityInClass * catprob / (1.0 / self.getTotal())

    def classifyDocument(self, item, default=None):
        probs = {}
        max = 0.0

        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat

        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getThreshold(best) > probs[best]: return default
        return best


def trainClassifier(cl, data):
    cl.train("""So far this season Chelsea have looked the class of the league, but that does not faze Rooney.""",
             "science")
    cl.train("""Particularly for the strikers whose hype and profiles eclipse their prowess in front of goal. ""","sport")
    cl.train("""Armed with a bachelors degree in botany and a masters in microbiology, Hoffman moved to Charlotte where she became director of the Charlotte Nature Museum. ""","science")
    cl.train("""Chicago Bears quarterback Jay Cutler sits down for his usual press conference, but doesn't stay long as the media room has yet to be filled with most of the local media members.""","sport")
    cl.train("""When astronauts return from the International Space Station, their capsule hits the atmosphere at a speed of more than 17,000 miles per hour.""","science")


if __name__ == "__main__":
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    recenica = input()
    # klasa = 'bad'
    # print klasa
    print cl.classifyDocument(recenica)
-----------------------------------------------------------------------------------------------------
Klasifikacija lab2

#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import math

train_data = [
    ("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
     "science"),
    ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
     "science"),
    ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
     "science"),
    ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
     "science"),
    ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
     "science"),
    ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
     "science"),
    ("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
     "science"),
    ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
     , "sport"),
    ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
     , "sport"),
    ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
     , "sport"),
    ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
     , "sport")
    , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
       , "sport"),
    ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
     , "sport"),
    ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
     , "sport"),
    ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
     , "sport"),
]


def getwords(doc):
    splitter = re.compile('\\W*')
    words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]

    return dict([(word, 1) for word in words])


class documentClassifier:
    def __init__(self, getfeatures, filename=None):
        self.featureCountsPerCategory = {}
        self.categoryCounts = {}
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
            currentFeature]:
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        return 0

    def getTotal(self):
        return sum(self.categoryCounts.values())

    def categories(self):
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        features = self.getfeatures(item)
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        if self.getCategoryCount(currentCategory) == 0: return 0
        return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
            currentCategory)

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        basicprob = prf(currentFeature, currentCategory)
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
                      self.categories()])
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


class naivebayes(documentClassifier):
    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        if currentCategory not in self.thresholds: return 1.0
        return self.thresholds[currentCategory]

    def calculateDocumentProbabilityInClass(self, item, currentCategory):
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory, self.getFeaturePerCategoryProbability)

        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        catprob = self.getCategoryCount(currentCategory) / self.getTotal()
        calculateDocumentProbabilityInClass = self.calculateDocumentProbabilityInClass(item, currentCategory)

        return calculateDocumentProbabilityInClass * catprob / (1.0 / self.getTotal())

    def classifyDocument(self, item, default=None):
        probs = {}
        max = 0.0

        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat

        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getThreshold(best) > probs[best]: return default
        return best


def trainClassifier(cl, data):
    #cl.train("""So far this season Chelsea have looked the class of the league, but that does not faze Rooney.""","science")
    #cl.train("""Particularly for the strikers whose hype and profiles eclipse their prowess in front of goal. ""","sport")
    #cl.train("""Armed with a bachelors degree in botany and a masters in microbiology, Hoffman moved to Charlotte where she became director of the Charlotte Nature Museum. ""","science")
    #cl.train("""Chicago Bears quarterback Jay Cutler sits down for his usual press conference, but doesn't stay long as the media room has yet to be filled with most of the local media members.""","sport")
    #cl.train("""When astronauts return from the International Space Station, their capsule hits the atmosphere at a speed of more than 17,000 miles per hour.""","science")

    for el in data:
        cl.train(el[0],el[1])


if __name__ == "__main__":
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    recenica = input()
    klasa = cl.classifyDocument(recenica)
    verojatnost=cl.getCategoryProbabilityForDocument(recenica,klasa)

    # print klasa

    print klasa, '%.8f'% verojatnost
-----------------------------------------------------------------------------------------------------
Klasifikacija 3
"""
За секоја прочитан документ од стандарден влез да се испечатат зборовите кои се употребуваат
за класификација, класата и веројатноста на зборот да е од таа класа (заокружено на 4 децимали),
како и тежинската веројатност на зборот да припаѓа на класата (заокружено на 4 децимали).
На крај да се испечати предвидената класа на документот и логаритам со основа 2 од веројатноста
со која се предвидува (заокружено на 4 децимали).
"""

#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import math

train_data = [
    ("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
     "science"),
    ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
     "science"),
    ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
     "science"),
    ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
     "science"),
    ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
     "science"),
    ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
     "science"),
    ("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
     "science"),
    ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
     , "sport"),
    ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
     , "sport"),
    ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
     , "sport"),
    ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
     , "sport")
    , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
       , "sport"),
    ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
     , "sport"),
    ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
     , "sport"),
    ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
     , "sport"),
]

def getwords(doc):
    splitter = re.compile('\\W*')
    words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]

    return dict([(word, 1) for word in words])


class documentClassifier:
    def __init__(self, getfeatures, filename=None):
        self.featureCountsPerCategory = {}
        self.categoryCounts = {}
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
            currentFeature]:
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        return 0

    def getTotal(self):
        return sum(self.categoryCounts.values())

    def categories(self):
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        features = self.getfeatures(item)
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        if self.getCategoryCount(currentCategory) == 0: return 0
        return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
            currentCategory)

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        basicprob = prf(currentFeature, currentCategory)
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
                      self.categories()])
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


class naivebayes(documentClassifier):
    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        if currentCategory not in self.thresholds: return 1.0
        return self.thresholds[currentCategory]

    def calculateDocumentProbabilityInClass(self, item, currentCategory):
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory, self.getFeaturePerCategoryProbability)

        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        catprob = self.getCategoryCount(currentCategory) / self.getTotal()
        calculateDocumentProbabilityInClass = self.calculateDocumentProbabilityInClass(item, currentCategory)

        return calculateDocumentProbabilityInClass * catprob / (1.0 / self.getTotal())

    def classifyDocument(self, item, default=None):
        probs = {}
        max = 0.0

        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat

        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getThreshold(best) > probs[best]: return default
        return best


def trainClassifier(cl, data):
    for el in data:
        cl.train(el[0],el[1])


if __name__ == "__main__":
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    recenica = input()
    cl.setThreshold('science',1)
    klasa=cl.classifyDocument(recenica)
    pom=cl.getCategoryProbabilityForDocument(recenica,klasa)
    verojatnost=0
    verojatnost=round(math.log(pom)/math.log(2),4)


    zborovi =getwords(recenica)
    kategorii = cl.categories()
    for zbor in zborovi:
        for kategorija in kategorii:
            verojatnostNaZbor = round(cl.getFeaturePerCategoryProbability(zbor,kategorija),4)
            verojatnostNaZborTezinska = round(cl.weightedprob(zbor,kategorija,cl.getFeaturePerCategoryProbability),4)
            print zbor, kategorija, verojatnostNaZbor, verojatnostNaZborTezinska
    #klasa = 'bad'
    #verojatnost = 0
    print klasa, verojatnost
-----------------------------------------------------------------------------------------------------
Klasifikacija ispit januari-courses

Заради потребата на софистицирана класификација на документи, веќе е имплементирана и достапна во почетниот
код фунцијата getwords_with_ignore која ги дава уникатните зборовите од еден документ така што зборовите
кои се веќе во интерната променлива words_to_ingore се игнорираат. Значи секој збор во words_to_ingore не
фигурира во речникот со уникатни зборови кој се добива како резултат на getwords_with_ignore.

Множеството на податоци train_data е предефинирано. Притоа се знае секој документ од која класа е
(science или sport). Mножеството е претставено како листи од торки, така што во секоја торка прв елемент е
текстот на документот како стринг, а втор елемент е класата како стринг. Да се истренира класификатор со
користење на стандардната getwords (од аудиториските вежби) врз основа на тренинг множеството. Исто така
да се направат потребните промени за да се истренира и втор класификатор кој ќе го употребува истото
тренинг множество, но притоа ќе ја употребува новата функција која е веќе имплементирана
getwords_with_ignore.

Потоа за секој документ прочитан од стандарден влез да се испечатат 2 реда. Првиот ред ја содржи
предвидената класа со стандардниот класификатор и логаритам со основа 2 од веројатноста со која се
предвидува (заокружено на 4 децимали), а вториот ред предвидената класа со помош на вториот класификатор и
логаритам со основа 2 од веројатноста со која се предвидува (заокружено на 4 децимали). Да се испечати
колку пати втората веројатност е поголема од првата заокружено на 4 децимали. Ако предвидувањето на двата
класификатори е различно да се испчати уште еден ред со зборот “kontradikcija”.

    Vlez:
        """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded
        that their one male and two female northern white rhinos will not reproduce naturally.
        The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in
        hopes that the natural environment could be easier for them to breed there than in captivity."""
    Izlez:
science -51.5029
science -46.0544
43.6706

Vlez 2:

"""HONOLULU (AP) — Lava from a volcano on Hawaii's Big Island is on course to reach a shopping center
with a gas station and a supermarket in seven to 10 days, officials said Monday."""
    Izlez 2:
sport -21.1937
science -17.6781
11.4370
kontradikcija

====*===*====*============*****============***=============**=============***
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import math


train_data=[
("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
"science"),
("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
"science"),
("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
"science"),
("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
"science"),
("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
"science"),
("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
"science"),
("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
"science"),
("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
,"sport"),
("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
,"sport"),
("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
,"sport"),
("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
,"sport")
,("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
,"sport"),
("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
,"sport"),
("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
,"sport"),
("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
,"sport"),
]


def getwords(doc, words_to_ignore=None):
    splitter = re.compile('\\W*')

    words = [word.lower() for word in splitter.split(doc) if
             len(word) > 2 and len(word) < 20 and (words_to_ignore is None or word.lower() not in words_to_ignore)]

    return dict([(word, 1) for word in words])

def getwords_with_ignore(doc,words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']):
    return getwords(doc,words_to_ignore)

def getwords2(doc,words_to_ignore=None):
    splitter=re.compile('\\W*')

    words=[word.lower() for word in splitter.split(doc) if len(word)>2 and len(word)<20 and (words_to_ignore is None or word.lower() not in words_to_ignore)]
    return dict([(word,1) for word in words])


words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']

def trainClassifier(cl, data):
    for i in data:
        cl.train(i[0],i[1])
    # cl.train(data[0][0],data[0][1])


class documentClassifier:
    def __init__(self, getfeatures, filename=None):
        # Broj na parovi atribut/kategorija (feature/category)
        self.featureCountsPerCategory = {}
        # Broj na dokumenti vo sekoja kategorija
        self.categoryCounts = {}
        # funkcija za dobivanje na atributite (zborovite) vo dokumentot
        self.getfeatures = getfeatures

    # Zgolemuvanje na brojot na parovi atribut/kategorija
    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    # Zgolemuvanje na brojot na predmeti(dokumenti) vo kategorija
    def incrementCategoryCounts(self, cat):
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    # Dobivanje na brojot kolku pati
    # odreden atribut se ima pojaveno vo odredena kategorija
    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
            currentFeature]:
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    # Dobivanje na brojot na predmeti(dokumenti) vo kategorija
    def getCategoryCount(self, currentCategory):
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        return 0

    # Dobivanje na vkupniot broj na predmeti
    def getTotalCount(self):
        return sum(self.categoryCounts.values())

    # Dobivanje na lista na site kategorii
    def categories(self):
        return self.categoryCounts.keys()

    # Treniranje na klasifikatorot
    # Noviot predmet (dokument) item pripagja na kategorijata cat
    def train(self, item, currentCategory):
        # Se zemaat atributite (zborovite) vo predmetot(dokumentot)
        features = self.getfeatures(item)
        # Se zgolemuva brojot na sekoj atribut vo ovaa kategorija
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)

        # Se zgolemuva brojot na predmeti (dokumenti) vo ovaa kategorija
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        if self.getCategoryCount(currentCategory) == 0: return 0
        # Verojatnosta e vkupniot broj na pati koga
        # ovoj atribut f (zbor) se pojavil vo ovaa
        # kategorija podeleno so vkupniot broj na
        # predmeti (dokumenti) vo ovaa kategorija
        return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
            currentCategory)

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        # Presmetaj ja osnovnata verojatnost
        basicprob = prf(currentFeature, currentCategory)
        # Izbroj kolku pati se ima pojaveno ovoj atribut (zbor)
        # vo site kategorii
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
                      self.categories()])
        # Presmetaj ja tezinski usrednetata verojatnost
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


class naivebayes(documentClassifier):
    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        if currentCategory not in self.thresholds: return 1.0
        return self.thresholds[currentCategory]

    # ja vrakja verojatnosta na dokumentot da e od klasata cat (cat e odnapred poznata)
    def caclulateDocumentProbabilityInClass(self, item, currentCategory):
        # zemi gi zborovite vo dokumentot item
        features = self.getfeatures(item)
        # pomnozi gi verojatnostite na site zborovi
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)

        return p

    # Ja vrakja verojatnosta na klasata ako e poznat dokumentot
    def getCategoryProbabilityForDocument(self, item, currentCategory):
        catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
        caclulateDocumentProbabilityInClass = self.caclulateDocumentProbabilityInClass(item, currentCategory)
        # Bayes Theorem
        return caclulateDocumentProbabilityInClass * catprob / (1.0 / self.getTotalCount())

    # klasificiranje na dokument
    def classifyDocument(self, item, default=None):
        probs = {}
        # najdi ja kategorijata (klasata)
        # so najgolema verojatnost
        max = 0.0
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat

        # proveri dali verojatnosta e pogolema od
        # threshold*next best (sledna najdobra)
        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getThreshold(best) > probs[best]: return default

        return best


if __name__ == "__main__":
    #recenica = """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded that their one male and two female northern white rhinos will not reproduce naturally. The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in hopes that the natural environment could be easier for them to breed there than in captivity."""
    recenica = input()
    cl = naivebayes(getwords)
    cl2 = naivebayes(getwords_with_ignore)
    trainClassifier(cl,train_data)
    trainClassifier(cl2,train_data)
    klasa1 = cl.classifyDocument(recenica)
    klasa2 = cl2.classifyDocument(recenica)
    verojatnost1 = cl.getCategoryProbabilityForDocument(recenica, klasa1)
    verojatnost2 = cl2.getCategoryProbabilityForDocument(recenica, klasa2)
    # print (klasa1, klasa2)
    print klasa1,"%.4f" % math.log(verojatnost1,2)
    print klasa2,"%.4f" % math.log(verojatnost2,2)
    print "%.4f" % (verojatnost2/verojatnost1)
    if klasa1 != klasa2:
        print 'kontradikcija'
------------------------------------------------------------------------------------------------------
#-*- coding: utf-8 -*-

"""
Класификација на документи (испит)
Заради потребата на софистицирана класификација на документи, фунцијата која ги дава уникатните зборовите од
еден документ getwords треба да се промени така што ќе прима втор опционален аргумент words_to_ingore. Ако не
се проследи опционалниот аргумент подразбирлива вредност е None. Кога се проследува вредност таа вредност треба
да биде листа од зборови кои функцијата ќе ги изоставува при генерирањето на излезниот речник. Значи секој збор
во words_to_ingore не треба да фигурира во речникот со уникатни зборови кој се добива како резултат на getwords.

Множеството на податоци train_data е предефинирано.
Притоа се знае секој документ од која класа е (science или sport).
Mножеството е претставено како листи од торки, така што во секоја торка прв елемент е текстот на документот како стринг,
а втор елемент е класата како стринг. Да се истренира класификатор со користење на стандардната getwords (од аудиториските вежби)
врз основа на тренинг множеството. Исто така да се направат потребните промени за да се истренира и втор
класификатор кој ќе го употребува истото тренинг множество, но притоа ќе ја употребува новата функција која е веќе
имплементирана getwords_with_ignore.

Потоа за секој документ прочитан од стандарден влез да се испечатат 2 реда.
Првиот ред ја содржи предвидената класа со стандардниот класификатор, а вториот ред предвидената класа со помош на
вториот класификатор. Ако предвидувањето на двата класификатори е различно
да се испчати уште еден ред со зборот “kontradikcija”.

"""

import re
import math

train_data=[
("""What Are We Searching for on Mars?
Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
""",
"science"),
("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
""",
"science"),
("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
"With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
""",
"science"),
("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
""",
"science"),
("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
""",
"science"),
("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
""",
"science"),
("""Envisioning a River of Air
By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
""",
"science"),
("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
"""
,"sport"),
("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
"""
,"sport"),
("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
"""
,"sport"),
("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
"""
,"sport")
,("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
"""
,"sport"),
("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
"""
,"sport"),
("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
So far, that's half true.
Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
"""
,"sport"),
("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
"""
,"sport"),
]


words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']


def getwords(doc):
    # regularen izraz koj ke go deli stringot na zborovi
    # stringot se deli na zborovi na site prazni mesta i interpunkciski znaci
    splitter = re.compile('\\W*')
    # podeli go dokumentot na zborovi
    # i konvertiraj gi vo mali bukvi
    # pa potoa stavi gi vo lista
    # ako nivnata dolzina e >2 i <20
    words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]
    # se vrakja recnik cii klucevi se zborovite koi
    # se vo dokumentot, a eden zbor duri i da se
    # srekjava poveke pati vo recnikot ke bide samo ednas
    # vrednosta (1) vo recnikot ne e potrebna
    return dict([(word, 1) for word in words])


def getwords2(doc):
    splitter = re.compile('\\W*')
    words = []
    for word in splitter.split(doc):
        if len(word) > 2 and len(word) < 20 and word.lower() not in words_to_ignore:
            words.append(word.lower())

    return dict([(word, 1) for word in words])


class documentClassifier:
    def __init__(self, getfeatures, filename=None):
        # Broj na parovi atribut/kategorija (feature/category)
        self.featureCountsPerCategory = {}
        # Broj na dokumenti vo sekoja kategorija
        self.categoryCounts = {}
        # funkcija za dobivanje na atributite (zborovite) vo dokumentot
        self.getfeatures = getfeatures

    # Zgolemuvanje na brojot na parovi atribut/kategorija
    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    # Zgolemuvanje na brojot na predmeti(dokumenti) vo kategorija
    def incrementCategoryCounts(self, cat):
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    # Dobivanje na brojot kolku pati
    # odreden atribut se ima pojaveno vo odredena kategorija
    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
            currentFeature]:
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    # Dobivanje na brojot na predmeti(dokumenti) vo kategorija
    def getCategoryCount(self, currentCategory):
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        return 0

    # Dobivanje na vkupniot broj na predmeti
    def getTotalCount(self):
        return sum(self.categoryCounts.values())

    # Dobivanje na lista na site kategorii
    def categories(self):
        return self.categoryCounts.keys()

    # Treniranje na klasifikatorot
    # Noviot predmet (dokument) item pripagja na kategorijata cat
    def train(self, item, currentCategory):
        # Se zemaat atributite (zborovite) vo predmetot(dokumentot)
        features = self.getfeatures(item)
        # Se zgolemuva brojot na sekoj atribut vo ovaa kategorija
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)

        # Se zgolemuva brojot na predmeti (dokumenti) vo ovaa kategorija
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        if self.getCategoryCount(currentCategory) == 0: return 0
        # Verojatnosta e vkupniot broj na pati koga
        # ovoj atribut f (zbor) se pojavil vo ovaa
        # kategorija podeleno so vkupniot broj na
        # predmeti (dokumenti) vo ovaa kategorija
        return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
            currentCategory)

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        # Presmetaj ja osnovnata verojatnost
        basicprob = prf(currentFeature, currentCategory)
        # Izbroj kolku pati se ima pojaveno ovoj atribut (zbor)
        # vo site kategorii
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
                      self.categories()])
        # Presmetaj ja tezinski usrednetata verojatnost
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


class naivebayes(documentClassifier):

    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        if currentCategory not in self.thresholds: return 1.0
        return self.thresholds[currentCategory]

    # ja vrakja verojatnosta na dokumentot da e od klasata cat (cat e odnapred poznata)
    def caclulateDocumentProbabilityInClass(self, item, currentCategory):
        # zemi gi zborovite vo dokumentot item
        features = self.getfeatures(item)
        # pomnozi gi verojatnostite na site zborovi
        p = 1
        for currentFeature in features: p *= self.weightedprob(currentFeature, currentCategory,
                                                               self.getFeaturePerCategoryProbability)

        return p

    # Ja vrakja verojatnosta na klasata ako e poznat dokumentot
    def getCategoryProbabilityForDocument(self, item, currentCategory):
        catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
        caclulateDocumentProbabilityInClass = self.caclulateDocumentProbabilityInClass(item, currentCategory)
        # Bayes Theorem
        return caclulateDocumentProbabilityInClass * catprob / (1.0 / self.getTotalCount())

    # klasificiranje na dokument
    def classifyDocument(self, item, default=None):
        probs = {}
        # najdi ja kategorijata (klasata)
        # so najgolema verojatnost
        max = 0.0
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat

        # proveri dali verojatnosta e pogolema od
        # threshold*next best (sledna najdobra)
        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getThreshold(best) > probs[best]: return default

        return best,round(math.log(max,2),4)


if __name__ == '__main__':

    #recenica = input()
    recenica = """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded that their one male and two female northern white rhinos will not reproduce naturally. The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in hopes that the natural environment could be easier for them to breed there than in captivity."""


    cl = naivebayes(getwords)

    for i in train_data:
        cl.train(i[0],i[1])

    class1, verojatnost1 = cl.classifyDocument(recenica)


    cl1 = naivebayes(getwords2)
    for i in train_data:
        cl1.train(i[0],i[1])

    class2, verojatnost2 = cl1.classifyDocument(recenica)


    print class1, verojatnost1
    print class2, verojatnost2
    print "%.4f" % round((verojatnost2 / verojatnost1), 4)

    if class1 != class2:
        print("kontradikcija")
-----------------------------------------------------------------------------------------------------
""" Klasifikacija - Twiter

Потребно е да се направи систем кој ќе знае да класифицира твитови во однос на тонот (sentiment) на позитивен и негативен.
Дадена ви е листа train_data од торки. Прв елемент во торката е класата (positive/negative) и втор елемент е содржината на твитот.
Користејќи ги првите 200 примери, да се изгради наивен Баесов класификатор кој ќе научи да класифицира непознати твитови.

Потоа, за прочитан индекс од влезот (број од 200 до 999) да се најде твитот на соодветната позиција во train_data и истиот да се класифицира.
Во првата линија се печати бројот на позитивни и негативни примери во тренинг множеството,
а во втората линија се печати индексот на тест примерот (прочитано од влез), точната класа, предвидената класа и содржината на твитот.

"""

def getwords(doc):
    # regularen izraz koj ke go deli stringot na zborovi
    # stringot se deli na zborovi na site prazni mesta i interpunkciski znaci
    splitter = re.compile('\\W*')
    # podeli go dokumentot na zborovi
    # i konvertiraj gi vo mali bukvi
    # pa potoa stavi gi vo lista
    # ako nivnata dolzina e >2 i <20
    words = set()
    for word in splitter.split(doc):
        if 2 < len(word) < 20:
            words.add(word.lower())
    return words
    # words = [word.lower() for word in splitter.split(doc) if len(word) > 2 and len(word) < 20]
    # # se vrakja recnik cii klucevi se zborovite koi
    # # se vo dokumentot, a eden zbor duri i da se
    # # srekjava poveke pati vo recnikot ke bide samo ednas
    # # vrednosta (1) vo recnikot ne e potrebna
    # return dict([(word, 1) for word in words])


# {'python': {'bad': 0, 'good': 6}, 'the': {'bad': 3, 'good': 3}}


# print(w)
# exit(1)

class documentClassifier:
    def __init__(self, getfeatures, filename=None):
        # Broj na parovi atribut/kategorija (feature/category)
        self.featureCountsPerCategory = {}
        # Broj na dokumenti vo sekoja kategorija
        self.categoryCounts = {}
        # funkcija za dobivanje na atributite (zborovite) vo dokumentot
        self.getfeatures = getfeatures

    # Zgolemuvanje na brojot na parovi atribut/kategorija
    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    # Zgolemuvanje na brojot na predmeti(dokumenti) vo kategorija
    def incrementCategoryCounts(self, cat):
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    # Dobivanje na brojot kolku pati
    # odreden atribut se ima pojaveno vo odredena kategorija
    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        if currentFeature in self.featureCountsPerCategory and currentCategory in self.featureCountsPerCategory[
            currentFeature]:
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    # Dobivanje na brojot na predmeti(dokumenti) vo kategorija
    def getCategoryCount(self, currentCategory):
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        return 0

    # Dobivanje na vkupniot broj na predmeti
    def getTotalCount(self):
        return sum(self.categoryCounts.values())

    # Dobivanje na lista na site kategorii
    def categories(self):
        return self.categoryCounts.keys()

    # Treniranje na klasifikatorot
    # Noviot predmet (dokument) item pripagja na kategorijata cat
    def train(self, item, currentCategory):
        # Se zemaat atributite (zborovite) vo predmetot(dokumentot)
        features = self.getfeatures(item)
        # Se zgolemuva brojot na sekoj atribut vo ovaa kategorija
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)

        # Se zgolemuva brojot na predmeti (dokumenti) vo ovaa kategorija
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        if self.getCategoryCount(currentCategory) == 0: return 0
        # Verojatnosta e vkupniot broj na pati koga
        # ovoj atribut f (zbor) se pojavil vo ovaa
        # kategorija podeleno so vkupniot broj na
        # predmeti (dokumenti) vo ovaa kategorija
        return self.getFeatureCountsPerCategory(currentFeature, currentCategory) / self.getCategoryCount(
            currentCategory)

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        # Presmetaj ja osnovnata verojatnost
        basicprob = prf(currentFeature, currentCategory)
        # Izbroj kolku pati se ima pojaveno ovoj atribut (zbor)
        # vo site kategorii
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, currentCategory) for currentCategory in
                      self.categories()])
        # Presmetaj ja tezinski usrednetata verojatnost
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


#dc = documentClassifier(getwords)
#dc.train("sistemi na znaenje e dosaden predmet", "tracevi")
#dc.train("asistentot po sistemi na znaenje e isto taka dosaden", "tracevi")
#dc.train("vezbite po sistemi na znaenje moze da se podobrat na sledniov nacin...", "kritiki")
#dc.train("predvanjata po sistemi na znaenje ne moze da se podobrat bidejki se najdobri...", "kritiki")


class naivebayes(documentClassifier):
    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        if currentCategory not in self.thresholds: return 1.0
        return self.thresholds[currentCategory]

    # ja vrakja verojatnosta na dokumentot da e od klasata cat (cat e odnapred poznata)
    def caclulateDocumentProbabilityInClass(self, item, currentCategory):
        # zemi gi zborovite vo dokumentot item
        features = self.getfeatures(item)
        # pomnozi gi verojatnostite na site zborovi
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)

        return p

    # Ja vrakja verojatnosta na klasata ako e poznat dokumentot
    def getCategoryProbabilityForDocument(self, item, currentCategory):
        catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
        caclulateDocumentProbabilityInClass = self.caclulateDocumentProbabilityInClass(item, currentCategory)
        # Bayes Theorem
        return caclulateDocumentProbabilityInClass * catprob / (1.0 / self.getTotalCount())

    # klasificiranje na dokument
    def classifyDocument(self, item, default=None):
        probs = {}
        # najdi ja kategorijata (klasata)
        # so najgolema verojatnost
        max = 0.0
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat

        # proveri dali verojatnosta e pogolema od
        # threshold*next best (sledna najdobra)
        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getThreshold(best) > probs[best]: return default

        return best


if __name__ == '__main__':

    cl = naivebayes(getwords)

    #index = input()
    index=250

    pozitive = 0
    negative = 0

    for i in range(200):
        #print train_data[i][1],train_data[i][0], i
        if train_data[i][0] == 'negative':
            negative+=1
        else:
            pozitive+=1

        cl.train(train_data[i][1], train_data[i][0])
        #break

    twit = train_data[index]

    klasifikacija = cl.classifyDocument(twit[1])

    print "Pozitivni: {}, Negativni: {}".format(pozitive, negative)
    print "Index: {}, Tocna Klasa: {}, Predvidena Klasa: {}, Twit: {}".format(index, tweet[0], klasifikacija, twit[1])

------------------------------------------------------------------------------------------------------
#Пресметка на статистики и еден подвижен прозорец Problem 1 (2 / 2)
За даденото податочно множество во листата X_data со предефинирана должина N од секоја
колона треба да се пресметаат следниве статистики: минимум, максимум, средна вредност,
стандардна девијација. Притоа од стандарден влез се чита поместувањето D и должнината на
подвижниот прозорец L. На излез треба да се испечати испроцесираното множество, така што
во секоја линија ќе се испечати бројот на редицата со која завршува подвижниот прозорец
и листа од вредности заокружени на 2 децимали со бараните статистики од секоја колона
(прво 4 статистики за првата колона, па 4 статистики за втората колона итн...)
Вкупниот број на редови е floor((N-L)/D + 1).

НАПОМЕНА: Заради ефикасност на решението треба да ги пресметувате само статистиките кои се бараат.


from __future__ import print_function
import numpy as np
import scipy.stats as sp

X_data = [[119, 57, 51, 3, 1, 141],
          [105, 54, 62, 3, 1, 133],
.........................]]

.........

def percentiles_all(x, iqr=True, amplitude=True, percentiles_list=[5, 10, 25, 40, 50, 60, 75, 90, 95]):
    names = ['p_' + str(p) for p in percentiles_list]
    if iqr and 25 in percentiles_list and 75 in percentiles_list:
        names.append('iqr')
    if amplitude and 1 in percentiles_list and 99 in percentiles_list:
        names.append('perc_amp')
    if len(x) == 0:
        values = [0 for i in range(len(names))]
        return values, names
    if len(percentiles_list) > 0 and all([0 < q < 100 for q in percentiles_list]):
        values = list(np.percentile(x, percentiles_list))
    else:
        values = []
    if iqr:
        q1 = percentiles_list.index(25)
        q3 = percentiles_list.index(75)
        values.append(values[q3] - values[q1])
    if amplitude and 1 in percentiles_list and 99 in percentiles_list:
        q1 = percentiles_list.index(1)
        q3 = percentiles_list.index(99)
        values.append(values[q3] - values[q1])
        return values, names


def stats_calculate_all(X_data):
    stats_all_names = stats_all_names = ['len', 'min', 'max', 'range', 'mean', 'hmean', 'gmean', 'var', 'std', 'skew',
                                         'kurtosis', 'median', 'mode', 'energy', 'energy_sample', 'snr']

    xnp = np.array(X_data)
    n = len(X_data)
    if n == 0:
        values = [0 for i in range(len(stats_all_names))]
        return values, stats_all_names
    values = []
    vmin = float(min(xnp))
    if vmin < 1:
        offset = 1 + abs(vmin)
    else:
        offset = 0
    vmax = float(max(xnp))
    #vrange = vmax - vmin
    vmean = float(np.mean(xnp))
    vstd = float(np.std(xnp))
    #venergy = float(sum(np.array(xnp) ** 2))
    #venergy_sample = venergy / n
    #snr = 0.0
    #if vstd != 0: snr = vmean / vstd
    values.append(vmin)
    values.append(vmax)
    #values.append(vrange)
    values.append(vmean)
    # values.append(float(sp.hmean(xnp + offset)))
    # values.append(float(sp.gmean(xnp + offset)))
    # values.append(vstd ** 2)
    values.append(vstd)
    # values.append(sp.skew(xnp))
    # values.append(sp.kurtosis(xnp))
    #values.append(np.median(xnp))
    # vmode = sp.mode(xnp)
    # vmode = float(vmode[0][0])
    # values.append(vmode)
    # values.append(venergy)
    # values.append(venergy_sample)
    #values.append(snr)
    return values, stats_all_names


if __name__ == "__main__":
     x = np.array(X_data)

    # vasiot kod tuka
     shift=input()
     w_long=input()
     #print(x.shape)
     #print(x[:10, 0].shape)
     #print(x[:10, 0])
     #print(sorted(x[:10, 0]))
     #print(stats_calculate_all(x[:10, 0]))
     #print(percentiles_all(x[:10, 0], iqr=False, amplitude=False, percentiles_list=[10, 20, 50, 70, 90]))
     #shift = 50
     #w_long=100
     #w_short = 20
     for i in range(w_long, len(X_data), shift):
        row=[]
        for j in range(x.shape[1]):
            #x.shape[1]
            x_winow_long = x[i - w_long:i,j]
            #x_winow_short = x[i - w_short:i, j]
            #p1 = percentiles_all(x_winow_long)
            #p2 = percentiles_all(x_winow_short)
            s1, stat_names = stats_calculate_all(x_winow_long)
            #s2, _ = stats_calculate_all(x_winow_short)
            #s2=round((s1),2)
            #long_MVA = s1[4]
            row+=s1
            #print(s1,row)
            #short_MVA = s2[4]
        row3=[round(r,2) for r in row]
        print(i,row3)

---------------------------------------------------------------------------------------------------
#Пресметка на статистики и два подвижни прозорци Problem 2 (1 / 2)
За даденото податочно множество во листата X_data со предефинирана должина N од секоја колона т
реба да се пресмета средната вредност. Притоа од стандарден влез се чита должнината на долгиот
подвижнен прозорец L1, и краткиот подвижнен прозорец L2. Поместувањето е фиксно и е 1 ред.
На излез треба да се испечати испроцесираното множество, така што во секоја линија ќе се
испечати бројот на редицата со која завршува подвижниот прозорец и листа од вредности за
секоја колона заокружени на 2 децимали: тековната вредност, средната вредност во долгиот
подвижен прозорец, средна вредност во краткиот подвижен прозорец, и разлика од двете средни вредности.
За M колони во секој ред треба да се испечатат листа со M x 4 елементи
(тековна вредност, средна вредност долг подвижен прозорец, средна вредност краток
подвижен прозорец, разлика од средните вредности). Заокружувањето се прави при печатењето на
вредностите, но тие се чуваат без заокружување.

НАПОМЕНА: Заради ефикасност на решението треба да ги пресметувате само статистиките кои се бараат.

def stats_calculate_all(x):
    """
    Calculates stats from the provided list xnp, based on the stats config object.
    :param stat_config: Configuration for which stats to be computed.
    :param x: The time series list.
    :param offset: Offset of the values so some stats can be calculated
    :return:
    """
    stats_all_names = ['len', 'min', 'max', 'range', 'mean', 'hmean', 'gmean', 'var', 'std', 'skew', 'kurtosis',
                       'median', 'mode', 'energy', 'energy_sample', 'snr']

    xnp = np.array(x)
    n = len(x)
    if n == 0:
        values = [0 for i in range(len(stats_all_names))]
        return values, stats_all_names
    values = [n]
    vmin = float(min(xnp))
    if vmin < 1:
        offset = 1 + abs(vmin)
    else:
        offset = 0
    vmax = float(max(xnp))
    vrange = vmax - vmin
    vmean = float(np.mean(xnp))
    vstd = float(np.std(xnp))

    values.append(vmin)
    values.append(vmax)
    values.append(vrange)
    values.append(vmean)
    values.append(vstd)

    return values, stats_all_names

if __name__ == "__main__":
    x = np.array(X_data)
    w_long = input()
    w_short = input()
    shift = 1
    arr_len = len(x[0, :])

    for i in range(max(w_short, w_long), len(X_data), shift):
        result = []
        for j in range(0, arr_len):

            x_winow_long = x[i - w_long:i, j]
            x_winow_short = x[i - w_short:i, j]
            s1, stat_names = stats_calculate_all(x_winow_long)
            s2, _ = stats_calculate_all(x_winow_short)
            long_MVA = s1[4]  # Moving Average long window
            short_MVA = s2[4]  # Moving Average short window
            # broj na primerok, broj na kolona, vrednost na primerokot i kolonata
            # razlika od tekovna vrednost i prosek od dolg prozorec
            # razlika od tekovna vrednost i prosek od kratok prozorec
            # razlika od pomegu prosek vo dolg i kratok prozorec
            result.extend([round(x[i, j], 2), round(long_MVA, 2), round(short_MVA, 2), round(long_MVA - short_MVA, 2)])
        print(i,result)
-------------------------------------------------------------------------------------------------------
# Vremenski prozorci - ispit januari 2017 - courses

За даденото податочно множество во листата X_data со предефинирана должина N од секоја колона треба да се пресмета средната вредност, медијаната и стандардната девијација.
Притоа од стандарден влез се чита должнината на долгиот подвижнен прозорец L1, и краткиот подвижнен прозорец L2.
Поместувањето е фиксно и е 5 реда. На излез треба да се испечати процесираното множество, така што во секоја линија
ќе се испечати бројот на редицата со која завршува подвижниот прозорец и листа од вредности за секоја колона заокружени
на 2 децимали: тековната вредност, средната вредност, медијаната и стандардната девијација во долгиот подвижен прозорец,
средната вредност, медијаната и стандардната девијација во краткиот подвижен прозорец. Заокружувањето се прави при печатењето на вредностите,
но тие се чуваат без заокружување. Дополнително, доколку средната вредност од долгиот е поголема од средната вредност од краткиот прозорец,
за секоја колона да се додаде вредност 1, а доколку средната вредност од краткиот е поголема од средната вредност од долгиот, да се додаде вредност -1.


y_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


def percentiles_all(x, iqr=True, amplitude=True, percentiles_list=[5, 10, 25, 40, 50, 60, 75, 90, 95]):
    names = ['p_' + str(p) for p in percentiles_list]
    if iqr and 25 in percentiles_list and 75 in percentiles_list:
        names.append('iqr')
    if amplitude and 1 in percentiles_list and 99 in percentiles_list:
        names.append('perc_amp')
    if len(x) == 0:
        values = [0 for i in range(len(names))]
        return values, names
    if len(percentiles_list) > 0 and all([0 < q < 100 for q in percentiles_list]):
        values = list(np.percentile(x, percentiles_list))
    else:
        values = []
    if iqr:
        q1 = percentiles_list.index(25)
        q3 = percentiles_list.index(75)
        values.append(values[q3] - values[q1])
    if amplitude and 1 in percentiles_list and 99 in percentiles_list:
        q1 = percentiles_list.index(1)
        q3 = percentiles_list.index(99)
        values.append(values[q3] - values[q1])
    return values, names

def stats_calculate_all(x):
    xnp = np.array(x)
    n = len(x)
    if n == 0:
        values = [0 for i in range(len(stats_all_names))]
        return values, stats_all_names
    values = []
    vmean = float(np.mean(xnp))
    vstd = float(np.std(xnp))
    if vstd != 0:
        snr = vmean / vstd
    values.append(vmean)
    values.append(np.median(xnp))
    values.append(vstd)
    return values


if __name__ == "__main__":
    x = np.array(X_data)
    shift = 5  # pomestuvanje pomegu podviznite prozorci
    w_long = input()  # dolzina (broj na otcituvanja) na dolgiot prozorec
    w_short = input()  # dolzina (broj na otcituvanja) na kratkiot prozorec
    for i in range(max(w_short, w_long), len(X_data), shift):
        lista=[]
        nova=[]
        for j in range(x.shape[1]):
            x_winow_long = x[i - w_long:i, j]
            x_winow_short = x[i - w_short:i, j]
            s1 = stats_calculate_all(x_winow_long)
            s2= stats_calculate_all(x_winow_short)
            lista.append(round((x[i][j]),2))
            for d in s1:
                lista.append(round(d,2))
            for d in s2:
                lista.append(round(d,2))
            if(s1[0]<s2[0]):
                lista.append(-1)
            else:
                lista.append(1)
        print (i,lista)

------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
Sistemi za preporaka - 1

Оцени на корисници и филмови Problem 1 (1 / 2)
За даденото множество кое е претставено како речник чиј клуч е името на
корисникот и вредност е речник чиј клуч е филмот, а вредност е оцената
која корисникот ја дал за филмот, да се инвертира така што ќе добиете
повторно речник од речници. Во новиот речник клуч е името на филмот,
а вредност е речник чиј клуч е името на корисникот, а вредност е оцената
која тој корисник ја дал за тековниот филм.

Потоа за прочитано име на филм од стандарден влез да се испечати најмалата и најголемата
оцена која е дадена за него.

Sample input
'Catch Me If You Can'
Sample output
{'Lisa Rose': 3.0, 'Jack Matthews': 4.5, 'Michael Phillips': 2.5, 'Gary Coleman': 1.5, 'Michelle Nichols': 2.5}


oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }

def invertirajOceni(oceni):
    oceniPoFilmovi={}
    for person in oceni:
        for item in oceni[person]:
            oceniPoFilmovi.setdefault(item,{})
            oceniPoFilmovi[item][person]=oceni[person][item]


    return oceniPoFilmovi

if __name__ == "__main__":
    oceniPoFilmovi=invertirajOceni(oceniPoKorisnici)

    film=input()

    print oceniPoFilmovi[film]
-----------------------------------------------------------------------------------------------------
Sistemi za preporaka - 2

Opened: 196 дена
Функции за сличност Problem 2 (1 / 2)
Да се напишат функции за пресметување на сличност базирани на Пеарсонова корелација и
Евклидово растојание кои ќе враќаат торка од сличноста и бројот на заеднички елементи.
За прочитани имиња на двајца корисници да се испечатата торките што ги враќаат двете функции.
Sample input
'Jack Matthews'
'Gene Seymour'
Sample output
(0.905, 4)
(0.667, 4)

import math

oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }

def sim_distance(oceni,person1,person2):
    # Se pravi lista na zaednicki predmeti (filmovi)
    zaednicki={}
    for item in oceni[person1].keys():
        if item in oceni[person2]:
            zaednicki[item]=1
    # ako nemaat zaednicki rejtinzi, vrati 0
    if len(zaednicki)==0: return 0
    # Soberi gi kvadratite na zaednickite razliki
    sum_of_squares=sum([pow(oceni[person1][item]-oceni[person2][item],2)
        for item in oceni[person1] if item in oceni[person2]])
    return (round(1/(1+math.sqrt(sum_of_squares)),3),len(zaednicki))

def sim_pearson(oceni,p1,p2):
    #for math import sqrt
    # Se kreira recnik vo koj ke se cuvaat predmetite (filmovi) koi se oceneti od dvajcata
    # Vo recnikot ni se vazni samo klucevite za da gi cuvame iminjata na filmovite koi se zaednicki, a vrednostite ne ni se vazni
    zaednicki={}
    for item in oceni[p1]:
        if item in oceni[p2]: zaednicki[item]=1

    # Se presmetuva brojot na predmeti oceneti od dvajcata
    n=len(zaednicki)

    # Ako nemaat zaednicki predmeti vrati korelacija 0
    if n==0: return 0

    # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1=sum([oceni[p1][it] for it in zaednicki])
    sum2=sum([oceni[p2][it] for it in zaednicki])

    # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1Sq=sum([pow(oceni[p1][it],2) for it in zaednicki])
    sum2Sq=sum([pow(oceni[p2][it],2) for it in zaednicki])

    # Soberi gi proizvodite od ocenite na dvete licnosti
    pSum=sum([oceni[p1][it]*oceni[p2][it] for it in zaednicki])

    # Presmetaj go koeficientot na korelacija
    num=pSum-(sum1*sum2/n)
    den=math.sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return (round(r,3),n)

if __name__ == "__main__":

    korisnik1=input()
    korisnik2=input()
    # korisnik1='Mick LaSalle'
    # korisnik2='Lisa Rose'
    print sim_pearson(oceniPoKorisnici, korisnik1, korisnik2)
    print sim_distance(oceniPoKorisnici, korisnik1, korisnik2)

------------------------------------------------------------------------------------------------------
Sistemi za preporaka - 3

Табела на слични корисници Problem 3 (1 / 2)
Да се напише функција која ќе генерира табела на слични корисници претставена како речник од речници
(клучеви се имињата на корисниците), така што за секој пар корисници ќе чува торка од сличност
базирана на Пеарсонова корелација, сличност базирана на Евклидово растојание, и број на заеднички
оцени (оцени дадени за исти филмови). Вредностите да бидат заокружени на 3 децимали. За прочитани
имиња на двајца корисници да се испечати торката која се чува во генерираната табела.
Sample input
'Larry'
'Gene Seymour'
Sample output
(0.327, -0.5, 3)


from math import sqrt
oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }
# Vrakja merka za slicnost bazirana na rastojanieto za person1 i person2
def sim_distance(oceni,person1,person2):
    si={}
    for item in oceni[person1]:
        if item in oceni[person2]:
            si[item]=1
    if len(si)==0:
        return 0
    sum_of_squares=sum([pow(oceni[person1][item]-oceni[person2][item],2)
                        for item in oceni[person1] if item in oceni[person2]])
    return (round(1.0/(1+sqrt(sum_of_squares)),3),len(si))
    return (0,0)
def sim_pearson(oceni,person1,person2):
    si={}
    for item in oceni[person1]:
        if item in oceni[person2]:
            si[item]=1
    n=len(si)
    if n==0: return 0
    sum1=sum([oceni[person1][it] for it in si])
    sum2=sum([oceni[person2][it] for it in si])

    sum1Sq=sum([pow(oceni[person1][it],2) for it in si])
    sum2Sq=sum([pow(oceni[person2][it],2) for it in si])
    pSum=sum([oceni[person1][it]*oceni[person2][it] for it in si])
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0:
        return 0
    r=num/den
    return (round(r,3),n)


def TabelaNaSlicniKorisnici(oceni):
    slicnosti={}
    for item in oceni:
        for key in oceni:
            if item!=oceni:
                slicnosti.setdefault(item,{})
                torka=sim_pearson(oceni,item,key)
                torka1=sim_distance(oceni,item,key)
                if torka!=0:
                    a=torka1[0]
                    b=torka[0]
                    c=torka[1]
                    slicnosti[item][key]=a,b,c
    return slicnosti
if __name__ == "__main__":
    korisnik1=input()
    korisnik2=input()
    # korisnik1='Mick LaSalle'
    # korisnik2='Lisa Rose'
    # print oceniPoKorisnici
    tabela=TabelaNaSlicniKorisnici(oceniPoKorisnici)
    # print tabela
    print tabela[korisnik1][korisnik2]

------------------------------------------------------------------------------------------------------
Sistemi za preporaka
# -*- coding: utf-8 -*-

"""
Да се напише функција која во зависност од бројот на рангирани филмови на корисникот
ќе одбира начинот на препорачување - дали item-based или user-based. Функцијата треба да прима аргумент
име на корисникот и бројот n од стандарден влез. Ако бројот на рангирани филмови на корисникот е помал од n
препорачува на со item-based начин, а ако е поголем или еднаков на n да препорачува на user-based начин. На излез да
се печати одбраниот начин (user-based или item-based), и во вториот ред да се испечати листа од препорачани филмови која
ги содржи само имињата сортирани во растечки (азбучен) редослед.

"""


oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }

import math

# Vrakja merka za slicnost bazirana na rastojanieto za person1 i person2
def sim_distance(oceni, person1, person2):
    # Se pravi lista na zaednicki predmeti (filmovi)

    filmovi1=set(oceni[person1].keys())
    filmovi2=set(oceni[person2].keys())
    zaednicki = filmovi1.intersection(filmovi2)
#     print(filmovi1)
#     print(filmovi2)
#     print(zaednicki)
#     for item in oceni[person1].keys():
#         if item in oceni[person2]:
#             zaednicki.add(item)
#     # ako nemaat zaednicki rejtinzi, vrati 0
    if len(zaednicki) == 0: return 0
#     # Soberi gi kvadratite na zaednickite razliki
    suma = 0.0
    for item in zaednicki:
        ocena1 = oceni[person1][item]
        ocena2 = oceni[person2][item]
        suma += (ocena1 - ocena2) ** 2
#         print(item, person1, ocena1, person2, ocena2)

    return 1 / (1 + math.sqrt(suma))

def sim_pearson(oceni, p1, p2):
    # Se kreira recnik vo koj ke se cuvaat predmetite (filmovi) koi se oceneti od dvajcata
    # Vo recnikot ni se vazni samo klucevite za da gi cuvame iminjata na filmovite koi se zaednicki, a vrednostite ne ni se vazni
    zaednicki = set()
    for item in oceni[p1]:
        if item in oceni[p2]:
            zaednicki.add(item)

    # Se presmetuva brojot na predmeti oceneti od dvajcata
    n = len(zaednicki)

    # Ako nemaat zaednicki predmeti vrati korelacija 0
    if n == 0: return 0

    # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1 = 0
    sum2 = 0

    # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1Sq = 0
    sum2Sq = 0

    # Soberi gi proizvodite od ocenite na dvete licnosti
    pSum = 0
    for item in zaednicki:
        ocena1 = oceni[p1][item]
        ocena2 = oceni[p2][item]
        sum1 += ocena1
        sum1Sq += ocena1 ** 2
        sum2 += ocena2
        sum2Sq += ocena2 ** 2
        pSum += ocena1 * ocena2

    # Presmetaj go koeficientot na korelacija
    num = pSum - (sum1 * sum2 / n)
    den = math.sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0: return 0
    r = num / den
    return r

def topMatches(oceni, person, n=5, similarity=sim_pearson):
    scores = []
    for person2 in oceni.keys():
        if person != person2:
            s = similarity(oceni, person, person2)
            scores.append((s, person2))
    # Se sortira listata vo rastecki redosled
    scores.sort()
    # Se prevrtuva za najslicnite (so najgolema vrednost) da bidat prvi
    scores.reverse()
    if n is None:
        return scores
    else:
        return scores[0:n]

def getUserBasedRecomendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
    totals = {}
    simSums = {}
    for person2 in oceni.keys():
        # Za da ne se sporeduva so samiot sebe
        if person2 == person: continue
        filmovi1=set(oceni[person].keys())
        filmovi2=set(oceni[person2].keys())
        zaednicki = filmovi1.intersection(filmovi2)
        # ova e ako se bara minimum zaednicki filmovi
        # za da se zemat vo predvid ocenite na drugiot korisnik
        if min_zaednicki and len(zaednicki)<min_zaednicki:
            # print('So korisnikot', person2, 'imame samo',len(zaednicki),'filmovi, pa go preskoknuvame')
            continue
        sim = similarity(oceni, person, person2)
#         print(person,person2,sim)
        # ne se zemaat vo predvid rezultati <= 0
        if sim <= 0: continue
        # print(person,person2,sim)
        for item in oceni[person2].keys():
#             print(item, oceni[person].get(item, None), oceni[person2].get(item, None))
            # za tekovniot korisnik gi zemame samo filmovite sto gi nemame veke gledano
            if item not in oceni[person]: # or oceni[person][item] == 0:
                # similarity * Score   (Slicnost * Ocena)
                # print(item, sim, oceni[person2][item], sim* oceni[person2][item])
                totals.setdefault(item, 0)
                totals[item] += oceni[person2][item] * sim

                # Sumuma na slicnosti
                simSums.setdefault(item, 0)
                simSums[item] += sim
        # print()
    # return
    # print()
    # Kreiranje na normalizirana lista so rejtinzi
    # rankings = [(total / simSums[item], item) for item, total in totals.items()]
    rankings = []
    for item, weighted_score in totals.items():
        sim_total = simSums[item]
        my_score = round(weighted_score / sim_total, 1)
        # print(item, weighted_score, sim_total, my_score)
        rankings.append((my_score, item))

    # Sortiranje na listata vo rastecki redosled
    rankings.sort(reverse=True)
    # Prevrtuvanje na lista za najgolemite vrednosti da bidat napred
#     rankings.reverse()
    a = [item[1] for item in rankings][0:3]
    a.sort()
    return a

def transformoceni(oceni):
    result = {}
    for person in oceni.keys():
        for item in oceni[person]:
            result.setdefault(item, {})
            # Zameni gi mestata na licnosta i predmetot
            result[item][person] = oceni[person][item]
    return result

def getItemBasedRecomendations(critics, person1, n=3):
    oceni_po_film = transformoceni(critics)
    similarity_per_item = {}
    for item in critics[person1].keys():
        similar_items = topMatches(oceni_po_film, item, n=None)
        my_rating = critics[person1][item]

        for similarity, item2 in similar_items:
            if item2 in critics[person1] or similarity <= 0:
#                 print('Slicnost', similarity, 'na', item,'so', item2)
                continue
            weight= similarity * my_rating
#             print('Slicnost', similarity, 'na', item,'so', item2, weight)
            similarity_per_item.setdefault(item2, [])
            similarity_per_item[item2].append(weight)
#         print(item, my_rating, list(similarity_per_item.items()))
    similarity_per_item_avg = []
    import numpy as np
    for item in similarity_per_item:
        #print(item, similarity_per_item[item])
        avg_sim = np.mean(similarity_per_item[item])
        similarity_per_item_avg.append((avg_sim, item))
    #similarity_per_item_avg.sort(reverse=True)

    similarity_per_item_avg.sort(reverse=True)
    novi = [] #[item[1] for item in similarity_per_item_avg if item[1] > 0][0:3]

    for item in similarity_per_item_avg:
        # print item
        novi.append(item[1])
    novi = novi[0:3]
    novi.sort()
    return novi

    #return li #similarity_per_item_avg[:n]

if __name__ == '__main__':

    k = input()
    n = input()

    long = len(oceniPoKorisnici[k].keys())
    if long < n:
        print 'item-based\n', getItemBasedRecomendations(oceniPoKorisnici,k)
    else:
        print 'user-based\n', getUserBasedRecomendations(oceniPoKorisnici, k)

------------------------------------------------------------------------------------------------------
Sistemi za preporaka

"""
По изработка на задачите од претходната вежба веќе ќе имате две тренинг множества претставени во Python како речник од речници. Искористете ги за изработка на систем за препораки така што да може на секој од тест корисниците да им предложи по 3 филмови, еднаш користејќи item-based, а еднаш user-based препораки. При item-based пристапот се предлагаат фимови кои ги нема гледано корисникот кои се со позитивна сличност со некои од филмовите кои ги има гледано. На излез треба да се печатат две листи кои ги содржат само имињата на предложените филмови во растечки (азбучен) редослед. Првата листа е според user-based, а втората според item-based пристап.

"""

critics={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }


from math import sqrt

def sim_distance(oceni, person1, person2):
    filmovi1=set(oceni[person1].keys())
    filmovi2=set(oceni[person2].keys())
    zaednicki = filmovi1.intersection(filmovi2)
    #print(filmovi1)
    #print(filmovi2)
    #print(zaednicki)

    for item in oceni[person1].keys():
         if item in oceni[person2]:
             zaednicki.add(item)

    if len(zaednicki) == 0: return 0

    suma = 0.0
    for item in zaednicki:
        ocena1 = oceni[person1][item]
        ocena2 = oceni[person2][item]
        suma += (ocena1 - ocena2) ** 2
#         print(item, person1, ocena1, person2, ocena2)

    return 1 / (1 + sqrt(suma))

def sim_pearson(oceni, p1, p2):
    zaednicki = set()
    for item in oceni[p1]:
        if item in oceni[p2]:
            zaednicki.add(item)

    # Se presmetuva brojot na predmeti oceneti od dvajcata
    n = len(zaednicki)

    # Ako nemaat zaednicki predmeti vrati korelacija 0
    if n == 0: return 0

    # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1 = 0
    sum2 = 0

    # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1Sq = 0
    sum2Sq = 0

    # Soberi gi proizvodite od ocenite na dvete licnosti
    pSum = 0
    for item in zaednicki:
        ocena1 = oceni[p1][item]
        ocena2 = oceni[p2][item]
        sum1 += ocena1
        sum1Sq += ocena1 ** 2
        sum2 += ocena2
        sum2Sq += ocena2 ** 2
        pSum += ocena1 * ocena2

    # Presmetaj go koeficientot na korelacija
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0: return 0
    r = num / den
    return r

def transformPrefs(oceni):
    result = {}
    for person in oceni.keys():
        for item in oceni[person]:
            result.setdefault(item, {})
            # Zameni gi mestata na licnosta i predmetot
            result[item][person] = oceni[person][item]
    return result

def topMatches(oceni, person, n=5, similarity=sim_pearson):
    scores = []
    for person2 in oceni.keys():
        if person != person2:
            s = similarity(oceni, person, person2)
            scores.append((s, person2))
    # Se sortira listata vo rastecki redosled
    scores.sort()
    # Se prevrtuva za najslicnite (so najgolema vrednost) da bidat prvi
    scores.reverse()
    if n is None:
        return scores
    else:
        return scores[0:n]

def getRecommendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
    totals = {}
    simSums = {}
    for person2 in oceni.keys():

        filmovi1 = set(oceni[person].keys())
        filmovi2 = set(oceni[person2].keys())
        zaednicki = filmovi1.intersection(filmovi2)
        # ova e ako se bara minimum zaednicki filmovi
        # za da se zemat vo predvid ocenite na drugiot korisnik
        if min_zaednicki and len(zaednicki) < min_zaednicki:
            print('So korisnikot', person2, 'imame samo', len(zaednicki), 'filmovi, pa go preskoknuvame')
            continue

        if person2 == person:
            continue

        sim = similarity(oceni, person, person2)

        if sim <= 0:
            continue

        for item in oceni[person2]:
            if item not in oceni[person] or oceni[person][item] == 0:
                # similarity * Score   (Slicnost * Ocena)
                #print(item, sim, oceni[person2][item], sim* oceni[person2][item])
                totals.setdefault(item, 0)
                totals[item] += oceni[person2][item] * sim

                # Sumuma na slicnosti
                simSums.setdefault(item, 0)
                simSums[item] += sim

    rankings = [(total / simSums[item], item) for item, total in totals.items()]
    #rankings = [item for item,v in totals.items()]
    rankings.sort(reverse=True)
    a = []
    for i in rankings:
        a.append(i[1])
    a = a[:3]
    a.sort()
    return a#sorted(rankings[0:3])

def getItemBasedRecomendations(oceni,korisnik,similarity=sim_pearson):
    rankings = []
    filmovi = oceni.keys()
    gledani = [item for item in filmovi if critics[korisnik].has_key(item)]
    negledani = [item for item in filmovi if not critics[korisnik].has_key(item)]
    #print gledani
    #print negledani
    slicnosti = {}
    for film in negledani:
        for drug in gledani:
            sim = similarity(oceni, film, drug)
            slicnosti.setdefault(film, 0);
            if slicnosti[film] < sim:
                slicnosti[film] = sim
    stvari = slicnosti.items()
    #print stvari
    stvari.sort(key=lambda tup: tup[1], reverse=True)
    novi = [item[0] for item in stvari if item[1] > 0][0:3]
    novi.sort()
    return novi

def item_based(critics, person1, n=3):
    oceni_po_film = transformPrefs(critics)
    similarity_per_item = {}
    for item in critics[person1].keys():
        similar_items = topMatches(oceni_po_film, item, n=None)
        my_rating = critics[person1][item]

        for similarity, item2 in similar_items:
            if item2 in critics[person1] or similarity <= 0:
#                 print('Slicnost', similarity, 'na', item,'so', item2)
                continue
            weight= similarity * my_rating
#             print('Slicnost', similarity, 'na', item,'so', item2, weight)
            similarity_per_item.setdefault(item2, [])
            similarity_per_item[item2].append(weight)
#         print(item, my_rating, list(similarity_per_item.items()))
    similarity_per_item_avg = []
    import numpy as np
    for item in similarity_per_item:
        #print(item, similarity_per_item[item])
        avg_sim = np.mean(similarity_per_item[item])
        similarity_per_item_avg.append((avg_sim, item))
    similarity_per_item_avg.sort(reverse=True)
    a = []
    for i in similarity_per_item_avg:
        a.append(i[1])
    a = a[:n]
    a.sort()
    return a #similarity_per_item_avg[:n]

if __name__ == "__main__":
    korisnik = input()

    print "user-based: " + str(getRecommendations(critics, korisnik))
    print "item-based: " + str(item_based(critics=critics, person1=korisnik))

------------------------------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

"""
Да испрограмира функција за косинусна сличност која е дефинирана со следнава формула, каде A е листа со оцените на едниот корисник или филм, а B е листа со оцените на другиот корисник или филм:
enter image description here
Притоа треба да се избегне делење со нула и во тој случај да се смета дека сличноста е -1.
Речник со оцени на корисници по филмови треба е веќе даден. Од стандардниот влез се вчитува име на еден филм. Да се испечати сличноста на прочитаниот филм со секој друг филм (освен самиот со себе) така што ќе се печати:
Филм 2
Косинусна сличност, Пеарсонова сличност, Евклидова сличност
Празна линија
При печатењето филмовите треба да бидат подредени по азбучен редослед. Сите сличности треба да бидат заокружени на 2 децимали.
"""


from math import sqrt


oceniPoKorisnici={
    'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5,  'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
    }


def sim_cos(oceni, p1, p2):

    zaednicki = set()
    for item in oceni[p1]:
        if item in oceni[p2]:
            zaednicki.add(item)

    #print(zaednicki)
    #return

    # Se presmetuva brojot na predmeti oceneti od dvajcata
    n = len(zaednicki)

    # Ako nemaat zaednicki predmeti vrati korelacija 0
    if n == 0: return 0

    # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1 = 0
    sum2 = 0

    # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1Sq = 0
    sum2Sq = 0

    # Soberi gi proizvodite od ocenite na dvete licnosti
    pSum = 0
    for item in zaednicki:
        ocena1 = oceni[p1][item]
        ocena2 = oceni[p2][item]

        sum1 += ocena1
        sum1Sq += ocena1 ** 2
        sum2 += ocena2
        sum2Sq += ocena2 ** 2
        pSum += ocena1 * ocena2

    # Presmetaj go koeficientot na korelacija
    num = pSum
    den = sqrt(sum1Sq) * sqrt(sum2Sq)
    if den == 0: return -1
    r = num / den
    return round(r,2)

def sim_distance(oceni, person1, person2):
    # Se pravi lista na zaednicki predmeti (filmovi)

    filmovi1=set(oceni[person1].keys())
    filmovi2=set(oceni[person2].keys())
    zaednicki = filmovi1.intersection(filmovi2)
#     print(filmovi1)
#     print(filmovi2)
#     print(zaednicki)
#     for item in oceni[person1].keys():
#         if item in oceni[person2]:
#             zaednicki.add(item)
#     # ako nemaat zaednicki rejtinzi, vrati 0
    if len(zaednicki) == 0: return 0
#     # Soberi gi kvadratite na zaednickite razliki
    suma = 0.0
    for item in zaednicki:
        ocena1 = oceni[person1][item]
        ocena2 = oceni[person2][item]
        suma += (ocena1 - ocena2) ** 2
#         print(item, person1, ocena1, person2, ocena2)

    return round(1 / (1 + sqrt(suma)),2)

def sim_pearson(oceni, p1, p2):
    # Se kreira recnik vo koj ke se cuvaat predmetite (filmovi) koi se oceneti od dvajcata
    # Vo recnikot ni se vazni samo klucevite za da gi cuvame iminjata na filmovite koi se zaednicki, a vrednostite ne ni se vazni
    zaednicki = set()
    for item in oceni[p1]:
        if item in oceni[p2]:
            zaednicki.add(item)

    # Se presmetuva brojot na predmeti oceneti od dvajcata
    n = len(zaednicki)

    # Ako nemaat zaednicki predmeti vrati korelacija 0
    if n == 0: return 0

    # Soberi gi zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1 = 0
    sum2 = 0

    # Soberi gi kvadratite od zaednickite oceni (rejtinzi) za  sekoja licnost posebno
    sum1Sq = 0
    sum2Sq = 0

    # Soberi gi proizvodite od ocenite na dvete licnosti
    pSum = 0
    for item in zaednicki:
        ocena1 = oceni[p1][item]
        ocena2 = oceni[p2][item]
        sum1 += ocena1
        sum1Sq += ocena1 ** 2
        sum2 += ocena2
        sum2Sq += ocena2 ** 2
        pSum += ocena1 * ocena2

    # Presmetaj go koeficientot na korelacija
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0: return 0
    r = num / den
    return round(r,2)

def topMatches(oceni, person, n=5, similarity=sim_pearson):
    scores = []
    for person2 in oceni.keys():
        if person != person2:
            s = similarity(oceni, person, person2)
            scores.append((s, person2))
    # Se sortira listata vo rastecki redosled
    scores.sort()
    # Se prevrtuva za najslicnite (so najgolema vrednost) da bidat prvi
    scores.reverse()
    if n is None:
        return scores
    else:
        return scores[0:n]

def transformoceni(oceni):
    result = {}
    for person in oceni.keys():
        for item in oceni[person]:
            result.setdefault(item, {})
            # Zameni gi mestata na licnosta i predmetot
            result[item][person] = oceni[person][item]
    return result

def getRecommendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
    totals = {}
    simSums = {}
    for person2 in oceni.keys():
        # Za da ne se sporeduva so samiot sebe
        if person2 == person: continue
        filmovi1=set(oceni[person].keys())
        filmovi2=set(oceni[person2].keys())
        zaednicki = filmovi1.intersection(filmovi2)
        # ova e ako se bara minimum zaednicki filmovi
        # za da se zemat vo predvid ocenite na drugiot korisnik
        if min_zaednicki and len(zaednicki)<min_zaednicki:
            print('So korisnikot', person2, 'imame samo',len(zaednicki),'filmovi, pa go preskoknuvame')
            continue
        sim = similarity(oceni, person, person2)
#         print(person,person2,sim)
        # ne se zemaat vo predvid rezultati <= 0
        if sim <= 0: continue
        print(person,person2,sim)
        for item in oceni[person2].keys():
#             print(item, oceni[person].get(item, None), oceni[person2].get(item, None))
            # za tekovniot korisnik gi zemame samo filmovite sto gi nemame veke gledano
            if item not in oceni[person]: # or oceni[person][item] == 0:
                # similarity * Score   (Slicnost * Ocena)
                print(item, sim, oceni[person2][item], sim* oceni[person2][item])
                totals.setdefault(item, 0)
                totals[item] += oceni[person2][item] * sim

                # Sumuma na slicnosti
                simSums.setdefault(item, 0)
                simSums[item] += sim
        print()
#     return
    print()
    # Kreiranje na normalizirana lista so rejtinzi
    # rankings = [(total / simSums[item], item) for item, total in totals.items()]
    rankings = []
    for item, weighted_score in totals.items():
        sim_total = simSums[item]
        my_score = round(weighted_score / sim_total, 1)
        print(item, weighted_score, sim_total, my_score)
        rankings.append((my_score, item))

    # Sortiranje na listata vo rastecki redosled
    rankings.sort(reverse=True)
    # Prevrtuvanje na lista za najgolemite vrednosti da bidat napred
#     rankings.reverse()
    return rankings

def item_based(critics, person1, n=3):
    oceni_po_film = transformoceni(critics)
    similarity_per_item = {}
    for item in critics[person1].keys():
        similar_items = topMatches(oceni_po_film, item, n=None)
        my_rating = critics[person1][item]

        for similarity, item2 in similar_items:
            if item2 in critics[person1] or similarity <= 0:
#                 print('Slicnost', similarity, 'na', item,'so', item2)
                continue
            weight= similarity * my_rating
#             print('Slicnost', similarity, 'na', item,'so', item2, weight)
            similarity_per_item.setdefault(item2, [])
            similarity_per_item[item2].append(weight)
#         print(item, my_rating, list(similarity_per_item.items()))
    similarity_per_item_avg = []
    import numpy as np
    for item in similarity_per_item:
        print(item, similarity_per_item[item])
        avg_sim = np.mean(similarity_per_item[item])
        similarity_per_item_avg.append((avg_sim, item))
    similarity_per_item_avg.sort(reverse=True)
    return similarity_per_item_avg[:n]

def transformoceni(oceni):
    result = {}
    for person in oceni.keys():
        for item in oceni[person]:
            result.setdefault(item, {})
            # Zameni gi mestata na licnosta i predmetot
            result[item][person] = oceni[person][item]
    return result


if __name__ == '__main__':

    film = 'Catch Me If You Can'

    # film = input()

    movie_base=transformoceni(oceniPoKorisnici)

    for k in sorted(movie_base.keys()):
        if film == k:
            continue
        else:
            print(k)
            print sim_cos(movie_base,film,k), sim_pearson(movie_base,film,k), sim_distance(movie_base,film,k)
            print

------------------------------------------------------------------------------------------------------
Sistemi za preporaka januari 2017

За корисникот внесен на влез да се препорача филм. Да се користи Пирсонов коефициент на корелација како мерка.
Ако корисникот го нема во базата да се препорача најгледаниот филм. Доколку корисникот има гледано повеќе од 5 филмови
, да се препорача според филмовите, во спротивно да се препорача според корисниците кои се слични со него.

from __future__ import print_function
import json
from math import sqrt

# A dictionary of movie critics and their ratings of a small set of movies
critics = {
    'Lisa Rose': {'Catch Me If You Can': 3.0, 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5,
                  'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,
                     'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5, 'Superman Returns': 3.5,
                         'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 'The Night Listener': 4.5, 'Superman Returns': 4.0,
                     'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 'Just My Luck': 2.0, 'Superman Returns': 3.0,
                     'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                      'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane': 4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck': 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5,
                         'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5,
                     'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
}

def sim_distance(oceni,person1,person2):
    si={}
    for item in oceni[person1]:
        if item in oceni[person2]:
            si[item]=1
    if len(si)==0: return 0
    sum_of_squares=sum([pow(oceni[person1][item]-oceni[person2][item],2)
    for item in oceni[person1] if item in oceni[person2]])
    return 1/(1+sqrt(sum_of_squares))

def sim_pearson(oceni,person1,person2):
    si={}
    for item in oceni[person1]:
        if item in oceni[person2]: si[item]=1
    n=len(si)
    if n==0: return 0
    sum1=sum([oceni[person1][it] for it in si])
    sum2=sum([oceni[person2][it] for it in si])
    sum1Sq=sum([pow(oceni[person1][it],2) for it in si])
    sum2Sq=sum([pow(oceni[person2][it],2) for it in si])
    pSum=sum([oceni[person1][it]*oceni[person2][it] for it in si])
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return r

def transformPrefs(prefs):
    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            result[item][person]=prefs[person][item]
    return result

def topMatches(prefs,person,n=4,similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other)
            for other in prefs if other!=person]
    scores.sort()
    scores.reverse()
    return scores[0:n]

def getUserBasedRecommendations(oceni,korisnik,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in oceni:
        if other==korisnik: continue
        sim=similarity(oceni,korisnik,other)
        if sim<=0: continue
        for item in oceni[other]:
            if item not in oceni[korisnik] or oceni[korisnik][item]==0:
                totals.setdefault(item,0)
                totals[item]+=oceni[other][item]*sim
                simSums.setdefault(item,0)
                simSums[item]+=sim

    rankings=[(total/simSums[item],item) for item,total in totals.items()]
    rankings.sort()
    rankings.reverse()
    rankings = rankings[0:3]
    return rankings

def getItemBasedRecomendations(oceni,korisnik,similarity=sim_pearson):
    similar={}
    films=transformPrefs(oceni)
    for gledanFilm in oceni[korisnik]:
            similar_filmovi=topMatches(films,gledanFilm)
            for slicnost,slicen_film in similar_filmovi:
                if slicen_film not in oceni[korisnik] and (slicen_film not in similar or slicnost>similar[slicen_film]):
                    similar[slicen_film]=slicnost

    rankings=sorted(similar, key=similar.get)
    rankings.reverse()
    rankings=rankings[0:3]
    rankings.sort()
    return rankings


def transformoceni(oceni):
    result={}
    for person in oceni:
        for item in oceni[person]:
            result.setdefault(item,{})
            result[item][person]=oceni[person][item]
    return result


def item_based(critics, person1, n=3):
    oceni_po_film = transformoceni(critics)
    similarity_per_item = {}
    for item in critics[person1].keys():
        similar_items = topMatches(oceni_po_film, item, n=None)
        my_rating = critics[person1][item]
        for similarity, item in similar_items:
            if item in critics[person1] or similarity <= 0:
                continue
            similarity_per_item.setdefault(item, [])
            similarity_per_item[item].append(similarity * my_rating)
    similarity_per_item_avg = []
    import numpy as np
    for item in similarity_per_item:
        avg_sim = np.mean(similarity_per_item[item])
        similarity_per_item_avg.append((avg_sim, item))
    similarity_per_item_avg.sort(reverse=True)
    return similarity_per_item_avg[:n]

if __name__ == "__main__":
    korisnik = input()
    inverse = transformoceni(critics)
    korisniciIFilmovi = inverse.items()

    korisniciIFilmovi.sort(key=lambda tup: len(tup[1].items()), reverse=True)

    najgledan = korisniciIFilmovi[0][0]

    if not korisnik in critics:
        print(najgledan)
        exit()

    oceniNaKorisnik = critics[korisnik]
    filmoviNaKorisnik = oceniNaKorisnik.keys()

    imaPoveke = False
    for k in critics:
        if k != korisnik:
            brojac = 0
            for film in filmoviNaKorisnik:
                brojac += 1

            if brojac > 5:
                imaPoveke = True


    if imaPoveke:
        print (str(getItemBasedRecomendations(critics, korisnik)[0]))
    else:
        print (str(getUserBasedRecommendations(critics, korisnik)[0][1]))

------------------------------------------------------------------------------------------------------