Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #lab1 drva
- """
- Задача 1 Problem 1 (2 / 12)
- Да се промени класата за дрво на одлука да чува и информација на кое ниво во дрвото се наоѓа јазолот.
- Потоа да се променат и функциите за градење и печатење на дрвото така што за секој јазол ќе се печати
- и нивото. Коренот е на нулто ниво. На излез со функцијата printTree треба да се испечати даденото
- тренинг множество. Прочитана инстанца од стандарден влез да се додаде на тренинг множеството
- и потоа да се истренира и испечати истото.
- """
# Training set for lab 1.  Apparent columns: [referrer, country, read_FAQ,
# pages_visited, service_chosen]; the last column is the class label.
trainingData=[['slashdot','USA','yes',18,'None'],
              ['google','France','yes',23,'Premium'],
              ['google','France','yes',23,'Basic'],
              ['google','France','yes',23,'Basic'],
              ['digg','USA','yes',24,'Basic'],
              ['kiwitobes','France','yes',23,'Basic'],
              ['google','UK','no',21,'Premium'],
              ['(direct)','New Zealand','no',12,'None'],
              ['(direct)','UK','no',21,'Basic'],
              ['google','USA','no',24,'Premium'],
              ['slashdot','France','yes',19,'None'],
              ['digg','USA','no',18,'None'],
              ['google','UK','no',18,'None'],
              ['kiwitobes','UK','no',19,'None'],
              ['digg','New Zealand','yes',12,'Basic'],
              ['slashdot','UK','no',21,'None'],
              ['google','UK','yes',18,'Basic'],
              ['kiwitobes','France','yes',19,'Basic']]
class decisionnode:
    """One node of a decision tree.

    Interior nodes hold a split test (``col``/``value``) plus the two
    subtrees; leaves hold the class counts in ``results``.  ``l`` is the
    node's level in the tree (root prints as level 0).
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, l=None):
        self.l = l              # depth of this node; None for leaves/empty nodes
        self.col = col          # index of the column tested here (-1 = empty node)
        self.value = value      # value the column is compared against
        self.results = results  # leaf-only: {class_label: count}
        self.tb = tb            # subtree taken when the test is True
        self.fb = fb            # subtree taken when the test is False
def sporedi_broj(row, column, value):
    """Numeric split test: True when row[column] >= value."""
    return row[column] >= value

def sporedi_string(row, column, value):
    """Nominal split test: True when row[column] == value."""
    return row[column] == value

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Partition ``rows`` on ``column`` compared against ``value``.

    Numeric values use a >= test, everything else an equality test.
    Returns ``(matching_rows, non_matching_rows)``.
    """
    # Pick the comparison: >= for numbers, == for nominal values.
    if isinstance(value, int) or isinstance(value, float):
        split_function = sporedi_broj
    else:
        split_function = sporedi_string
    # Single pass over the data.  (The original also rebuilt both halves a
    # second time with list comprehensions and threw that result away.)
    set_true = []
    set_false = []
    for row in rows:
        if split_function(row, column, value):
            set_true.append(row)
        else:
            set_false.append(row)
    return (set_true, set_false)
# Create counts of possible results (the last column of
# each row is the result)
def uniquecounts(rows):
    """Tally the class labels (last column) of ``rows`` into a dict."""
    counts = {}
    for row in rows:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts

def log2(x):
    """Base-2 logarithm computed from natural logs."""
    from math import log
    return log(x) / log(2)

def entropy(rows):
    """Shannon entropy (in bits) of the class distribution in ``rows``."""
    counts = uniquecounts(rows)
    total = len(rows)
    ent = 0.0
    for cnt in counts.values():
        p = float(cnt) / total
        ent -= p * log2(p)
    return ent
def buildtree(rows, l=-1, scoref=entropy):
    """Recursively grow a decision tree over ``rows``.

    ``l`` tracks depth: the root call passes -1 and each split node stores
    ``l + 1``, so the root node carries level 0.  ``scoref`` measures
    impurity (entropy by default).  Leaves store class counts and no level.
    """
    if len(rows) == 0:
        return decisionnode()
    current_score = scoref(rows)
    # Track the best split found so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None
    n_features = len(rows[0]) - 1
    for col in range(n_features):
        # Distinct values of this column, in first-seen order.
        candidates = dict.fromkeys(row[col] for row in rows)
        for value in candidates:
            true_rows, false_rows = divideset(rows, col, value)
            # Information gain of this split.
            p = float(len(true_rows)) / len(rows)
            gain = current_score - p * scoref(true_rows) - (1 - p) * scoref(false_rows)
            if gain > best_gain and true_rows and false_rows:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (true_rows, false_rows)
    if best_gain > 0:
        # Interior node: increment the level before recursing.
        l = l + 1
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=buildtree(best_sets[0], l, scoref),
                            fb=buildtree(best_sets[1], l, scoref),
                            l=l)
    return decisionnode(results=uniquecounts(rows))
def printtree(tree, indent=''):
    # Pretty-print the tree (Python 2 print statements).
    # Is this a leaf node?
    if tree.results != None:
        print str(tree.results)
    else:
        # Print the split criteria plus the node's level (per the task).
        # NOTE: in Python 2 `print (A) + B` prints the whole concatenation,
        # so 'Level=...' ends up on the same line as the criteria.
        print(str(tree.col) + ':' + str(tree.value) + '? ') + 'Level=' + str(tree.l)
        # Print the branches; the trailing comma suppresses the newline.
        print(indent + 'T->'),
        printtree(tree.tb, indent + ' ')
        print(indent + 'F->'),
        printtree(tree.fb, indent + ' ')
def classify(observation, tree):
    """Walk the tree for ``observation``; at a leaf return its class counts."""
    if tree.results is not None:
        return tree.results
    vrednost = observation[tree.col]
    # Numeric columns use >=, nominal columns use equality.
    numeric = isinstance(vrednost, (int, float))
    if (vrednost >= tree.value) if numeric else (vrednost == tree.value):
        branch = tree.tb
    else:
        branch = tree.fb
    return classify(observation, branch)
if __name__ == "__main__":
    # Python 2 script: input() eval()s the typed text, so string attributes
    # must be entered quoted (e.g. 'google') and the page count as a number.
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen=input()
    # Per the task: append the read instance to the training set, retrain
    # and print the resulting tree (with levels).
    tmp = [referrer,location,readFAQ,pagesVisited,serviceChosen]
    trainingData.append(tmp)
    t = buildtree(trainingData)
    printtree(t)
- -------------------------------------------------------------------------------------------------------
- #lab2 Drva
- """
- Да се промени функцијата за предвидување, така што таа ќе ја печати само класата
- која ја предвидува (а не речник како сега). Притоа да се проверува дали во листот
- има повеќе од една класа. Ако има само една класа тогаш се предвидува истата, но
- ако има повеќе од една треба да се испечати таа со најголем број на инстанци. Ако
- во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
- """
# Training set for lab 2 (same data as lab 1).  Apparent columns:
# [referrer, country, read_FAQ, pages_visited, service_chosen];
# the last column is the class label.
trainingData=[['slashdot','USA','yes',18,'None'],
              ['google','France','yes',23,'Premium'],
              ['google','France','yes',23,'Basic'],
              ['google','France','yes',23,'Basic'],
              ['digg','USA','yes',24,'Basic'],
              ['kiwitobes','France','yes',23,'Basic'],
              ['google','UK','no',21,'Premium'],
              ['(direct)','New Zealand','no',12,'None'],
              ['(direct)','UK','no',21,'Basic'],
              ['google','USA','no',24,'Premium'],
              ['slashdot','France','yes',19,'None'],
              ['digg','USA','no',18,'None'],
              ['google','UK','no',18,'None'],
              ['kiwitobes','UK','no',19,'None'],
              ['digg','New Zealand','yes',12,'Basic'],
              ['slashdot','UK','no',21,'None'],
              ['google','UK','yes',18,'Basic'],
              ['kiwitobes','France','yes',19,'Basic']]
class decisionnode:
    """Decision-tree node: either a split test (``col``/``value`` plus the
    two branches) or a leaf carrying the class counts in ``results``."""

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col          # column index tested at this node
        self.value = value      # comparison value for the test
        self.results = results  # leaf-only: {class_label: count}
        self.tb = tb            # branch when the test is True
        self.fb = fb            # branch when the test is False
def sporedi_broj(row, column, value):
    """Numeric split test: is the field at ``column`` at least ``value``?"""
    field = row[column]
    return field >= value

def sporedi_string(row, column, value):
    """Nominal split test: does the field at ``column`` equal ``value``?"""
    field = row[column]
    return field == value

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Partition ``rows`` into (matching, non-matching) on ``column``.

    Numbers are compared with >=, anything else with equality.
    """
    if isinstance(value, (int, float)):
        split_function = sporedi_broj
    else:
        split_function = sporedi_string
    matching = []
    rest = []
    for row in rows:
        (matching if split_function(row, column, value) else rest).append(row)
    return (matching, rest)
- # Create counts of possible results
- #(the last column (vertical result) of each row is the result)
def uniquecounts(rows):
    """Count how many rows fall into each class (the last column)."""
    results = {}
    for row in rows:
        label = row[-1]
        results[label] = results.get(label, 0) + 1
    return results

# Entropy is the sum of -p(x)log2(p(x)) across all
# the different possible results
def entropy(rows):
    """Shannon entropy (base 2) of the class distribution in ``rows``."""
    from math import log
    total = len(rows)
    counts = uniquecounts(rows)
    ent = 0.0
    for cnt in counts.values():
        p = float(cnt) / total
        ent -= p * (log(p) / log(2))
    return ent
def buildtree(rows, scoref=entropy):
    """Greedily grow a decision tree over ``rows``.

    ``scoref`` is the impurity measure (entropy by default).  Splits are
    chosen by maximum information gain; leaves store the class counts.
    """
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)
    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_criteria = None
    best_sets = None
    column_count = len(rows[0]) - 1
    for col in range(0, column_count):
        # Candidate split values: the distinct values in this column.
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1
        # Try dividing the rows on each candidate value.
        for value in column_values.keys():
            (set1, set2) = divideset(rows, col, value)
            # Information gain
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)
    # Create the subbranches
    if best_gain > 0:
        # BUG FIX: propagate scoref to the recursive calls -- the original
        # omitted it, so any custom scoring function silently reverted to
        # entropy for every subtree.
        trueBranch = buildtree(best_sets[0], scoref)
        falseBranch = buildtree(best_sets[1], scoref)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))
def classify(observation, tree):
    """Predict a single class label for ``observation``.

    At a leaf the majority class is returned; ties are broken
    alphabetically, exactly as the task requires.  (The original computed
    ``brKlasi = len(torka)`` -- the length of a 2-tuple, always 2 -- and
    then sorted by class name, so it returned the alphabetically first
    class regardless of the counts.)
    """
    if tree.results != None:
        # Rank by descending count, then ascending class name.
        lista = [(-count, klasa) for klasa, count in tree.results.items()]
        lista.sort()
        return lista[0][1]
    else:
        vrednost = observation[tree.col]
        # Numeric attributes use >=, nominal ones use equality.
        if isinstance(vrednost, int) or isinstance(vrednost, float):
            branch = tree.tb if vrednost >= tree.value else tree.fb
        else:
            branch = tree.tb if vrednost == tree.value else tree.fb
        return classify(observation, branch)
if __name__ == "__main__":
    # Sample values kept from the original for reference:
    # referrer='slashdot'
    # location='UK'
    # readFAQ='no'
    # pagesVisited=21
    # serviceChosen='Unknown'
    # Python 2 input() eval()s the line, so strings must be typed quoted.
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    serviceChosen=input()
    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
    t=buildtree(trainingData)
    # Print the single predicted class (Python 2 print statement).
    print classify(testCase,t)
- -------------------------------------------------------------------------------------------------------
- #januari 2017
- """
- Да се промени алгоритмот за дрво на одлука така што ќе се изградат 2 дрва на одлука.
- Едното дрво на одлука ќе ја користи првата половина од податочното множество, а другото дрво,
- втората половина.
- Доколку двете дрва на одлука на тест примерот го дадат истиот резултат, да се испечати тој резултат.
- Доколку дадат различен резултат, да се испечати KONTRADIKCIJA.
- Доколку некое од дрвата има само една класа тогаш се предвидува истата,
- но ако има повеќе од една треба да се избере таа со најголем број на инстанци.
- Ако во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
- """
# Iris-style training set: each row is four flower measurements followed by
# the species name, which is the class label.  (The four numeric columns are
# presumably sepal length/width and petal length/width in cm -- not stated
# here; confirm against the original assignment.)
trainingData=[
    [6.3,2.9,5.6,1.8,'I. virginica'],
    [6.5,3.0,5.8,2.2,'I. virginica'],
    [7.6,3.0,6.6,2.1,'I. virginica'],
    [4.9,2.5,4.5,1.7,'I. virginica'],
    [7.3,2.9,6.3,1.8,'I. virginica'],
    [6.7,2.5,5.8,1.8,'I. virginica'],
    [7.2,3.6,6.1,2.5,'I. virginica'],
    [6.5,3.2,5.1,2.0,'I. virginica'],
    [6.4,2.7,5.3,1.9,'I. virginica'],
    [6.8,3.0,5.5,2.1,'I. virginica'],
    [5.7,2.5,5.0,2.0,'I. virginica'],
    [5.8,2.8,5.1,2.4,'I. virginica'],
    [6.4,3.2,5.3,2.3,'I. virginica'],
    [6.5,3.0,5.5,1.8,'I. virginica'],
    [7.7,3.8,6.7,2.2,'I. virginica'],
    [7.7,2.6,6.9,2.3,'I. virginica'],
    [6.0,2.2,5.0,1.5,'I. virginica'],
    [6.9,3.2,5.7,2.3,'I. virginica'],
    [5.6,2.8,4.9,2.0,'I. virginica'],
    [7.7,2.8,6.7,2.0,'I. virginica'],
    [6.3,2.7,4.9,1.8,'I. virginica'],
    [6.7,3.3,5.7,2.1,'I. virginica'],
    [7.2,3.2,6.0,1.8,'I. virginica'],
    [6.2,2.8,4.8,1.8,'I. virginica'],
    [6.1,3.0,4.9,1.8,'I. virginica'],
    [6.4,2.8,5.6,2.1,'I. virginica'],
    [7.2,3.0,5.8,1.6,'I. virginica'],
    [7.4,2.8,6.1,1.9,'I. virginica'],
    [7.9,3.8,6.4,2.0,'I. virginica'],
    [6.4,2.8,5.6,2.2,'I. virginica'],
    [6.3,2.8,5.1,1.5,'I. virginica'],
    [6.1,2.6,5.6,1.4,'I. virginica'],
    [7.7,3.0,6.1,2.3,'I. virginica'],
    [6.3,3.4,5.6,2.4,'I. virginica'],
    [5.1,3.5,1.4,0.2,'I. setosa'],
    [4.9,3.0,1.4,0.2,'I. setosa'],
    [4.7,3.2,1.3,0.2,'I. setosa'],
    [4.6,3.1,1.5,0.2,'I. setosa'],
    [5.0,3.6,1.4,0.2,'I. setosa'],
    [5.4,3.9,1.7,0.4,'I. setosa'],
    [4.6,3.4,1.4,0.3,'I. setosa'],
    [5.0,3.4,1.5,0.2,'I. setosa'],
    [4.4,2.9,1.4,0.2,'I. setosa'],
    [4.9,3.1,1.5,0.1,'I. setosa'],
    [5.4,3.7,1.5,0.2,'I. setosa'],
    [4.8,3.4,1.6,0.2,'I. setosa'],
    [4.8,3.0,1.4,0.1,'I. setosa'],
    [4.3,3.0,1.1,0.1,'I. setosa'],
    [5.8,4.0,1.2,0.2,'I. setosa'],
    [5.7,4.4,1.5,0.4,'I. setosa'],
    [5.4,3.9,1.3,0.4,'I. setosa'],
    [5.1,3.5,1.4,0.3,'I. setosa'],
    [5.7,3.8,1.7,0.3,'I. setosa'],
    [5.1,3.8,1.5,0.3,'I. setosa'],
    [5.4,3.4,1.7,0.2,'I. setosa'],
    [5.1,3.7,1.5,0.4,'I. setosa'],
    [4.6,3.6,1.0,0.2,'I. setosa'],
    [5.1,3.3,1.7,0.5,'I. setosa'],
    [4.8,3.4,1.9,0.2,'I. setosa'],
    [5.0,3.0,1.6,0.2,'I. setosa'],
    [5.0,3.4,1.6,0.4,'I. setosa'],
    [5.2,3.5,1.5,0.2,'I. setosa'],
    [5.2,3.4,1.4,0.2,'I. setosa'],
    [5.5,2.3,4.0,1.3,'I. versicolor'],
    [6.5,2.8,4.6,1.5,'I. versicolor'],
    [5.7,2.8,4.5,1.3,'I. versicolor'],
    [6.3,3.3,4.7,1.6,'I. versicolor'],
    [4.9,2.4,3.3,1.0,'I. versicolor'],
    [6.6,2.9,4.6,1.3,'I. versicolor'],
    [5.2,2.7,3.9,1.4,'I. versicolor'],
    [5.0,2.0,3.5,1.0,'I. versicolor'],
    [5.9,3.0,4.2,1.5,'I. versicolor'],
    [6.0,2.2,4.0,1.0,'I. versicolor'],
    [6.1,2.9,4.7,1.4,'I. versicolor'],
    [5.6,2.9,3.6,1.3,'I. versicolor'],
    [6.7,3.1,4.4,1.4,'I. versicolor'],
    [5.6,3.0,4.5,1.5,'I. versicolor'],
    [5.8,2.7,4.1,1.0,'I. versicolor'],
    [6.2,2.2,4.5,1.5,'I. versicolor'],
    [5.6,2.5,3.9,1.1,'I. versicolor'],
    [5.9,3.2,4.8,1.8,'I. versicolor'],
    [6.1,2.8,4.0,1.3,'I. versicolor'],
    [6.3,2.5,4.9,1.5,'I. versicolor'],
    [6.1,2.8,4.7,1.2,'I. versicolor'],
    [6.4,2.9,4.3,1.3,'I. versicolor'],
    [6.6,3.0,4.4,1.4,'I. versicolor'],
    [6.8,2.8,4.8,1.4,'I. versicolor'],
    [6.7,3.0,5.0,1.7,'I. versicolor'],
    [6.0,2.9,4.5,1.5,'I. versicolor'],
    [5.7,2.6,3.5,1.0,'I. versicolor'],
    [5.5,2.4,3.8,1.1,'I. versicolor'],
    [5.5,2.4,3.7,1.0,'I. versicolor'],
    [5.8,2.7,3.9,1.2,'I. versicolor'],
    [6.0,2.7,5.1,1.6,'I. versicolor'],
    [5.4,3.0,4.5,1.5,'I. versicolor'],
    [6.0,3.4,4.5,1.6,'I. versicolor'],
    [6.7,3.1,4.7,1.5,'I. versicolor'],
    [6.3,2.3,4.4,1.3,'I. versicolor'],
    [5.6,3.0,4.1,1.3,'I. versicolor'],
    [5.5,2.5,4.0,1.3,'I. versicolor'],
    [5.5,2.6,4.4,1.2,'I. versicolor'],
    [6.1,3.0,4.6,1.4,'I. versicolor'],
    [5.8,2.6,4.0,1.2,'I. versicolor'],
    [5.0,2.3,3.3,1.0,'I. versicolor'],
    [5.6,2.7,4.2,1.3,'I. versicolor'],
    [5.7,3.0,4.2,1.2,'I. versicolor'],
    [5.7,2.9,4.2,1.3,'I. versicolor'],
    [6.2,2.9,4.3,1.3,'I. versicolor'],
    [5.1,2.5,3.0,1.1,'I. versicolor'],
    [5.7,2.8,4.1,1.3,'I. versicolor'],
    [6.4,3.1,5.5,1.8,'I. virginica'],
    [6.0,3.0,4.8,1.8,'I. virginica'],
    [6.9,3.1,5.4,2.1,'I. virginica'],
    [6.7,3.1,5.6,2.4,'I. virginica'],
    [6.9,3.1,5.1,2.3,'I. virginica'],
    [5.8,2.7,5.1,1.9,'I. virginica'],
    [6.8,3.2,5.9,2.3,'I. virginica'],
    [6.7,3.3,5.7,2.5,'I. virginica'],
    [6.7,3.0,5.2,2.3,'I. virginica'],
    [6.3,2.5,5.0,1.9,'I. virginica'],
    [6.5,3.0,5.2,2.0,'I. virginica'],
    [6.2,3.4,5.4,2.3,'I. virginica'],
    [4.7,3.2,1.6,0.2,'I. setosa'],
    [4.8,3.1,1.6,0.2,'I. setosa'],
    [5.4,3.4,1.5,0.4,'I. setosa'],
    [5.2,4.1,1.5,0.1,'I. setosa'],
    [5.5,4.2,1.4,0.2,'I. setosa'],
    [4.9,3.1,1.5,0.2,'I. setosa'],
    [5.0,3.2,1.2,0.2,'I. setosa'],
    [5.5,3.5,1.3,0.2,'I. setosa'],
    [4.9,3.6,1.4,0.1,'I. setosa'],
    [4.4,3.0,1.3,0.2,'I. setosa'],
    [5.1,3.4,1.5,0.2,'I. setosa'],
    [5.0,3.5,1.3,0.3,'I. setosa'],
    [4.5,2.3,1.3,0.3,'I. setosa'],
    [4.4,3.2,1.3,0.2,'I. setosa'],
    [5.0,3.5,1.6,0.6,'I. setosa'],
    [5.1,3.8,1.9,0.4,'I. setosa'],
    [4.8,3.0,1.4,0.3,'I. setosa'],
    [5.1,3.8,1.6,0.2,'I. setosa'],
    [5.9,3.0,5.1,1.8,'I. virginica']
]
class decisionnode:
    """Decision-tree node: a split test (``col``/``value`` plus two
    branches) for interior nodes, or class counts in ``results`` for
    leaves."""

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col          # column index tested at this node
        self.value = value      # comparison value for the test
        self.results = results  # leaf-only: {class_label: count}
        self.tb = tb            # branch when the test is True
        self.fb = fb            # branch when the test is False
def sporedi_broj(row, column, value):
    """Numeric split test: True when row[column] >= value."""
    return row[column] >= value

def sporedi_string(row, column, value):
    """Nominal split test: True when row[column] == value."""
    return row[column] == value

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Partition ``rows`` on ``column`` compared against ``value``.

    Numeric values use a >= test, everything else an equality test.
    Returns ``(matching_rows, non_matching_rows)``.
    """
    # Pick the comparison: >= for numbers, == for nominal values.
    if isinstance(value, int) or isinstance(value, float):
        split_function = sporedi_broj
    else:
        split_function = sporedi_string
    # Single pass over the data.  (The original also rebuilt both halves a
    # second time with list comprehensions and threw that result away.)
    set_true = []
    set_false = []
    for row in rows:
        if split_function(row, column, value):
            set_true.append(row)
        else:
            set_false.append(row)
    return (set_true, set_false)
def uniquecounts(rows):
    """Tally the class labels (last column) of ``rows`` into a dict."""
    counts = {}
    for row in rows:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    return counts

def log2(x):
    """Base-2 logarithm computed from natural logs."""
    from math import log
    return log(x) / log(2)

def entropy(rows):
    """Shannon entropy (in bits) of the class distribution in ``rows``."""
    counts = uniquecounts(rows)
    total = len(rows)
    ent = 0.0
    for cnt in counts.values():
        p = float(cnt) / total
        ent -= p * log2(p)
    return ent
def buildtree(rows, scoref=entropy):
    """Greedily grow a decision tree over ``rows``.

    ``scoref`` measures impurity (entropy by default); the split with the
    highest information gain is chosen at each node, and leaves store the
    class counts.
    """
    if not rows:
        return decisionnode()
    current_score = scoref(rows)
    # (gain, criteria, sets) for the best split found so far.
    best = (0.0, None, None)
    feature_count = len(rows[0]) - 1
    for col in range(feature_count):
        # Candidate split values: the distinct values in this column.
        for value in {row[col] for row in rows}:
            true_rows, false_rows = divideset(rows, col, value)
            # Information gain of this split.
            p = float(len(true_rows)) / len(rows)
            gain = current_score - p * scoref(true_rows) - (1 - p) * scoref(false_rows)
            if gain > best[0] and true_rows and false_rows:
                best = (gain, (col, value), (true_rows, false_rows))
    gain, criteria, sets = best
    if gain > 0:
        return decisionnode(col=criteria[0], value=criteria[1],
                            tb=buildtree(sets[0], scoref),
                            fb=buildtree(sets[1], scoref))
    return decisionnode(results=uniquecounts(rows))
def classify(observation, tree):
    """Walk the tree for ``observation``; at a leaf return its class counts."""
    if tree.results is not None:
        return tree.results
    vrednost = observation[tree.col]
    # Numeric columns use >=, nominal columns use equality.
    numeric = isinstance(vrednost, (int, float))
    if (vrednost >= tree.value) if numeric else (vrednost == tree.value):
        branch = tree.tb
    else:
        branch = tree.fb
    return classify(observation, branch)
if __name__ == "__main__":
    # Python 2 script: input() eval()s the typed text, so the four numeric
    # attributes arrive as numbers and the class name must be typed quoted.
    att1=input()
    att2=input()
    att3=input()
    att4=input()
    planttype=input()
    testCase=[att1,att2,att3,att4,planttype]
    mn1=[] # first training set (first half of the data)
    mn2=[] # second training set (second half of the data)
    vkupno=len(trainingData) # total number of rows
    # NOTE: vkupno/2 is integer division only under Python 2 (// in Python 3).
    for i in range(0,vkupno/2): # fill the first training set
        mn1.append(trainingData[i])
    for i in range(vkupno/2,vkupno): # fill the second training set
        mn2.append(trainingData[i])
    drvo1=buildtree(mn1) # first tree, from the first half
    drvo2=buildtree(mn2) # second tree, from the second half
    kl1=classify(testCase,drvo1) # first prediction: a {class: count} dict
    kl2=classify(testCase,drvo2) # second prediction
    # NOTE(review): dict.keys()[0] is Python 2 only, and on a mixed leaf it
    # picks an arbitrary class -- the task's majority/alphabetical rule is
    # not implemented here.  The task also asks for KONTRADIKCIJA *instead
    # of* a class when the trees disagree, yet the branches below print a
    # class name before KONTRADIKCIJA; confirm against the expected output.
    if (kl1.keys()[0]==kl2.keys()[0]): # do both trees agree?
        print kl1.keys()[0]
    if(kl1.values()[0]>kl2.values()[0]):
        print kl1.keys()[0]
        print 'KONTRADIKCIJA'
    if(kl2.values()[0]>kl1.values()[0]):
        print kl1.keys()[0]
        print 'KONTRADIKCIJA'
- -------------------------------------------------------------------------------------------------------
- #januari 2018 - Drva
- Да се промени функцијата за предвидување, така што при изминувањето ќе печати информации за:
- -со која колона и вредност се споредува
- -за која е тековната вредност на тест примерокот за бараната колона
- -нивото на тековниот јазол во дрвото
- -која е следната гранка што ќе се изминува низ дрвото (True branch или False branch)
- -преостанатиот дел од дрвото што треба да се измине
- -празна линија
- Потоа да се испечати истренираното дрво, да се вчита непознат тренинг примерок од стандардниот влез
- и истиот да се класифицира со новата функција за предвидување.
# Training set for the January 2018 task.  Apparent columns:
# [referrer, country, read_FAQ, pages_visited, service_chosen];
# the last column is the class label.
trainingData=[['twitter','USA','yes',18,'None'],
              ['google','France','yes',23,'Premium'],
              ['google','France','no',26,'Basic'],
              ['google','Macedonia','yes',13,'None'],
              ['pinterest','USA','yes',24,'Basic'],
              ['bing','France','yes',23,'Basic'],
              ['google','UK','no',21,'Premium'],
              ['facebook','New Zealand','no',12,'None'],
              ['facebook','UK','no',21,'Basic'],
              ['google','USA','no',24,'Premium'],
              ['twitter','France','yes',19,'None'],
              ['pinterest','USA','no',18,'None'],
              ['google','UK','no',18,'None'],
              ['bing','UK','yes',19,'Premium'],
              ['bing','Macedonia','no',10,'None'],
              ['facebook','Macedonia','no',16,'Basic'],
              ['bing','UK','no',19,'Basic'],
              ['pinterest','Germany','no',2,'None'],
              ['pinterest','USA','yes',12,'Basic'],
              ['twitter','UK','no',21,'None'],
              ['twitter','UK','yes',26,'Premium'],
              ['google','UK','yes',18,'Basic'],
              ['bing','France','yes',19,'Basic']]
# Unlabeled examples (class 'Unknown') kept for manual testing.
test_cases=[['google','MK','no',24,'Unknown'],
            ['google','MK','no',15,'Unknown'],
            ['pinterest','UK','yes',21,'Unknown'],
            ['pinterest','UK','no',25,'Unknown']]
# trainingData=[line.split('\t') for line in file('decision_tree_example.txt')]
class decisionnode:
    """Decision-tree node; ``level`` records the node's depth (root = 0).

    Interior nodes hold a split test (``col``/``value``) plus branches;
    leaves hold the class counts in ``results``.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, level=0):
        self.col = col          # column index tested at this node
        self.value = value      # comparison value for the test
        self.results = results  # leaf-only: {class_label: count}
        self.tb = tb            # branch when the test is True
        self.fb = fb            # branch when the test is False
        # Depth in the tree.  (The original assigned self.level twice,
        # once at the top and once at the bottom of __init__.)
        self.level = level
def sporedi_broj(row, column, value):
    """Numeric split test: is the field at ``column`` at least ``value``?"""
    field = row[column]
    return field >= value

def sporedi_string(row, column, value):
    """Nominal split test: does the field at ``column`` equal ``value``?"""
    field = row[column]
    return field == value

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows, column, value):
    """Partition ``rows`` into (matching, non-matching) on ``column``.

    Numbers are compared with >=, anything else with equality.
    """
    if isinstance(value, (int, float)):
        test = sporedi_broj
    else:
        test = sporedi_string
    matching = []
    rest = []
    for row in rows:
        (matching if test(row, column, value) else rest).append(row)
    return (matching, rest)

# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset2(rows, column, value):
    """Same contract as divideset; kept as the exercise's second variant."""
    if isinstance(value, (int, float)):
        test = sporedi_broj
    else:
        test = sporedi_string
    branches = ([], [])
    for row in rows:
        branches[0 if test(row, column, value) else 1].append(row)
    return branches
- # Create counts of possible results (the last column of
- # each row is the result)
def uniquecounts(rows):
    """Count how many rows fall into each class (the last column)."""
    results = {}
    for row in rows:
        label = row[-1]
        results[label] = results.get(label, 0) + 1
    return results

# Entropy is the sum of -p(x)log2(p(x)) across all
# the different possible results
def entropy(rows):
    """Shannon entropy (base 2) of the class distribution in ``rows``."""
    from math import log
    total = len(rows)
    counts = uniquecounts(rows)
    ent = 0.0
    for cnt in counts.values():
        p = float(cnt) / total
        ent -= p * (log(p) / log(2))
    return ent
def buildtree(rows, scoref=entropy, level=0):
    """Greedily grow a decision tree over ``rows``.

    ``scoref`` is the impurity measure (entropy by default); ``level`` is
    the depth stored on each interior node (root = 0).  Leaves store class
    counts and keep the default level.
    """
    if len(rows)==0: return decisionnode()
    current_score=scoref(rows)
    # Set up some variables to track the best criteria
    best_gain=0.0
    best_criteria=None
    best_sets=None
    column_count=len(rows[0])-1
    for col in range(0,column_count):
        # Candidate split values: the distinct values in this column.
        column_values={}
        for row in rows:
            column_values[row[col]]=1
        # Try dividing the rows on each candidate value.
        for value in column_values.keys():
            (set1,set2)=divideset(rows,col,value)
            # Information gain
            p=float(len(set1))/len(rows)
            gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
            if gain>best_gain and len(set1)>0 and len(set2)>0:
                best_gain=gain
                best_criteria=(col,value)
                best_sets=(set1,set2)
    # Create the subbranches
    if best_gain>0:
        # BUG FIX: propagate scoref to the recursive calls -- the original
        # passed only level=level+1, silently reverting any custom scoring
        # function to entropy for every subtree.
        trueBranch=buildtree(best_sets[0], scoref, level+1)
        falseBranch=buildtree(best_sets[1], scoref, level+1)
        return decisionnode(col=best_criteria[0],value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch, level=level)
    else:
        return decisionnode(results=uniquecounts(rows))
def printtree(tree,indent=''):
    # Pretty-print the tree (Python 2 print statements).
    # Is this a leaf node?
    if tree.results!=None:
        print str(tree.results)
    else:
        # Interior node: show "col:value?" plus the node's level.
        print str(tree.col)+':'+str(tree.value)+'?' + ' Level='+str(tree.level)
        # Print the branches; the trailing comma suppresses the newline.
        print indent+'T->',
        printtree(tree.tb,indent+' ')
        print indent+'F->',
        printtree(tree.fb,indent+' ')
def classify(observation, tree):
    """Walk the tree and return a single class label for ``observation``.

    At a leaf the (count, label) pairs are ranked ascending, so on a mixed
    leaf this yields the label with the *fewest* instances, ties broken
    alphabetically.  NOTE(review): a majority vote was probably intended --
    confirm before relying on mixed leaves (pure leaves are unaffected).
    """
    if tree.results is not None:
        return min((count, label) for label, count in tree.results.items())[1]
    field = observation[tree.col]
    # Numeric attributes use >=, nominal ones use equality.
    if isinstance(field, (int, float)):
        branch = tree.tb if field >= tree.value else tree.fb
    else:
        branch = tree.tb if field == tree.value else tree.fb
    return classify(observation, branch)
def classify2(observation, tree):
    """Duplicate of classify kept by the exercise; identical behavior.

    Mixed leaves return the least-frequent label (alphabetical on ties) --
    NOTE(review): likely intended to be the majority class; confirm.
    """
    if tree.results is None:
        val = observation[tree.col]
        numeric = isinstance(val, (int, float))
        taken = (val >= tree.value) if numeric else (val == tree.value)
        return classify2(observation, tree.tb if taken else tree.fb)
    ranked = sorted((cnt, cls) for cls, cnt in tree.results.items())
    return ranked[0][1]
def classify3(observation,tree):
    # Predicting variant required by the 2018 task: at every interior node it
    # prints the column/value being compared, the observation's value for
    # that column, the branch taken, the remaining subtree (which includes
    # each node's level via printtree) and a blank line.
    if tree.results!=None:
        # Leaf: (count, label) pairs sorted ascending -- returns the label
        # with the *fewest* instances on mixed leaves (alphabetical on ties).
        # NOTE(review): the majority class was probably intended; confirm.
        results=[(value,key) for key,value in tree.results.items()]
        results.sort()
        return results[0][1]
    else:
        vrednost=observation[tree.col]
        branch=None
        granka='True branch'
        if isinstance(vrednost,int) or isinstance(vrednost,float):
            if vrednost>=tree.value:
                branch=tree.tb
            else:
                branch=tree.fb
                granka='False branch'
        else:
            if vrednost==tree.value:
                branch=tree.tb
            else:
                branch=tree.fb
                granka='False branch'
        # Python 2 print statements; messages are in Macedonian ("comparing
        # column and value", "current value", "next branch", "remaining
        # branch to traverse") and must stay as-is for the expected output.
        print 'Sporeduvam kolona i vrednost', (tree.col, tree.value)
        print 'Tekovna vrednost:', vrednost
        print 'Sledna granka:',granka
        print 'Preostanata granka za izminuvanje:'
        printtree(branch)
        print
        return classify3(observation,branch)
if __name__ == "__main__":
    # Python 2 input() eval()s the line, so strings must be typed quoted.
    referrer=input()
    location=input()
    readFAQ=input()
    pagesVisited=input()
    # The class of the instance to classify is unknown by definition.
    serviceChosen='Unknown'
    testCase=[referrer, location, readFAQ, pagesVisited, serviceChosen]
    t=buildtree(trainingData)
    # Print the trained tree, then classify with the tracing predictor,
    # which also prints the traversal details before the final label.
    printtree(t)
    print classify3(testCase,t)
- --------------------------------------------------------------------------------------------------------
- # -*- coding: utf-8 -*-
- #Дадено е податочно множество од риби кое ги содржи следните видови:
- # Code Finnish Swedish English Latin
- # 1 Lahna Braxen Bream Abramis brama
- # 2 Siika Iiden Whitewish Leusiscus idus
- # 3 Saerki Moerten Roach Leuciscus rutilus
- # 4 Parkki Bjoerknan ? Abramis bjrkna
- # 5 Norssi Norssen Smelt Osmerus eperlanus
- # 6 Hauki Jaedda Pike Esox lucius
- # 7 Ahven Abborre Perch Perca fluviatilis
- #Дадени се следните атрибути:
- # 0 Weight Weight of the fish (in grams)
- # 1 Length1 Length from the nose to the beginning of the tail (in cm)
- # 2 Length2 Length from the nose to the notch of the tail (in cm)
- # 3 Length3 Length from the nose to the end of the tail (in cm)
- # 4 Height% Maximal height as % of Length3
- # 5 Width% Maximal width as % of Length3
- #Класата е дадена во последната колона.
- #Да се направи модел за класификација за даденото податочно множество. За тренинг да се земат
- #само првите 5 примероци од секоја од класите во множеството. Притоа ова да се направи во програмата,
- # а не со рачно копирање! Да се класифицира елементот од податочното множество даден на влез и да се
- # испечати предвидувањето. Елементот е даден со индексот од податочното множество.
- data = [[242.0, 23.2, 25.4, 30.0, 38.4, 13.4, 1],
- [290.0, 24.0, 26.3, 31.2, 40.0, 13.8, 1],
- [340.0, 23.9, 26.5, 31.1, 39.8, 15.1, 1],
- [363.0, 26.3, 29.0, 33.5, 38.0, 13.3, 1],
- [430.0, 26.5, 29.0, 34.0, 36.6, 15.1, 1],
- [450.0, 26.8, 29.7, 34.7, 39.2, 14.2, 1],
- [500.0, 26.8, 29.7, 34.5, 41.1, 15.3, 1],
- [390.0, 27.6, 30.0, 35.0, 36.2, 13.4, 1],
- [450.0, 27.6, 30.0, 35.1, 39.9, 13.8, 1],
- [500.0, 28.5, 30.7, 36.2, 39.3, 13.7, 1],
- [475.0, 28.4, 31.0, 36.2, 39.4, 14.1, 1],
- [500.0, 28.7, 31.0, 36.2, 39.7, 13.3, 1],
- [500.0, 29.1, 31.5, 36.4, 37.8, 12.0, 1],
- [500.0, 29.5, 32.0, 37.3, 37.3, 13.6, 1],
- [600.0, 29.4, 32.0, 37.2, 40.2, 13.9, 1],
- [600.0, 29.4, 32.0, 37.2, 41.5, 15.0, 1],
- [700.0, 30.4, 33.0, 38.3, 38.8, 13.8, 1],
- [700.0, 30.4, 33.0, 38.5, 38.8, 13.5, 1],
- [610.0, 30.9, 33.5, 38.6, 40.5, 13.3, 1],
- [650.0, 31.0, 33.5, 38.7, 37.4, 14.8, 1],
- [575.0, 31.3, 34.0, 39.5, 38.3, 14.1, 1],
- [685.0, 31.4, 34.0, 39.2, 40.8, 13.7, 1],
- [620.0, 31.5, 34.5, 39.7, 39.1, 13.3, 1],
- [680.0, 31.8, 35.0, 40.6, 38.1, 15.1, 1],
- [700.0, 31.9, 35.0, 40.5, 40.1, 13.8, 1],
- [725.0, 31.8, 35.0, 40.9, 40.0, 14.8, 1],
- [720.0, 32.0, 35.0, 40.6, 40.3, 15.0, 1],
- [714.0, 32.7, 36.0, 41.5, 39.8, 14.1, 1],
- [850.0, 32.8, 36.0, 41.6, 40.6, 14.9, 1],
- [1000.0, 33.5, 37.0, 42.6, 44.5, 15.5, 1],
- [920.0, 35.0, 38.5, 44.1, 40.9, 14.3, 1],
- [955.0, 35.0, 38.5, 44.0, 41.1, 14.3, 1],
- [925.0, 36.2, 39.5, 45.3, 41.4, 14.9, 1],
- [975.0, 37.4, 41.0, 45.9, 40.6, 14.7, 1],
- [950.0, 38.0, 41.0, 46.5, 37.9, 13.7, 1],
- [270.0, 23.6, 26.0, 28.7, 29.2, 14.8, 2],
- [270.0, 24.1, 26.5, 29.3, 27.8, 14.5, 2],
- [306.0, 25.6, 28.0, 30.8, 28.5, 15.2, 2],
- [540.0, 28.5, 31.0, 34.0, 31.6, 19.3, 2],
- [800.0, 33.7, 36.4, 39.6, 29.7, 16.6, 2],
- [1000.0, 37.3, 40.0, 43.5, 28.4, 15.0, 2],
- [40.0, 12.9, 14.1, 16.2, 25.6, 14.0, 3],
- [69.0, 16.5, 18.2, 20.3, 26.1, 13.9, 3],
- [78.0, 17.5, 18.8, 21.2, 26.3, 13.7, 3],
- [87.0, 18.2, 19.8, 22.2, 25.3, 14.3, 3],
- [120.0, 18.6, 20.0, 22.2, 28.0, 16.1, 3],
- [0.0, 19.0, 20.5, 22.8, 28.4, 14.7, 3],
- [110.0, 19.1, 20.8, 23.1, 26.7, 14.7, 3],
- [120.0, 19.4, 21.0, 23.7, 25.8, 13.9, 3],
- [150.0, 20.4, 22.0, 24.7, 23.5, 15.2, 3],
- [145.0, 20.5, 22.0, 24.3, 27.3, 14.6, 3],
- [160.0, 20.5, 22.5, 25.3, 27.8, 15.1, 3],
- [140.0, 21.0, 22.5, 25.0, 26.2, 13.3, 3],
- [160.0, 21.1, 22.5, 25.0, 25.6, 15.2, 3],
- [169.0, 22.0, 24.0, 27.2, 27.7, 14.1, 3],
- [161.0, 22.0, 23.4, 26.7, 25.9, 13.6, 3],
- [200.0, 22.1, 23.5, 26.8, 27.6, 15.4, 3],
- [180.0, 23.6, 25.2, 27.9, 25.4, 14.0, 3],
- [290.0, 24.0, 26.0, 29.2, 30.4, 15.4, 3],
- [272.0, 25.0, 27.0, 30.6, 28.0, 15.6, 3],
- [390.0, 29.5, 31.7, 35.0, 27.1, 15.3, 3],
- [55.0, 13.5, 14.7, 16.5, 41.5, 14.1, 4],
- [60.0, 14.3, 15.5, 17.4, 37.8, 13.3, 4],
- [90.0, 16.3, 17.7, 19.8, 37.4, 13.5, 4],
- [120.0, 17.5, 19.0, 21.3, 39.4, 13.7, 4],
- [150.0, 18.4, 20.0, 22.4, 39.7, 14.7, 4],
- [140.0, 19.0, 20.7, 23.2, 36.8, 14.2, 4],
- [170.0, 19.0, 20.7, 23.2, 40.5, 14.7, 4],
- [145.0, 19.8, 21.5, 24.1, 40.4, 13.1, 4],
- [200.0, 21.2, 23.0, 25.8, 40.1, 14.2, 4],
- [273.0, 23.0, 25.0, 28.0, 39.6, 14.8, 4],
- [300.0, 24.0, 26.0, 29.0, 39.2, 14.6, 4],
- [6.7, 9.3, 9.8, 10.8, 16.1, 9.7, 5],
- [7.5, 10.0, 10.5, 11.6, 17.0, 10.0, 5],
- [7.0, 10.1, 10.6, 11.6, 14.9, 9.9, 5],
- [9.7, 10.4, 11.0, 12.0, 18.3, 11.5, 5],
- [9.8, 10.7, 11.2, 12.4, 16.8, 10.3, 5],
- [8.7, 10.8, 11.3, 12.6, 15.7, 10.2, 5],
- [10.0, 11.3, 11.8, 13.1, 16.9, 9.8, 5],
- [9.9, 11.3, 11.8, 13.1, 16.9, 8.9, 5],
- [9.8, 11.4, 12.0, 13.2, 16.7, 8.7, 5],
- [12.2, 11.5, 12.2, 13.4, 15.6, 10.4, 5],
- [13.4, 11.7, 12.4, 13.5, 18.0, 9.4, 5],
- [12.2, 12.1, 13.0, 13.8, 16.5, 9.1, 5],
- [19.7, 13.2, 14.3, 15.2, 18.9, 13.6, 5],
- [19.9, 13.8, 15.0, 16.2, 18.1, 11.6, 5],
- [200.0, 30.0, 32.3, 34.8, 16.0, 9.7, 6],
- [300.0, 31.7, 34.0, 37.8, 15.1, 11.0, 6],
- [300.0, 32.7, 35.0, 38.8, 15.3, 11.3, 6],
- [300.0, 34.8, 37.3, 39.8, 15.8, 10.1, 6],
- [430.0, 35.5, 38.0, 40.5, 18.0, 11.3, 6],
- [345.0, 36.0, 38.5, 41.0, 15.6, 9.7, 6],
- [456.0, 40.0, 42.5, 45.5, 16.0, 9.5, 6],
- [510.0, 40.0, 42.5, 45.5, 15.0, 9.8, 6],
- [540.0, 40.1, 43.0, 45.8, 17.0, 11.2, 6],
- [500.0, 42.0, 45.0, 48.0, 14.5, 10.2, 6],
- [567.0, 43.2, 46.0, 48.7, 16.0, 10.0, 6],
- [770.0, 44.8, 48.0, 51.2, 15.0, 10.5, 6],
- [950.0, 48.3, 51.7, 55.1, 16.2, 11.2, 6],
- [1250.0, 52.0, 56.0, 59.7, 17.9, 11.7, 6],
- [1600.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
- [1550.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
- [1650.0, 59.0, 63.4, 68.0, 15.9, 11.0, 6],
- [5.9, 7.5, 8.4, 8.8, 24.0, 16.0, 7],
- [32.0, 12.5, 13.7, 14.7, 24.0, 13.6, 7],
- [40.0, 13.8, 15.0, 16.0, 23.9, 15.2, 7],
- [51.5, 15.0, 16.2, 17.2, 26.7, 15.3, 7],
- [70.0, 15.7, 17.4, 18.5, 24.8, 15.9, 7],
- [100.0, 16.2, 18.0, 19.2, 27.2, 17.3, 7],
- [78.0, 16.8, 18.7, 19.4, 26.8, 16.1, 7],
- [80.0, 17.2, 19.0, 20.2, 27.9, 15.1, 7],
- [85.0, 17.8, 19.6, 20.8, 24.7, 14.6, 7],
- [85.0, 18.2, 20.0, 21.0, 24.2, 13.2, 7],
- [110.0, 19.0, 21.0, 22.5, 25.3, 15.8, 7],
- [115.0, 19.0, 21.0, 22.5, 26.3, 14.7, 7],
- [125.0, 19.0, 21.0, 22.5, 25.3, 16.3, 7],
- [130.0, 19.3, 21.3, 22.8, 28.0, 15.5, 7],
- [120.0, 20.0, 22.0, 23.5, 26.0, 14.5, 7],
- [120.0, 20.0, 22.0, 23.5, 24.0, 15.0, 7],
- [130.0, 20.0, 22.0, 23.5, 26.0, 15.0, 7],
- [135.0, 20.0, 22.0, 23.5, 25.0, 15.0, 7],
- [110.0, 20.0, 22.0, 23.5, 23.5, 17.0, 7],
- [130.0, 20.5, 22.5, 24.0, 24.4, 15.1, 7],
- [150.0, 20.5, 22.5, 24.0, 28.3, 15.1, 7],
- [145.0, 20.7, 22.7, 24.2, 24.6, 15.0, 7],
- [150.0, 21.0, 23.0, 24.5, 21.3, 14.8, 7],
- [170.0, 21.5, 23.5, 25.0, 25.1, 14.9, 7],
- [225.0, 22.0, 24.0, 25.5, 28.6, 14.6, 7],
- [145.0, 22.0, 24.0, 25.5, 25.0, 15.0, 7],
- [188.0, 22.6, 24.6, 26.2, 25.7, 15.9, 7],
- [180.0, 23.0, 25.0, 26.5, 24.3, 13.9, 7],
- [197.0, 23.5, 25.6, 27.0, 24.3, 15.7, 7],
- [218.0, 25.0, 26.5, 28.0, 25.6, 14.8, 7],
- [300.0, 25.2, 27.3, 28.7, 29.0, 17.9, 7],
- [260.0, 25.4, 27.5, 28.9, 24.8, 15.0, 7],
- [265.0, 25.4, 27.5, 28.9, 24.4, 15.0, 7],
- [250.0, 25.4, 27.5, 28.9, 25.2, 15.8, 7],
- [250.0, 25.9, 28.0, 29.4, 26.6, 14.3, 7],
- [300.0, 26.9, 28.7, 30.1, 25.2, 15.4, 7],
- [320.0, 27.8, 30.0, 31.6, 24.1, 15.1, 7],
- [514.0, 30.5, 32.8, 34.0, 29.5, 17.7, 7],
- [556.0, 32.0, 34.5, 36.5, 28.1, 17.5, 7],
- [840.0, 32.5, 35.0, 37.3, 30.8, 20.9, 7],
- [685.0, 34.0, 36.5, 39.0, 27.9, 17.6, 7],
- [700.0, 34.0, 36.0, 38.3, 27.7, 17.6, 7],
- [700.0, 34.5, 37.0, 39.4, 27.5, 15.9, 7],
- [690.0, 34.6, 37.0, 39.3, 26.9, 16.2, 7],
- [900.0, 36.5, 39.0, 41.4, 26.9, 18.1, 7],
- [650.0, 36.5, 39.0, 41.4, 26.9, 14.5, 7],
- [820.0, 36.6, 39.0, 41.3, 30.1, 17.8, 7],
- [850.0, 36.9, 40.0, 42.3, 28.2, 16.8, 7],
- [900.0, 37.0, 40.0, 42.5, 27.6, 17.0, 7],
- [1015.0, 37.0, 40.0, 42.4, 29.2, 17.6, 7],
- [820.0, 37.1, 40.0, 42.5, 26.2, 15.6, 7],
- [1100.0, 39.0, 42.0, 44.6, 28.7, 15.4, 7],
- [1000.0, 39.8, 43.0, 45.2, 26.4, 16.1, 7],
- [1100.0, 40.1, 43.0, 45.5, 27.5, 16.3, 7],
- [1000.0, 40.2, 43.5, 46.0, 27.4, 17.7, 7],
- [1000.0, 41.1, 44.0, 46.6, 26.8, 16.3, 7]]
class decisionnode:
    """One node of a decision tree.

    col     -- index of the attribute tested at this node (-1 for leaves)
    value   -- value the attribute is compared against
    results -- class-count dict for leaves, None for internal nodes
    tb, fb  -- subtree followed when the test is true / false
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
def sporedi_broj(row, column, value):
    """Numeric split test: True when the row's column reaches the threshold."""
    return row[column] >= value
def sporedi_string(row, column, value):
    """Categorical split test: True when the row's column equals value."""
    return row[column] == value
def divideset(rows, column, value):
    """Partition rows on one column into (true_rows, false_rows).

    Numeric values use a ``>=`` threshold test, anything else an
    equality test.

    Fix: the original built the partition twice — an explicit loop into
    set_true/set_false plus a pair of list comprehensions (set1/set2)
    whose results were discarded.  The redundant second pass is removed.
    """
    if isinstance(value, (int, float)):
        # Numeric attribute: threshold split.
        matches = lambda row: row[column] >= value
    else:
        # Nominal attribute: equality split.
        matches = lambda row: row[column] == value
    set_true = [row for row in rows if matches(row)]
    set_false = [row for row in rows if not matches(row)]
    return (set_true, set_false)
def uniquecounts(rows):
    """Count how many rows fall into each class (class = last column)."""
    results = {}
    for row in rows:
        label = row[-1]
        results[label] = results.get(label, 0) + 1
    return results
def log2(x):
    """Base-2 logarithm of x, used by the entropy computation."""
    from math import log
    return log(x) / log(2)
def entropy(rows):
    """Shannon entropy (in bits) of the class distribution in rows."""
    total = len(rows)
    ent = 0.0
    for count in uniquecounts(rows).values():
        p = float(count) / total
        ent -= p * log2(p)
    return ent
def buildtree(rows, scoref=entropy):
    """Recursively build a decision tree over rows.

    scoref scores the impurity of a row set (entropy by default).  Every
    (column, value) split is tried; the one with the highest information
    gain becomes the node and its two subsets are built recursively.
    When no split improves the score, a leaf holding the class counts of
    rows is returned.
    """
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Track the best split found so far.
    best_gain = 0.0
    best_column = -1
    best_value = None
    best_subsetf = None
    best_subsett = None

    column_count = len(rows[0]) - 1  # last column is the class label
    for col in range(column_count):
        # Collect the distinct values appearing in this column.
        column_values = set()
        for row in rows:
            column_values.add(row[col])
        # Try dividing the rows on each of those values.
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)
            # Information gain of this split; both halves must be non-empty.
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_column = col
                best_value = value
                best_subsett = set1
                best_subsetf = set2

    # Recurse when a useful split exists, otherwise emit a leaf.
    if best_gain > 0:
        trueBranch = buildtree(best_subsett, scoref)
        falseBranch = buildtree(best_subsetf, scoref)
        return decisionnode(col=best_column, value=best_value,
                            tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))
def printtree(tree, indent=''):
    """Pretty-print the tree; leaves show their sorted class counts."""
    # Is this a leaf node?
    if tree.results != None:
        print(indent + str(sorted(tree.results.items())))
    else:
        # Print the split criterion (column index : value).
        print(indent + str(tree.col) + ':' + str(tree.value) + '? ')
        # Print the two branches, indented one step further.
        print(indent + 'T->')
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->')
        printtree(tree.fb, indent + '  ')
def classify(observation, tree):
    """Walk the tree until a leaf is reached and return its class-count dict.

    Numeric attributes follow the true branch when >= the node's value,
    nominal attributes when equal to it.
    """
    node = tree
    while node.results is None:
        attr = observation[node.col]
        if isinstance(attr, (int, float)):
            follow_true = attr >= node.value
        else:
            follow_true = attr == node.value
        node = node.tb if follow_true else node.fb
    return node.results
def new_data_set_fu(data, per_class=5):
    """Return a training subset with the first per_class rows of every class.

    The class label is the last element of each row.

    Fix: the original terminated with ``tmp == classes_data[-1]``, which
    compares a running class *index* against the last class *label* — it
    only worked because the labels happen to be the integers 1..7 — and it
    also assumed the rows are grouped by class.  This version counts per
    label, so it works for any label type and any row order, and returns
    the same subset for the original data.  per_class generalizes the
    hard-coded limit of 5 (default unchanged).
    """
    taken = {}   # label -> how many rows already selected
    subset = []
    for row in data:
        label = row[-1]
        if taken.get(label, 0) < per_class:
            subset.append(row)
            taken[label] = taken.get(label, 0) + 1
    return subset
if __name__ == '__main__':
    # Index (into the full data set) of the element to classify.
    my_index = 5
    # Training subset: the first 5 samples of each class.
    data_set = new_data_set_fu(data)
    t = buildtree(data_set)
    #printtree(t)
    # The row still carries its class label; classify only reads
    # attribute columns, so that is harmless.
    c = classify(data[my_index],t)
    print(c)
- ----------------------------------------------------------------------------------------------------
- Drva ispitna
- """
- Дрва на одлука (100 поени) Problem 2 (0 / 16)
- Да се промени алгоритмот за дрво на одлука така што ќе се изградат две дрва на одлука. Секое од дрвата го користи половина од податочното множество.
- Да се промени начинот на печатење на дрвото така што покрај секој јазол, ќе се испечати и неговото ниво.
- Двете дрва да се испечатат и потоа да се испечати резултатот од класификацијата.
- Доколку некое од дрвата има само една класа тогаш се предвидува истата, но ако има повеќе од една треба да се избере таа со најголем број на инстанци. Ако во листот има неколку класи со ист број на инстанци да се предвиде првата класа по азбучен ред.
- Доколку двете дрва ја предвидат истата класа да се испечати класата. Во спротивно да се испечати KONTRADIKCIJA.
- """
- trainingData = [
- [6.3, 2.9, 5.6, 1.8, 'I. virginica'],
- [6.5, 3.0, 5.8, 2.2, 'I. virginica'],
- [7.6, 3.0, 6.6, 2.1, 'I. virginica'],
- [4.9, 2.5, 4.5, 1.7, 'I. virginica'],
- [7.3, 2.9, 6.3, 1.8, 'I. virginica'],
- [6.7, 2.5, 5.8, 1.8, 'I. virginica'],
- [7.2, 3.6, 6.1, 2.5, 'I. virginica'],
- [6.5, 3.2, 5.1, 2.0, 'I. virginica'],
- [6.4, 2.7, 5.3, 1.9, 'I. virginica'],
- [6.8, 3.0, 5.5, 2.1, 'I. virginica'],
- [5.7, 2.5, 5.0, 2.0, 'I. virginica'],
- [5.8, 2.8, 5.1, 2.4, 'I. virginica'],
- [6.4, 3.2, 5.3, 2.3, 'I. virginica'],
- [6.5, 3.0, 5.5, 1.8, 'I. virginica'],
- [7.7, 3.8, 6.7, 2.2, 'I. virginica'],
- [7.7, 2.6, 6.9, 2.3, 'I. virginica'],
- [6.0, 2.2, 5.0, 1.5, 'I. virginica'],
- [6.9, 3.2, 5.7, 2.3, 'I. virginica'],
- [5.6, 2.8, 4.9, 2.0, 'I. virginica'],
- [7.7, 2.8, 6.7, 2.0, 'I. virginica'],
- [6.3, 2.7, 4.9, 1.8, 'I. virginica'],
- [6.7, 3.3, 5.7, 2.1, 'I. virginica'],
- [7.2, 3.2, 6.0, 1.8, 'I. virginica'],
- [6.2, 2.8, 4.8, 1.8, 'I. virginica'],
- [6.1, 3.0, 4.9, 1.8, 'I. virginica'],
- [6.4, 2.8, 5.6, 2.1, 'I. virginica'],
- [7.2, 3.0, 5.8, 1.6, 'I. virginica'],
- [7.4, 2.8, 6.1, 1.9, 'I. virginica'],
- [7.9, 3.8, 6.4, 2.0, 'I. virginica'],
- [6.4, 2.8, 5.6, 2.2, 'I. virginica'],
- [6.3, 2.8, 5.1, 1.5, 'I. virginica'],
- [6.1, 2.6, 5.6, 1.4, 'I. virginica'],
- [7.7, 3.0, 6.1, 2.3, 'I. virginica'],
- [6.3, 3.4, 5.6, 2.4, 'I. virginica'],
- [5.1, 3.5, 1.4, 0.2, 'I. setosa'],
- [4.9, 3.0, 1.4, 0.2, 'I. setosa'],
- [4.7, 3.2, 1.3, 0.2, 'I. setosa'],
- [4.6, 3.1, 1.5, 0.2, 'I. setosa'],
- [5.0, 3.6, 1.4, 0.2, 'I. setosa'],
- [5.4, 3.9, 1.7, 0.4, 'I. setosa'],
- [4.6, 3.4, 1.4, 0.3, 'I. setosa'],
- [5.0, 3.4, 1.5, 0.2, 'I. setosa'],
- [4.4, 2.9, 1.4, 0.2, 'I. setosa'],
- [4.9, 3.1, 1.5, 0.1, 'I. setosa'],
- [5.4, 3.7, 1.5, 0.2, 'I. setosa'],
- [4.8, 3.4, 1.6, 0.2, 'I. setosa'],
- [4.8, 3.0, 1.4, 0.1, 'I. setosa'],
- [4.3, 3.0, 1.1, 0.1, 'I. setosa'],
- [5.8, 4.0, 1.2, 0.2, 'I. setosa'],
- [5.7, 4.4, 1.5, 0.4, 'I. setosa'],
- [5.4, 3.9, 1.3, 0.4, 'I. setosa'],
- [5.1, 3.5, 1.4, 0.3, 'I. setosa'],
- [5.7, 3.8, 1.7, 0.3, 'I. setosa'],
- [5.1, 3.8, 1.5, 0.3, 'I. setosa'],
- [5.4, 3.4, 1.7, 0.2, 'I. setosa'],
- [5.1, 3.7, 1.5, 0.4, 'I. setosa'],
- [4.6, 3.6, 1.0, 0.2, 'I. setosa'],
- [5.1, 3.3, 1.7, 0.5, 'I. setosa'],
- [4.8, 3.4, 1.9, 0.2, 'I. setosa'],
- [5.0, 3.0, 1.6, 0.2, 'I. setosa'],
- [5.0, 3.4, 1.6, 0.4, 'I. setosa'],
- [5.2, 3.5, 1.5, 0.2, 'I. setosa'],
- [5.2, 3.4, 1.4, 0.2, 'I. setosa'],
- [5.5, 2.3, 4.0, 1.3, 'I. versicolor'],
- [6.5, 2.8, 4.6, 1.5, 'I. versicolor'],
- [5.7, 2.8, 4.5, 1.3, 'I. versicolor'],
- [6.3, 3.3, 4.7, 1.6, 'I. versicolor'],
- [4.9, 2.4, 3.3, 1.0, 'I. versicolor'],
- [6.6, 2.9, 4.6, 1.3, 'I. versicolor'],
- [5.2, 2.7, 3.9, 1.4, 'I. versicolor'],
- [5.0, 2.0, 3.5, 1.0, 'I. versicolor'],
- [5.9, 3.0, 4.2, 1.5, 'I. versicolor'],
- [6.0, 2.2, 4.0, 1.0, 'I. versicolor'],
- [6.1, 2.9, 4.7, 1.4, 'I. versicolor'],
- [5.6, 2.9, 3.6, 1.3, 'I. versicolor'],
- [6.7, 3.1, 4.4, 1.4, 'I. versicolor'],
- [5.6, 3.0, 4.5, 1.5, 'I. versicolor'],
- [5.8, 2.7, 4.1, 1.0, 'I. versicolor'],
- [6.2, 2.2, 4.5, 1.5, 'I. versicolor'],
- [5.6, 2.5, 3.9, 1.1, 'I. versicolor'],
- [5.9, 3.2, 4.8, 1.8, 'I. versicolor'],
- [6.1, 2.8, 4.0, 1.3, 'I. versicolor'],
- [6.3, 2.5, 4.9, 1.5, 'I. versicolor'],
- [6.1, 2.8, 4.7, 1.2, 'I. versicolor'],
- [6.4, 2.9, 4.3, 1.3, 'I. versicolor'],
- [6.6, 3.0, 4.4, 1.4, 'I. versicolor'],
- [6.8, 2.8, 4.8, 1.4, 'I. versicolor'],
- [6.7, 3.0, 5.0, 1.7, 'I. versicolor'],
- [6.0, 2.9, 4.5, 1.5, 'I. versicolor'],
- [5.7, 2.6, 3.5, 1.0, 'I. versicolor'],
- [5.5, 2.4, 3.8, 1.1, 'I. versicolor'],
- [5.5, 2.4, 3.7, 1.0, 'I. versicolor'],
- [5.8, 2.7, 3.9, 1.2, 'I. versicolor'],
- [6.0, 2.7, 5.1, 1.6, 'I. versicolor'],
- [5.4, 3.0, 4.5, 1.5, 'I. versicolor'],
- [6.0, 3.4, 4.5, 1.6, 'I. versicolor'],
- [6.7, 3.1, 4.7, 1.5, 'I. versicolor'],
- [6.3, 2.3, 4.4, 1.3, 'I. versicolor'],
- [5.6, 3.0, 4.1, 1.3, 'I. versicolor'],
- [5.5, 2.5, 4.0, 1.3, 'I. versicolor'],
- [5.5, 2.6, 4.4, 1.2, 'I. versicolor'],
- [6.1, 3.0, 4.6, 1.4, 'I. versicolor'],
- [5.8, 2.6, 4.0, 1.2, 'I. versicolor'],
- [5.0, 2.3, 3.3, 1.0, 'I. versicolor'],
- [5.6, 2.7, 4.2, 1.3, 'I. versicolor'],
- [5.7, 3.0, 4.2, 1.2, 'I. versicolor'],
- [5.7, 2.9, 4.2, 1.3, 'I. versicolor'],
- [6.2, 2.9, 4.3, 1.3, 'I. versicolor'],
- [5.1, 2.5, 3.0, 1.1, 'I. versicolor'],
- [5.7, 2.8, 4.1, 1.3, 'I. versicolor'],
- [6.4, 3.1, 5.5, 1.8, 'I. virginica'],
- [6.0, 3.0, 4.8, 1.8, 'I. virginica'],
- [6.9, 3.1, 5.4, 2.1, 'I. virginica'],
- [6.7, 3.1, 5.6, 2.4, 'I. virginica'],
- [6.9, 3.1, 5.1, 2.3, 'I. virginica'],
- [5.8, 2.7, 5.1, 1.9, 'I. virginica'],
- [6.8, 3.2, 5.9, 2.3, 'I. virginica'],
- [6.7, 3.3, 5.7, 2.5, 'I. virginica'],
- [6.7, 3.0, 5.2, 2.3, 'I. virginica'],
- [6.3, 2.5, 5.0, 1.9, 'I. virginica'],
- [6.5, 3.0, 5.2, 2.0, 'I. virginica'],
- [6.2, 3.4, 5.4, 2.3, 'I. virginica'],
- [4.7, 3.2, 1.6, 0.2, 'I. setosa'],
- [4.8, 3.1, 1.6, 0.2, 'I. setosa'],
- [5.4, 3.4, 1.5, 0.4, 'I. setosa'],
- [5.2, 4.1, 1.5, 0.1, 'I. setosa'],
- [5.5, 4.2, 1.4, 0.2, 'I. setosa'],
- [4.9, 3.1, 1.5, 0.2, 'I. setosa'],
- [5.0, 3.2, 1.2, 0.2, 'I. setosa'],
- [5.5, 3.5, 1.3, 0.2, 'I. setosa'],
- [4.9, 3.6, 1.4, 0.1, 'I. setosa'],
- [4.4, 3.0, 1.3, 0.2, 'I. setosa'],
- [5.1, 3.4, 1.5, 0.2, 'I. setosa'],
- [5.0, 3.5, 1.3, 0.3, 'I. setosa'],
- [4.5, 2.3, 1.3, 0.3, 'I. setosa'],
- [4.4, 3.2, 1.3, 0.2, 'I. setosa'],
- [5.0, 3.5, 1.6, 0.6, 'I. setosa'],
- [5.1, 3.8, 1.9, 0.4, 'I. setosa'],
- [4.8, 3.0, 1.4, 0.3, 'I. setosa'],
- [5.1, 3.8, 1.6, 0.2, 'I. setosa'],
- [5.9, 3.0, 5.1, 1.8, 'I. virginica']
- ]
class decisionnode:
    """Decision-tree node that also records its depth in the tree.

    col     -- index of the attribute tested at this node (-1 for leaves)
    value   -- value the attribute is compared against
    results -- class-count dict for leaves, None for internal nodes
    tb, fb  -- subtree followed when the test is true / false
    l       -- level of the node; the root is at level 0 (see buildtree)
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None, l=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
        self.l = l
def sporedi_broj(row, column, value):
    """Numeric split test: True when the row's column reaches the threshold."""
    return row[column] >= value
def sporedi_string(row, column, value):
    """Categorical split test: True when the row's column equals value."""
    return row[column] == value
def divideset(rows, column, value):
    """Partition rows on one column into (true_rows, false_rows).

    Numeric values use a ``>=`` threshold test, anything else an
    equality test.

    Fix: the original built the partition twice — an explicit loop into
    set_true/set_false plus a pair of list comprehensions (set1/set2)
    whose results were discarded.  The redundant second pass is removed.
    """
    if isinstance(value, (int, float)):
        # Numeric attribute: threshold split.
        matches = lambda row: row[column] >= value
    else:
        # Nominal attribute: equality split.
        matches = lambda row: row[column] == value
    set_true = [row for row in rows if matches(row)]
    set_false = [row for row in rows if not matches(row)]
    return (set_true, set_false)
- #st, sf = divideset(my_data, 3, 20)
- #print(sf)
- #print(st)
def uniquecounts(rows):
    """Count how many rows fall into each class (class = last column)."""
    counts = {}
    for row in rows:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    return counts
- #print(uniquecounts(my_data))
- #print(uniquecounts(st))
- #print(uniquecounts(sf))
def log2(x):
    """Base-2 logarithm of x, used by the entropy computation."""
    from math import log
    return log(x) / log(2)
def entropy(rows):
    """Shannon entropy (in bits) of the class distribution in rows."""
    total = len(rows)
    ent = 0.0
    for count in uniquecounts(rows).values():
        p = float(count) / total
        ent -= p * log2(p)
    return ent
- #print(entropy(my_data), entropy(st), entropy(sf))
def buildtree(rows, l=-1, scoref=entropy):
    """Recursively build a decision tree, recording each node's level.

    l is the level of the *parent* node; it is incremented before the
    node is created, so the root (built with the default l=-1) ends up
    at level 0.  Leaf nodes are created without a level (their l stays
    None).  scoref scores the impurity of a row set (entropy by default).
    """
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Track the best split found so far.
    best_gain = 0.0
    best_column = -1
    best_value = None
    best_subsetf = None
    best_subsett = None

    column_count = len(rows[0]) - 1  # last column is the class label
    for col in range(column_count):
        # Collect the distinct values appearing in this column.
        column_values = set()
        for row in rows:
            column_values.add(row[col])
        # Try dividing the rows on each of those values.
        for value in column_values:
            (set1, set2) = divideset(rows, col, value)
            # Information gain of this split; both halves must be non-empty.
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_column = col
                best_value = value
                best_subsett = set1
                best_subsetf = set2

    # Recurse when a useful split exists, otherwise emit a leaf.
    if best_gain > 0:
        l = l + 1  # this node sits one level below its parent
        trueBranch = buildtree(best_subsett, l, scoref)
        falseBranch = buildtree(best_subsetf, l, scoref)
        return decisionnode(col=best_column, value=best_value,
                            tb=trueBranch, fb=falseBranch, l=l)
    else:
        return decisionnode(results=uniquecounts(rows))
def printtree(tree, indent=''):
    """Pretty-print the tree; internal nodes also show their level (tree.l).

    Fix: the original mixed Python 2 print statements (`print str(...)`
    and trailing-comma prints) into a file that otherwise uses the
    Python 3 print() function — a SyntaxError under Python 3.  All
    output now goes through print().
    """
    if tree.results is not None:
        # Leaf: sorted class counts.
        print(indent + str(sorted(tree.results.items())))
    else:
        # Internal node: split criterion plus its level in the tree.
        print(str(tree.col) + ':' + str(tree.value) + '? ' + 'Level=' + str(tree.l))
        print(indent + 'T->')
        printtree(tree.tb, indent + '  ')
        print(indent + 'F->')
        printtree(tree.fb, indent + '  ')
def classify(observation, tree):
    """Descend the tree and return the reached leaf's class-count dict.

    Numeric attributes follow the true branch when >= the node's value,
    nominal attributes when equal to it.  Majority selection among the
    leaf's classes is done by the caller.

    Cleanup: removed the unused `maxi` local and the block of
    commented-out majority-vote code; behavior is unchanged.
    """
    if tree.results is not None:
        return tree.results
    vrednost = observation[tree.col]
    if isinstance(vrednost, (int, float)):
        branch = tree.tb if vrednost >= tree.value else tree.fb
    else:
        branch = tree.tb if vrednost == tree.value else tree.fb
    return classify(observation, branch)
if __name__ == '__main__':
    # Element to classify (attributes + class label; classify ignores the label).
    arg1 = 1
    arg2 = 2.2
    arg3 = 4.0
    arg4 = 1.1
    cl = 'I. virginica'
    tmp = [arg1, arg2, arg3, arg4, cl]

    # Split the training data into two halves, one tree per half.
    # Fix: len(...)/2 is a float in Python 3; use integer division.
    half = len(trainingData) // 2
    p1 = trainingData[:half]
    p2 = trainingData[half:]

    d1 = buildtree(p1)
    d2 = buildtree(p2)
    print('DRVO 1')
    printtree(d1)
    print('DRVO 2')
    printtree(d2)

    k1 = classify(tmp, d1)
    k2 = classify(tmp, d2)
    print(k1)
    print(k2)

    def majority(results):
        # Class with the most instances; alphabetically first on ties.
        top = max(results.values())
        return min(label for label, cnt in results.items() if cnt == top)

    # Fix: the original used Python 2 print statements, dict.keys()[0]
    # (dict views are not subscriptable in Python 3), and printed both a
    # class name and KONTRADIKCIJA on disagreement.  Per the task: print
    # the class when both trees agree, otherwise KONTRADIKCIJA.
    pred1 = majority(k1)
    pred2 = majority(k2)
    if pred1 == pred2:
        print(pred1)
    else:
        print('KONTRADIKCIJA')
- ------------------------------------------------------------------------------------------------------
- ------------------------------------------------------------------------------------------------------
- ### Klasifikacija ###
- """Задача 1 Problem 1 (2 / 3)
- Дадено е тренинг множество од неколку документи. Притоа се знае секој документ од која класа е
- (science или sport). Mножеството е претставено како листи од торки, така што во секоја торка
- прв елемент е текстот на документот како стринг, а втор елемент е класата како стринг.
- Да се истренира модел врз основа на тренинг множеството и потоа за секој документ
- прочитан од стандарден влез да се испечати неговата класа.
- """
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import re
- import math
- train_data = [
- ("""What Are We Searching for on Mars?
- Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
- Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
- """,
- "science"),
- ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
- If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
- Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
- This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
- """,
- "science"),
- ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
- Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
- "With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
- """,
- "science"),
- ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
- Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
- Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
- A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
- """,
- "science"),
- ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
- At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
- """,
- "science"),
- ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
- The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
- Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
- """,
- "science"),
- ("""Envisioning a River of Air
- By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
- """,
- "science"),
- ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
- """
- , "sport"),
- ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
- They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
- """
- , "sport"),
- ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
- """
- , "sport"),
- ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
- """
- , "sport")
- , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
- """
- , "sport"),
- ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
- """
- , "sport"),
- ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
- So far, that's half true.
- Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
- ''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
- """
- , "sport"),
- ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
- """
- , "sport"),
- ]
def getwords(doc):
    """Tokenize doc into lowercase words of length 3..19, as {word: 1}.

    Fix: the original pattern '\\W*' can match the empty string, so on
    Python 3.7+ re.split splits between every character, producing
    single-letter tokens that the length filter then discards — every
    document ended up with an empty feature dict.  Splitting on one or
    more non-word characters (r'\\W+') yields the intended words.
    """
    splitter = re.compile(r'\W+')
    words = [word.lower() for word in splitter.split(doc) if 2 < len(word) < 20]
    return dict.fromkeys(words, 1)
class documentClassifier:
    """Base classifier tracking feature counts per category.

    featureCountsPerCategory: {feature: {category: count}}
    categoryCounts:           {category: number of documents trained}
    getfeatures:              callable mapping a document to a feature dict
    """

    def __init__(self, getfeatures, filename=None):
        # filename is accepted for API compatibility but unused here.
        self.featureCountsPerCategory = {}
        self.categoryCounts = {}
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Increase the count of currentFeature within currentCategory."""
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        """Increase the number of documents seen for category cat."""
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """How often currentFeature appeared in currentCategory (0.0 if never)."""
        if currentFeature in self.featureCountsPerCategory \
                and currentCategory in self.featureCountsPerCategory[currentFeature]:
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        """Number of documents trained for currentCategory.

        Fix: returns 0.0 (not int 0) for unknown categories so the
        return type is consistently float.
        """
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        return 0.0

    def getTotal(self):
        """Total number of trained documents across all categories."""
        return sum(self.categoryCounts.values())

    def categories(self):
        """All category names seen so far."""
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        """Extract features from item and record them under currentCategory."""
        features = self.getfeatures(item)
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        """P(feature | category) estimated from the raw counts."""
        if self.getCategoryCount(currentCategory) == 0:
            return 0
        return self.getFeatureCountsPerCategory(currentFeature, currentCategory) \
            / self.getCategoryCount(currentCategory)

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        """Smoothed probability: blend prf(feature, category) with prior ap.

        Fix: the comprehension variable used to shadow the
        currentCategory parameter; it is renamed to cat for clarity
        (behavior was already correct because comprehensions scope
        their loop variable).
        """
        basicprob = prf(currentFeature, currentCategory)
        # Total occurrences of this feature across every category.
        totals = sum(self.getFeatureCountsPerCategory(currentFeature, cat)
                     for cat in self.categories())
        return ((weight * ap) + (totals * basicprob)) / (weight + totals)
class naivebayes(documentClassifier):
    """Naive Bayes document classifier with per-class decision thresholds."""

    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        # thresholds[category] -> how much more likely the winning class must
        # be than any rival before we commit to it.
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        """Set the decision threshold for *currentCategory*."""
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        """Decision threshold for *currentCategory* (defaults to 1.0)."""
        if currentCategory not in self.thresholds:
            return 1.0
        return self.thresholds[currentCategory]

    def calculateDocumentProbabilityInClass(self, item, currentCategory):
        """P(document | class): product of the smoothed word probabilities."""
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)
        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        """Unnormalised P(class | document) = P(doc|class) * P(class) * total."""
        catprob = self.getCategoryCount(currentCategory) / self.getTotal()
        docprob = self.calculateDocumentProbabilityInClass(item, currentCategory)
        return docprob * catprob / (1.0 / self.getTotal())

    def classifyDocument(self, item, default=None):
        """Return the most probable class for *item*, or *default*.

        *default* is returned when no rival class is beaten by at least the
        winner's threshold, or when every class probability is zero.
        """
        probs = {}
        # The original left `best` unbound when all probabilities were 0.0,
        # raising NameError below; it also shadowed the builtin max().
        best = None
        topprob = 0.0
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > topprob:
                topprob = probs[cat]
                best = cat
        if best is None:
            return default
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getThreshold(best) > probs[best]:
                return default
        return best
def trainClassifier(cl, data):
    """Train classifier *cl* on every (document, category) pair in *data*.

    The original ignored *data* entirely and trained on five hard-coded
    sentences, two of which carried the wrong label (e.g. a football
    sentence tagged "science").  Train on the supplied training set
    instead, consistent with the other solutions in this file.
    """
    for document, category in data:
        cl.train(document, category)
if __name__ == "__main__":
    # Build a naive-Bayes classifier over bag-of-words features.
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    # Read one document from standard input.
    # NOTE(review): Python 2 input() eval()s the line, so the document must
    # be entered as a quoted string literal — confirm this is intended.
    recenica = input()
    # klasa = 'bad'
    # print klasa
    print cl.classifyDocument(recenica)
- -----------------------------------------------------------------------------------------------------
- Klasifikacija lab2
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import re
- import math
- train_data = [
- ("""What Are We Searching for on Mars?
- Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
- Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
- """,
- "science"),
- ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
- If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
- Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
- This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
- """,
- "science"),
- ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
- Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
- "With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
- """,
- "science"),
- ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
- Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
- Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
- A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
- """,
- "science"),
- ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
- At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
- """,
- "science"),
- ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
- The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
- Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
- """,
- "science"),
- ("""Envisioning a River of Air
- By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
- """,
- "science"),
- ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
- """
- , "sport"),
- ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
- They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
- """
- , "sport"),
- ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
- """
- , "sport"),
- ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
- """
- , "sport")
- , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
- """
- , "sport"),
- ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
- """
- , "sport"),
- ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
- So far, that's half true.
- Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
- ''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
- """
- , "sport"),
- ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
- """
- , "sport"),
- ]
def getwords(doc):
    """Extract the unique words of *doc* as a {word: 1} dict.

    Words are lower-cased; only words of 3..19 characters are kept.
    """
    # \W+ (one or more non-word characters) is the correct separator.
    # The original \W* also matches the empty string, which makes re.split
    # break the text between every pair of characters on modern Python,
    # so every "word" is a single letter and gets filtered out.
    splitter = re.compile(r'\W+')
    words = [word.lower() for word in splitter.split(doc)
             if len(word) > 2 and len(word) < 20]
    return dict([(word, 1) for word in words])
class documentClassifier:
    """Base classifier keeping per-category feature and document counts."""

    def __init__(self, getfeatures, filename=None):
        # featureCountsPerCategory[feature][category] -> number of documents
        # of `category` in which `feature` appeared.
        self.featureCountsPerCategory = {}
        # categoryCounts[category] -> number of training documents per class.
        self.categoryCounts = {}
        # Callable mapping a document to a {feature: 1} dict.
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Increase the count of *currentFeature* within *currentCategory*."""
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        """Increase the number of documents seen for class *cat*."""
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """How often *currentFeature* occurred in *currentCategory* (0.0 if never)."""
        if (currentFeature in self.featureCountsPerCategory
                and currentCategory in self.featureCountsPerCategory[currentFeature]):
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        """Number of training documents of *currentCategory*, as a float."""
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        # Return a float here as well, consistent with the branch above
        # (the original returned the int 0 on this path).
        return 0.0

    def getTotal(self):
        """Total number of training documents across all categories."""
        return sum(self.categoryCounts.values())

    def categories(self):
        """All category labels seen so far."""
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        """Record one training document *item* labelled *currentCategory*."""
        features = self.getfeatures(item)
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        """P(feature | category) as a plain relative frequency."""
        if self.getCategoryCount(currentCategory) == 0:
            return 0
        return (self.getFeatureCountsPerCategory(currentFeature, currentCategory)
                / self.getCategoryCount(currentCategory))

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        """Smoothed probability blending prf() with the assumed prior *ap*.

        *weight* controls how strongly the prior counts relative to the
        observed evidence (Laplace-style smoothing).
        """
        basicprob = prf(currentFeature, currentCategory)
        # Use a distinct loop variable: the original comprehension reused
        # `currentCategory` and (in Python 2) clobbered the parameter.
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, c)
                      for c in self.categories()])
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp
class naivebayes(documentClassifier):
    """Naive Bayes document classifier with per-class decision thresholds."""

    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        # thresholds[category] -> how much more likely the winning class must
        # be than any rival before we commit to it.
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        """Set the decision threshold for *currentCategory*."""
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        """Decision threshold for *currentCategory* (defaults to 1.0)."""
        if currentCategory not in self.thresholds:
            return 1.0
        return self.thresholds[currentCategory]

    def calculateDocumentProbabilityInClass(self, item, currentCategory):
        """P(document | class): product of the smoothed word probabilities."""
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)
        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        """Unnormalised P(class | document) = P(doc|class) * P(class) * total."""
        catprob = self.getCategoryCount(currentCategory) / self.getTotal()
        docprob = self.calculateDocumentProbabilityInClass(item, currentCategory)
        return docprob * catprob / (1.0 / self.getTotal())

    def classifyDocument(self, item, default=None):
        """Return the most probable class for *item*, or *default*.

        *default* is returned when no rival class is beaten by at least the
        winner's threshold, or when every class probability is zero.
        """
        probs = {}
        # The original left `best` unbound when all probabilities were 0.0,
        # raising NameError below; it also shadowed the builtin max().
        best = None
        topprob = 0.0
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > topprob:
                topprob = probs[cat]
                best = cat
        if best is None:
            return default
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getThreshold(best) > probs[best]:
                return default
        return best
def trainClassifier(cl, data):
    """Feed every (document, label) pair from *data* into classifier *cl*."""
    for document, label in data:
        cl.train(document, label)
if __name__ == "__main__":
    # Train a naive-Bayes classifier on the predefined train_data set.
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    # NOTE(review): Python 2 input() eval()s the line, so the document must
    # be entered as a quoted string literal — confirm this is intended.
    recenica = input()
    klasa = cl.classifyDocument(recenica)
    # Unnormalised probability of the predicted class for this document.
    verojatnost=cl.getCategoryProbabilityForDocument(recenica,klasa)
    # print klasa
    print klasa, '%.8f'% verojatnost
- -----------------------------------------------------------------------------------------------------
- Klasifikacija 3
- """
- За секоја прочитан документ од стандарден влез да се испечатат зборовите кои се употребуваат
- за класификација, класата и веројатноста на зборот да е од таа класа (заокружено на 4 децимали),
- како и тежинската веројатност на зборот да припаѓа на класата (заокружено на 4 децимали).
- На крај да се испечати предвидената класа на документот и логаритам со основа 2 од веројатноста
- со која се предвидува (заокружено на 4 децимали).
- """
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import re
- import math
- train_data = [
- ("""What Are We Searching for on Mars?
- Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
- Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
- """,
- "science"),
- ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
- If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
- Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
- This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
- """,
- "science"),
- ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
- Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
- "With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
- """,
- "science"),
- ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
- Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
- Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
- A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
- """,
- "science"),
- ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
- At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
- """,
- "science"),
- ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
- The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
- Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
- """,
- "science"),
- ("""Envisioning a River of Air
- By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
- """,
- "science"),
- ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
- """
- , "sport"),
- ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
- They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
- """
- , "sport"),
- ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
- """
- , "sport"),
- ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
- """
- , "sport")
- , ("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
- """
- , "sport"),
- ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
- """
- , "sport"),
- ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
- So far, that's half true.
- Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
- ''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
- """
- , "sport"),
- ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
- """
- , "sport"),
- ]
def getwords(doc):
    """Extract the unique words of *doc* as a {word: 1} dict.

    Words are lower-cased; only words of 3..19 characters are kept.
    """
    # \W+ (one or more non-word characters) is the correct separator.
    # The original \W* also matches the empty string, which makes re.split
    # break the text between every pair of characters on modern Python,
    # so every "word" is a single letter and gets filtered out.
    splitter = re.compile(r'\W+')
    words = [word.lower() for word in splitter.split(doc)
             if len(word) > 2 and len(word) < 20]
    return dict([(word, 1) for word in words])
class documentClassifier:
    """Base classifier keeping per-category feature and document counts."""

    def __init__(self, getfeatures, filename=None):
        # featureCountsPerCategory[feature][category] -> number of documents
        # of `category` in which `feature` appeared.
        self.featureCountsPerCategory = {}
        # categoryCounts[category] -> number of training documents per class.
        self.categoryCounts = {}
        # Callable mapping a document to a {feature: 1} dict.
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Increase the count of *currentFeature* within *currentCategory*."""
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        """Increase the number of documents seen for class *cat*."""
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """How often *currentFeature* occurred in *currentCategory* (0.0 if never)."""
        if (currentFeature in self.featureCountsPerCategory
                and currentCategory in self.featureCountsPerCategory[currentFeature]):
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        """Number of training documents of *currentCategory*, as a float."""
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        # Return a float here as well, consistent with the branch above
        # (the original returned the int 0 on this path).
        return 0.0

    def getTotal(self):
        """Total number of training documents across all categories."""
        return sum(self.categoryCounts.values())

    def categories(self):
        """All category labels seen so far."""
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        """Record one training document *item* labelled *currentCategory*."""
        features = self.getfeatures(item)
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        """P(feature | category) as a plain relative frequency."""
        if self.getCategoryCount(currentCategory) == 0:
            return 0
        return (self.getFeatureCountsPerCategory(currentFeature, currentCategory)
                / self.getCategoryCount(currentCategory))

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        """Smoothed probability blending prf() with the assumed prior *ap*.

        *weight* controls how strongly the prior counts relative to the
        observed evidence (Laplace-style smoothing).
        """
        basicprob = prf(currentFeature, currentCategory)
        # Use a distinct loop variable: the original comprehension reused
        # `currentCategory` and (in Python 2) clobbered the parameter.
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, c)
                      for c in self.categories()])
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp
class naivebayes(documentClassifier):
    """Naive Bayes document classifier with per-class decision thresholds."""

    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        # thresholds[category] -> how much more likely the winning class must
        # be than any rival before we commit to it.
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        """Set the decision threshold for *currentCategory*."""
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        """Decision threshold for *currentCategory* (defaults to 1.0)."""
        if currentCategory not in self.thresholds:
            return 1.0
        return self.thresholds[currentCategory]

    def calculateDocumentProbabilityInClass(self, item, currentCategory):
        """P(document | class): product of the smoothed word probabilities."""
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)
        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        """Unnormalised P(class | document) = P(doc|class) * P(class) * total."""
        catprob = self.getCategoryCount(currentCategory) / self.getTotal()
        docprob = self.calculateDocumentProbabilityInClass(item, currentCategory)
        return docprob * catprob / (1.0 / self.getTotal())

    def classifyDocument(self, item, default=None):
        """Return the most probable class for *item*, or *default*.

        *default* is returned when no rival class is beaten by at least the
        winner's threshold, or when every class probability is zero.
        """
        probs = {}
        # The original left `best` unbound when all probabilities were 0.0,
        # raising NameError below; it also shadowed the builtin max().
        best = None
        topprob = 0.0
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > topprob:
                topprob = probs[cat]
                best = cat
        if best is None:
            return default
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getThreshold(best) > probs[best]:
                return default
        return best
def trainClassifier(cl, data):
    """Feed every (document, label) pair from *data* into classifier *cl*."""
    for document, label in data:
        cl.train(document, label)
if __name__ == "__main__":
    # Train a naive-Bayes classifier on the predefined train_data set.
    cl = naivebayes(getwords)
    trainClassifier(cl, train_data)
    # NOTE(review): Python 2 input() eval()s the line, so the document must
    # be entered as a quoted string literal — confirm this is intended.
    recenica = input()
    # Require 'science' to beat rivals by at least this factor.
    cl.setThreshold('science',1)
    klasa=cl.classifyDocument(recenica)
    pom=cl.getCategoryProbabilityForDocument(recenica,klasa)
    verojatnost=0
    # log base 2 of the predicted-class probability, rounded to 4 decimals.
    verojatnost=round(math.log(pom)/math.log(2),4)
    zborovi =getwords(recenica)
    kategorii = cl.categories()
    # For every word used in classification print, per class, its plain
    # relative frequency and its smoothed (weighted) probability.
    for zbor in zborovi:
        for kategorija in kategorii:
            verojatnostNaZbor = round(cl.getFeaturePerCategoryProbability(zbor,kategorija),4)
            verojatnostNaZborTezinska = round(cl.weightedprob(zbor,kategorija,cl.getFeaturePerCategoryProbability),4)
            print zbor, kategorija, verojatnostNaZbor, verojatnostNaZborTezinska
    #klasa = 'bad'
    #verojatnost = 0
    print klasa, verojatnost
- -----------------------------------------------------------------------------------------------------
- Klasifikacija ispit januari-courses
- Заради потребата на софистицирана класификација на документи, веќе е имплементирана и достапна во почетниот
- код функцијата getwords_with_ignore која ги дава уникатните зборови од еден документ така што зборовите
- кои се веќе во интерната променлива words_to_ingore се игнорираат. Значи секој збор во words_to_ingore не
- фигурира во речникот со уникатни зборови кој се добива како резултат на getwords_with_ignore.
- Множеството на податоци train_data е предефинирано. Притоа се знае секој документ од која класа е
- (science или sport). Множеството е претставено како листи од торки, така што во секоја торка прв елемент е
- текстот на документот како стринг, а втор елемент е класата како стринг. Да се истренира класификатор со
- користење на стандардната getwords (од аудиториските вежби) врз основа на тренинг множеството. Исто така
- да се направат потребните промени за да се истренира и втор класификатор кој ќе го употребува истото
- тренинг множество, но притоа ќе ја употребува новата функција која е веќе имплементирана
- getwords_with_ignore.
- Потоа за секој документ прочитан од стандарден влез да се испечатат 2 реда. Првиот ред ја содржи
- предвидената класа со стандардниот класификатор и логаритам со основа 2 од веројатноста со која се
- предвидува (заокружено на 4 децимали), а вториот ред предвидената класа со помош на вториот класификатор и
- логаритам со основа 2 од веројатноста со која се предвидува (заокружено на 4 децимали). Да се испечати
- колку пати втората веројатност е поголема од првата заокружено на 4 децимали. Ако предвидувањето на двата
- класификатори е различно да се испечати уште еден ред со зборот “kontradikcija”.
- Vlez:
- """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded
- that their one male and two female northern white rhinos will not reproduce naturally.
- The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in
- hopes that the natural environment could be easier for them to breed there than in captivity."""
- Izlez:
- science -51.5029
- science -46.0544
- 43.6706
- Vlez 2:
- """HONOLULU (AP) — Lava from a volcano on Hawaii's Big Island is on course to reach a shopping center
- with a gas station and a supermarket in seven to 10 days, officials said Monday."""
- Izlez 2:
- sport -21.1937
- science -17.6781
- 11.4370
- kontradikcija
- ====*===*====*============*****============***=============**=============***
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import re
- import math
- train_data=[
- ("""What Are We Searching for on Mars?
- Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
- Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
- """,
- "science"),
- ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
- If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
- Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
- This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
- """,
- "science"),
- ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
- Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
- "With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
- """,
- "science"),
- ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
- Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
- Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
- A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
- """,
- "science"),
- ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
- At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
- """,
- "science"),
- ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
- The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
- Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
- """,
- "science"),
- ("""Envisioning a River of Air
- By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
- """,
- "science"),
- ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
- """
- ,"sport"),
- ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
- They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
- """
- ,"sport"),
- ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
- """
- ,"sport"),
- ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
- """
- ,"sport")
- ,("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
- """
- ,"sport"),
- ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
- """
- ,"sport"),
- ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
- So far, that's half true.
- Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
- ''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
- """
- ,"sport"),
- ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
- """
- ,"sport"),
- ]
def getwords(doc, words_to_ignore=None):
    """Return a dict mapping each unique lower-cased word of *doc* to 1.

    Only words with 2 < len(word) < 20 are kept.  Words contained in the
    optional *words_to_ignore* collection are skipped (case-insensitively).
    The dict values are irrelevant; only the keys (unique words) are used.
    """
    # BUGFIX: use \W+ (one or more separators).  The original pattern \W*
    # also matches the empty string; since Python 3.7 re.split() honours
    # empty matches and would split the text into single characters,
    # so every token would be filtered out by the length check.
    splitter = re.compile(r'\W+')
    words = [word.lower() for word in splitter.split(doc)
             if 2 < len(word) < 20
             and (words_to_ignore is None or word.lower() not in words_to_ignore)]
    return dict((word, 1) for word in words)
def getwords_with_ignore(doc,
                         words_to_ignore=('and', 'are', 'for', 'was', 'what',
                                          'when', 'who', 'but', 'from',
                                          'after', 'out', 'our', 'my', 'the',
                                          'with', 'some', 'not', 'this',
                                          'that')):
    """getwords() variant that skips a built-in English stop-word list.

    The default is a tuple (not a list) so the default argument is
    immutable — avoids the shared-mutable-default pitfall.
    """
    return getwords(doc, words_to_ignore)
def getwords2(doc, words_to_ignore=None):
    """Duplicate of getwords(): unique lower-cased words (2 < len < 20) of *doc*.

    Words found in the optional *words_to_ignore* collection are skipped.
    """
    # BUGFIX: \W+ instead of \W* — the latter matches the empty string and,
    # on Python 3.7+, makes re.split() break the text into single characters.
    splitter = re.compile(r'\W+')
    words = [word.lower() for word in splitter.split(doc)
             if 2 < len(word) < 20
             and (words_to_ignore is None or word.lower() not in words_to_ignore)]
    return dict((word, 1) for word in words)
# English stop words filtered out by getwords_with_ignore / getwords2.
words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']
def trainClassifier(cl, data):
    """Feed every (document, category) pair of *data* to classifier *cl*."""
    for document, category in data:
        cl.train(document, category)
class documentClassifier:
    """Maintains feature/category co-occurrence counts for classification.

    *getfeatures* is a callable mapping a document to an iterable of
    features (typically a {word: 1} dict).
    """

    def __init__(self, getfeatures, filename=None):
        # featureCountsPerCategory[feature][category] -> occurrence count
        self.featureCountsPerCategory = {}
        # categoryCounts[category] -> number of documents trained in it
        self.categoryCounts = {}
        # Callable extracting the features (words) of a document.
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Increase the (feature, category) pair count by one."""
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        """Increase the number of documents seen in category *cat*."""
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Return how often *currentFeature* appeared in *currentCategory* (float)."""
        if (currentFeature in self.featureCountsPerCategory
                and currentCategory in self.featureCountsPerCategory[currentFeature]):
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        """Return the number of documents in *currentCategory* as a float."""
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        # BUGFIX: return float 0.0 (the original returned int 0) so callers
        # always receive the same type from both branches.
        return 0.0

    def getTotalCount(self):
        """Return the total number of trained documents."""
        return sum(self.categoryCounts.values())

    def categories(self):
        """Return the known category names."""
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        """Train on one document *item* known to belong to *currentCategory*."""
        features = self.getfeatures(item)
        # Count each feature once per document for this category.
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        """P(feature | category) = feature count / documents in the category."""
        if self.getCategoryCount(currentCategory) == 0:
            return 0
        return (self.getFeatureCountsPerCategory(currentFeature, currentCategory)
                / self.getCategoryCount(currentCategory))

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        """Return *prf* smoothed towards the assumed probability *ap*.

        *weight* controls how strongly *ap* pulls the estimate when the
        feature has been seen only a few times overall.
        """
        basicprob = prf(currentFeature, currentCategory)
        # Total occurrences of the feature over *all* categories.  The loop
        # variable is `cat` — the original shadowed currentCategory here.
        totals = sum(self.getFeatureCountsPerCategory(currentFeature, cat)
                     for cat in self.categories())
        return ((weight * ap) + (totals * basicprob)) / (weight + totals)
class naivebayes(documentClassifier):
    """Naive Bayes classifier built on documentClassifier's counts."""

    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        # Per-category decision thresholds; getThreshold defaults to 1.0.
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        """Set the factor by which *currentCategory* must beat every rival."""
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        """Return the threshold for *currentCategory* (1.0 if unset)."""
        if currentCategory not in self.thresholds:
            return 1.0
        return self.thresholds[currentCategory]

    def caclulateDocumentProbabilityInClass(self, item, currentCategory):
        """Return P(document | category): product of smoothed word probabilities."""
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)
        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        """Bayes' theorem: P(category | document), up to a constant factor."""
        catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
        docprob = self.caclulateDocumentProbabilityInClass(item, currentCategory)
        return docprob * catprob / (1.0 / self.getTotalCount())

    def classifyDocument(self, item, default=None):
        """Return the most probable category for *item*, or *default*.

        *default* is returned when no category wins, or when the best
        category does not beat a rival by its threshold factor.
        """
        probs = {}
        maxprob = 0.0
        # BUGFIX: initialise `best` so an untrained classifier (or a document
        # whose probability is 0.0 in every category) returns `default`
        # instead of raising UnboundLocalError.
        best = default
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        if best not in probs:
            # No category ever beat probability 0.0.
            return default
        # Accept `best` only if it beats every rival by its threshold factor.
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getThreshold(best) > probs[best]:
                return default
        return best
if __name__ == "__main__":
    # NOTE: Python 2 script (print statements).  Under Python 2, input()
    # eval()s the line read from stdin, which is why the document is
    # supplied as a quoted (triple-quoted) string literal.
    recenica = input()
    # First classifier: standard word extraction.
    cl = naivebayes(getwords)
    # Second classifier: same training data, but stop words are ignored.
    cl2 = naivebayes(getwords_with_ignore)
    trainClassifier(cl, train_data)
    trainClassifier(cl2, train_data)
    klasa1 = cl.classifyDocument(recenica)
    klasa2 = cl2.classifyDocument(recenica)
    verojatnost1 = cl.getCategoryProbabilityForDocument(recenica, klasa1)
    verojatnost2 = cl2.getCategoryProbabilityForDocument(recenica, klasa2)
    # One line per classifier: predicted class and log2(probability) to
    # 4 decimals, then the ratio of the two probabilities.
    print klasa1, "%.4f" % math.log(verojatnost1, 2)
    print klasa2, "%.4f" % math.log(verojatnost2, 2)
    print "%.4f" % (verojatnost2 / verojatnost1)
    # Extra line when the two classifiers disagree.
    if klasa1 != klasa2:
        print 'kontradikcija'
- ------------------------------------------------------------------------------------------------------
- #-*- coding: utf-8 -*-
- """
- Класификација на документи (испит)
- Заради потребата на софистицирана класификација на документи, фунцијата која ги дава уникатните зборовите од
- еден документ getwords треба да се промени така што ќе прима втор опционален аргумент words_to_ingore. Ако не
- се проследи опционалниот аргумент подразбирлива вредност е None. Кога се проследува вредност таа вредност треба
- да биде листа од зборови кои функцијата ќе ги изоставува при генерирањето на излезниот речник. Значи секој збор
- во words_to_ingore не треба да фигурира во речникот со уникатни зборови кој се добива како резултат на getwords.
- Множеството на податоци train_data е предефинирано.
- Притоа се знае секој документ од која класа е (science или sport).
- Mножеството е претставено како листи од торки, така што во секоја торка прв елемент е текстот на документот како стринг,
- а втор елемент е класата како стринг. Да се истренира класификатор со користење на стандардната getwords (од аудиториските вежби)
- врз основа на тренинг множеството. Исто така да се направат потребните промени за да се истренира и втор
- класификатор кој ќе го употребува истото тренинг множество, но притоа ќе ја употребува новата функција која е веќе
- имплементирана getwords_with_ignore.
- Потоа за секој документ прочитан од стандарден влез да се испечатат 2 реда.
- Првиот ред ја содржи предвидената класа со стандардниот класификатор, а вториот ред предвидената класа со помош на
- вториот класификатор. Ако предвидувањето на двата класификатори е различно
да се испечати уште еден ред со зборот “kontradikcija”.
- """
- import re
- import math
- train_data=[
- ("""What Are We Searching for on Mars?
- Martians terrified me growing up. I remember watching the 1996 movie Mars Attacks! and fearing that the Red Planet harbored hostile alien neighbors. Though I was only 6 at the time, I was convinced life on Mars meant little green men wielding vaporizer guns. There was a time, not so long ago, when such an assumption about Mars wouldn’t have seemed so far-fetched.
- Like a child watching a scary movie, people freaked out after listening to “The War of the Worlds,” the now-infamous 1938 radio drama that many listeners believed was a real report about an invading Martian army. Before humans left Earth, humanity’s sense of what—or who—might be in our galactic neighborhood was, by today’s standards, remarkably optimistic.
- """,
- "science"),
- ("""Mountains of Ice are Melting, But Don't Panic (Op-Ed)
- If the planet lost the entire West Antarctic ice sheet, global sea level would rise 11 feet, threatening nearly 13 million people worldwide and affecting more than $2 trillion worth of property.
- Ice loss from West Antarctica has been increasing nearly three times faster in the past decade than during the previous one — and much more quickly than scientists predicted.
- This unprecedented ice loss is occurring because warm ocean water is rising from below and melting the base of the glaciers, dumping huge volumes of additional water — the equivalent of a Mt. Everest every two years — into the ocean.
- """,
- "science"),
- ("""Some scientists think we'll find signs of aliens within our lifetimes. Here's how.
- Finding extraterrestrial life is the essence of science fiction. But it's not so far-fetched to predict that we might find evidence of life on a distant planet within a generation.
- "With new telescopes coming online within the next five or ten years, we'll really have a chance to figure out whether we're alone in the universe," says Lisa Kaltenegger, an astronomer and director of Cornell's new Institute for Pale Blue Dots, which will search for habitable planets. "For the first time in human history, we might have the capability to do this."
- """,
- "science"),
- ("""'Magic' Mushrooms in Royal Garden: What Is Fly Agaric?
- Hallucinogenic mushrooms are perhaps the last thing you'd expect to find growing in the Queen of England's garden.
- Yet a type of mushroom called Amanita muscaria — commonly known as fly agaric, or fly amanita — was found growing in the gardens of Buckingham Palace by the producers of a television show, the Associated Press reported on Friday (Dec. 12).
- A. muscaria is a bright red-and-white mushroom, and the fungus is psychoactive when consumed.
- """,
- "science"),
- ("""Upcoming Parks : 'Lost Corner' Finds New Life in Sandy Springs
- At the corner of Brandon Mill Road, where Johnson Ferry Road turns into Dalrymple Road, tucked among 24 forested acres, sits an early 20th Century farmhouse. A vestige of Sandy Springs' past, the old home has found new life as the centerpiece of Lost Forest Preserve. While the preserve isn't slated to officially debut until some time next year, the city has opened the hiking trails to the public until construction begins on the permanent parking lot (at the moment the parking lot is a mulched area). The new park space includes community garden plots, a 4,000-foot-long hiking trail and an ADA-accessible trail through the densely wooded site. For Atlantans seeking an alternate escape to serenity (or those who dig local history), it's certainly worth a visit.
- """,
- "science"),
- ("""Stargazers across the world got a treat this weekend when the Geminids meteor shower gave the best holiday displays a run for their money.
- The meteor shower is called the "Geminids" because they appear as though they are shooting out of the constellation of Gemini. The meteors are thought to be small pieces of an extinct comment called 3200 Phaeton, a dust cloud revolving around the sun. Phaeton is thought to have lost all of its gas and to be slowly breaking apart into small particles.
- Earth runs into a stream of debris from 3200 Phaethon every year in mid-December, causing a shower of meteors, which hit its peak over the weekend.
- """,
- "science"),
- ("""Envisioning a River of Air
- By the classification rules of the world of physics, we all know that the Earth's atmosphere is made of gas (rather than liquid, solid, or plasma). But in the world of flying it's often useful to think
- """,
- "science"),
- ("""Following Sunday's 17-7 loss to the Seattle Seahawks, the San Francisco 49ers were officially eliminated from playoff contention, and they have referee Ed Hochuli to blame. OK, so they have a lot of folks to point the finger at for their 7-7 record, but Hochuli's incorrect call is the latest and easiest scapegoat.
- """
- ,"sport"),
- ("""Kobe Bryant and his teammates have an odd relationship. That makes sense: Kobe Bryant is an odd guy, and the Los Angeles Lakers are an odd team.
- They’re also, for the first time this season, the proud owners of a three-game winning streak. On top of that, you may have heard, Kobe Bryant passed Michael Jordan on Sunday evening to move into third place on the NBA’s all-time scoring list.
- """
- ,"sport"),
- ("""The Patriots continued their divisional dominance and are close to clinching home-field advantage throughout the AFC playoffs. Meanwhile, both the Colts and Broncos again won their division titles with head-to-head wins.The Bills' upset of the Packers delivered a big blow to Green Bay's shot at clinching home-field advantage throughout the NFC playoffs. Detroit seized on the opportunity and now leads the NFC North.
- """
- ,"sport"),
- ("""If you thought the Washington Redskins secondary was humbled by another scintillating performance from New Yorks Giants rookie wide receiver sensation Odell Beckham Jr., think again.In what is becoming a weekly occurrence, Beckham led NFL highlight reels on Sunday, collecting 12 catches for 143 yards and three touchdowns in Sunday's 24-13 victory against an NFC East rival.
- """
- ,"sport")
- ,("""That was two touchdowns and 110 total yards for the three running backs. We break down the fantasy implications.The New England Patriots' rushing game has always been tough to handicap. Sunday, all three of the team's primary running backs put up numbers, and all in different ways, but it worked for the team, as the Patriots beat the Miami Dolphins, 41-13.
- """
- ,"sport"),
- ("""General Santos (Philippines) (AFP) - Philippine boxing legend Manny Pacquiao vowed to chase Floyd Mayweather into ring submission after his US rival offered to fight him next year in a blockbuster world title face-off. "He (Mayweather) has reached a dead end. He has nowhere to run but to fight me," Pacquiao told AFP late Saturday, hours after the undefeated Mayweather issued the May 2 challenge on US television. The two were long-time rivals as the "best pound-for-pound" boxers of their generation, but the dream fight has never materialised to the disappointment of the boxing world.
- """
- ,"sport"),
- ("""When St. John's landed Rysheed Jordan, the consensus was that he would be an excellent starter.
- So far, that's half true.
- Jordan came off the bench Sunday and tied a career high by scoring 24 points to lead No. 24 St. John's to a 74-53 rout of Fordham in the ECAC Holiday Festival.
- ''I thought Rysheed played with poise,'' Red Storm coach Steve Lavin said. ''Played with the right pace. Near perfect game.''
- """
- ,"sport"),
- ("""Five-time world player of the year Marta scored three goals to lead Brazil to a 3-2 come-from-behind win over the U.S. women's soccer team in the International Tournament of Brasilia on Sunday. Carli Lloyd and Megan Rapinoe scored a goal each in the first 10 minutes to give the U.S. an early lead, but Marta netted in the 19th, 55th and 66th minutes to guarantee the hosts a spot in the final of the four-team competition.
- """
- ,"sport"),
- ]
# English stop words filtered out by getwords2 below.
words_to_ignore=['and', 'are', 'for', 'was', 'what', 'when', 'who', 'but', 'from', 'after', 'out', 'our', 'my', 'the', 'with', 'some', 'not', 'this', 'that']
def getwords(doc):
    """Return a dict {word: 1} of the unique lower-cased words of *doc*.

    The document is split on runs of non-word characters and only words
    with 2 < len(word) < 20 are kept.  The dict values are irrelevant —
    only the keys (unique words) matter, so duplicates collapse.
    """
    # BUGFIX: \W+ instead of \W*.  \W* also matches the empty string and,
    # since Python 3.7, re.split() honours empty matches, which would
    # break the text into single characters.
    splitter = re.compile(r'\W+')
    words = [word.lower() for word in splitter.split(doc)
             if 2 < len(word) < 20]
    return dict((word, 1) for word in words)
def getwords2(doc):
    """Like getwords(), but skips the module-level words_to_ignore list."""
    # BUGFIX: \W+ instead of \W* — the latter matches the empty string and,
    # on Python 3.7+, makes re.split() separate every single character.
    splitter = re.compile(r'\W+')
    words = []
    for word in splitter.split(doc):
        if 2 < len(word) < 20 and word.lower() not in words_to_ignore:
            words.append(word.lower())
    return dict((word, 1) for word in words)
class documentClassifier:
    """Maintains feature/category co-occurrence counts for classification.

    *getfeatures* is a callable mapping a document to an iterable of
    features (a {word: 1} dict from getwords, or a set from getwords2).
    """

    def __init__(self, getfeatures, filename=None):
        # featureCountsPerCategory[feature][category] -> occurrence count
        self.featureCountsPerCategory = {}
        # categoryCounts[category] -> number of documents trained in it
        self.categoryCounts = {}
        # Callable extracting the features (words) of a document.
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Increase the (feature, category) pair count by one."""
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        """Increase the number of documents seen in category *cat*."""
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Return how often *currentFeature* appeared in *currentCategory* (float)."""
        if (currentFeature in self.featureCountsPerCategory
                and currentCategory in self.featureCountsPerCategory[currentFeature]):
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        """Return the number of documents in *currentCategory* as a float."""
        if currentCategory in self.categoryCounts:
            return float(self.categoryCounts[currentCategory])
        # BUGFIX: return float 0.0 (the original returned int 0) so callers
        # always receive the same type from both branches.
        return 0.0

    def getTotalCount(self):
        """Return the total number of trained documents."""
        return sum(self.categoryCounts.values())

    def categories(self):
        """Return the known category names."""
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        """Train on one document *item* known to belong to *currentCategory*."""
        features = self.getfeatures(item)
        # Count each feature once per document for this category.
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        """P(feature | category) = feature count / documents in the category."""
        if self.getCategoryCount(currentCategory) == 0:
            return 0
        return (self.getFeatureCountsPerCategory(currentFeature, currentCategory)
                / self.getCategoryCount(currentCategory))

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        """Return *prf* smoothed towards the assumed probability *ap*.

        *weight* controls how strongly *ap* pulls the estimate when the
        feature has been seen only a few times overall.
        """
        basicprob = prf(currentFeature, currentCategory)
        # Total occurrences of the feature over *all* categories.  The loop
        # variable is `cat` — the original shadowed currentCategory here.
        totals = sum(self.getFeatureCountsPerCategory(currentFeature, cat)
                     for cat in self.categories())
        return ((weight * ap) + (totals * basicprob)) / (weight + totals)
class naivebayes(documentClassifier):
    """Naive Bayes classifier; classifyDocument also reports log2 probability."""

    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        # Per-category decision thresholds; getThreshold defaults to 1.0.
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        """Set the factor by which *currentCategory* must beat every rival."""
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        """Return the threshold for *currentCategory* (1.0 if unset)."""
        if currentCategory not in self.thresholds:
            return 1.0
        return self.thresholds[currentCategory]

    def caclulateDocumentProbabilityInClass(self, item, currentCategory):
        """Return P(document | category): product of smoothed word probabilities."""
        features = self.getfeatures(item)
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)
        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        """Bayes' theorem: P(category | document), up to a constant factor."""
        catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
        docprob = self.caclulateDocumentProbabilityInClass(item, currentCategory)
        return docprob * catprob / (1.0 / self.getTotalCount())

    def classifyDocument(self, item, default=None):
        """Return (best category, log2 of its probability rounded to 4 dp).

        Returns bare *default* (not a tuple) when the best category fails
        its threshold test or no category has positive probability —
        NOTE(review): callers that unpack the result will fail on that
        path; preserved from the original interface.
        """
        probs = {}
        maxprob = 0.0
        # BUGFIX: initialise `best` so an untrained classifier (or a document
        # whose probability is 0.0 in every category) returns `default`
        # instead of raising UnboundLocalError (and math.log(0) below).
        best = default
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        if best not in probs:
            # No category ever beat probability 0.0.
            return default
        # Accept `best` only if it beats every rival by its threshold factor.
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getThreshold(best) > probs[best]:
                return default
        return best, round(math.log(maxprob, 2), 4)
if __name__ == '__main__':
    # NOTE: Python 2 script (print statements).  Reading from stdin is
    # disabled here; a fixed test document is used instead.
    #recenica = input()
    recenica = """Just last week, preservationists at the Old Pejeta animal sanctuary in Kenya conceded that their one male and two female northern white rhinos will not reproduce naturally. The animals were flown from the Czech zoo to the Kenyan conservancy in December 2009 in hopes that the natural environment could be easier for them to breed there than in captivity."""
    # First classifier: standard word extraction.
    cl = naivebayes(getwords)
    for i in train_data:
        cl.train(i[0], i[1])
    # classifyDocument returns (class, log2 probability) here.
    class1, verojatnost1 = cl.classifyDocument(recenica)
    # Second classifier: same training data, stop words ignored.
    cl1 = naivebayes(getwords2)
    for i in train_data:
        cl1.train(i[0], i[1])
    class2, verojatnost2 = cl1.classifyDocument(recenica)
    print class1, verojatnost1
    print class2, verojatnost2
    # Ratio of the two (log) probabilities, 4 decimals.
    print "%.4f" % round((verojatnost2 / verojatnost1), 4)
    # Extra line when the two classifiers disagree.
    if class1 != class2:
        print("kontradikcija")
- -----------------------------------------------------------------------------------------------------
- """ Klasifikacija - Twiter
- Потребно е да се направи систем кој ќе знае да класифицира твитови во однос на тонот (sentiment) на позитивен и негативен.
- Дадена ви е листа train_data од торки. Прв елемент во торката е класата (positive/negative) и втор елемент е содржината на твитот.
- Користејќи ги првите 200 примери, да се изгради наивен Баесов класификатор кој ќе научи да класифицира непознати твитови.
- Потоа, за прочитан индекс од влезот (број од 200 до 999) да се најде твитот на соодветната позиција во train_data и истиот да се класифицира.
- Во првата линија се печати бројот на позитивни и негативни примери во тренинг множеството,
- а во втората линија се печати индексот на тест примерот (прочитано од влез), точната класа, предвидената класа и содржината на твитот.
- """
def getwords(doc):
    """Split *doc* into a set of lowercased words.

    Words are delimited by any run of non-word characters (whitespace and
    punctuation); only tokens whose length is strictly between 2 and 20 are
    kept, discarding very short and degenerately long tokens.  A set is
    returned, so repeated words count once.
    """
    # fix: r'\W+' (one or more) instead of '\W*' -- a split pattern that can
    # match the empty string makes re.split() cut between every character on
    # Python 3.7+.  The raw string avoids the escaped backslash.
    splitter = re.compile(r'\W+')
    words = set()
    for word in splitter.split(doc):
        if 2 < len(word) < 20:
            words.add(word.lower())
    return words
class documentClassifier:
    """Base trainable document classifier.

    Keeps co-occurrence counts of features (words) and categories; the
    probability helpers below are the building blocks for naive Bayes.
    """

    def __init__(self, getfeatures, filename=None):
        # featureCountsPerCategory[feature][category] -> number of documents
        # in `category` that contained `feature`.
        self.featureCountsPerCategory = {}
        # categoryCounts[category] -> number of documents trained for it.
        self.categoryCounts = {}
        # Function that extracts the feature set (e.g. words) from a document.
        self.getfeatures = getfeatures

    def incrementFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Increase the (feature, category) pair count by one."""
        self.featureCountsPerCategory.setdefault(currentFeature, {})
        self.featureCountsPerCategory[currentFeature].setdefault(currentCategory, 0)
        self.featureCountsPerCategory[currentFeature][currentCategory] += 1

    def incrementCategoryCounts(self, cat):
        """Increase the number of documents seen for category `cat` by one."""
        self.categoryCounts.setdefault(cat, 0)
        self.categoryCounts[cat] += 1

    def getFeatureCountsPerCategory(self, currentFeature, currentCategory):
        """Return how often `currentFeature` appeared in `currentCategory` (0.0 if never)."""
        if (currentFeature in self.featureCountsPerCategory
                and currentCategory in self.featureCountsPerCategory[currentFeature]):
            return float(self.featureCountsPerCategory[currentFeature][currentCategory])
        return 0.0

    def getCategoryCount(self, currentCategory):
        """Return the number of documents in `currentCategory` as a float (0.0 if unseen)."""
        # fix: always return a float; the original returned int 0 for unseen
        # categories but a float otherwise.
        return float(self.categoryCounts.get(currentCategory, 0))

    def getTotalCount(self):
        """Return the total number of trained documents."""
        return sum(self.categoryCounts.values())

    def categories(self):
        """Return the known category names."""
        return self.categoryCounts.keys()

    def train(self, item, currentCategory):
        """Train on one document `item` known to belong to `currentCategory`."""
        features = self.getfeatures(item)
        # Count every extracted feature once for this category ...
        for currentFeature in features:
            self.incrementFeatureCountsPerCategory(currentFeature, currentCategory)
        # ... and count the document itself.
        self.incrementCategoryCounts(currentCategory)

    def getFeaturePerCategoryProbability(self, currentFeature, currentCategory):
        """Return P(feature | category): feature count over the category's document count."""
        if self.getCategoryCount(currentCategory) == 0: return 0
        return (self.getFeatureCountsPerCategory(currentFeature, currentCategory)
                / self.getCategoryCount(currentCategory))

    def weightedprob(self, currentFeature, currentCategory, prf, weight=1.0, ap=0.5):
        """Return `prf`'s estimate smoothed towards the assumed probability `ap`.

        `weight` is the strength of the prior; `totals` counts how often the
        feature appeared across all categories.
        """
        basicprob = prf(currentFeature, currentCategory)
        # fix: use a distinct loop variable -- in Python 2 a list-comprehension
        # variable leaks into the enclosing scope and clobbered the
        # `currentCategory` parameter after this line.
        totals = sum([self.getFeatureCountsPerCategory(currentFeature, cat)
                      for cat in self.categories()])
        return ((weight * ap) + (totals * basicprob)) / (weight + totals)
- #dc = documentClassifier(getwords)
- #dc.train("sistemi na znaenje e dosaden predmet", "tracevi")
- #dc.train("asistentot po sistemi na znaenje e isto taka dosaden", "tracevi")
- #dc.train("vezbite po sistemi na znaenje moze da se podobrat na sledniov nacin...", "kritiki")
- #dc.train("predvanjata po sistemi na znaenje ne moze da se podobrat bidejki se najdobri...", "kritiki")
class naivebayes(documentClassifier):
    """Naive Bayes classifier built on documentClassifier's counts."""

    def __init__(self, getfeatures):
        documentClassifier.__init__(self, getfeatures)
        # Per-category decision thresholds (defaults to 1.0 when unset).
        self.thresholds = {}

    def setThreshold(self, currentCategory, threshold):
        """Set the decision threshold for `currentCategory`."""
        self.thresholds[currentCategory] = threshold

    def getThreshold(self, currentCategory):
        """Return the decision threshold for `currentCategory` (1.0 if unset)."""
        if currentCategory not in self.thresholds: return 1.0
        return self.thresholds[currentCategory]

    def caclulateDocumentProbabilityInClass(self, item, currentCategory):
        """Return P(document | category) as the product of per-word probabilities.

        (Method name kept as-is, typo included, for caller compatibility.)
        """
        features = self.getfeatures(item)
        # Naive independence assumption: multiply the smoothed probabilities.
        p = 1
        for currentFeature in features:
            p *= self.weightedprob(currentFeature, currentCategory,
                                   self.getFeaturePerCategoryProbability)
        return p

    def getCategoryProbabilityForDocument(self, item, currentCategory):
        """Return the (unnormalized) posterior of `currentCategory` given `item`.

        Bayes: P(cat|doc) ~ P(doc|cat) * P(cat).  NOTE(review): dividing by
        (1.0 / total) scales every category by the same constant, leaving the
        ranking unchanged -- confirm this scaling is intentional.
        """
        catprob = self.getCategoryCount(currentCategory) / self.getTotalCount()
        likelihood = self.caclulateDocumentProbabilityInClass(item, currentCategory)
        # Bayes Theorem
        return likelihood * catprob / (1.0 / self.getTotalCount())

    def classifyDocument(self, item, default=None):
        """Return the most probable category for `item`, or `default`.

        `default` is returned when nothing has been trained, or when a
        runner-up scaled by the winner's threshold beats the winner.
        """
        probs = {}
        best = None
        topprob = 0.0
        for cat in self.categories():
            probs[cat] = self.getCategoryProbabilityForDocument(item, cat)
            if probs[cat] > topprob:
                topprob = probs[cat]
                best = cat
        # fix: `best` was unbound (NameError on return) when there were no
        # trained categories or every score was 0.0; fall back to `default`.
        if best is None:
            return default
        # Reject the winner if any runner-up, scaled by the winner's
        # threshold, is still more probable.
        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getThreshold(best) > probs[best]: return default
        return best
- if __name__ == '__main__':
- cl = naivebayes(getwords)
- #index = input()
- index=250
- pozitive = 0
- negative = 0
- for i in range(200):
- #print train_data[i][1],train_data[i][0], i
- if train_data[i][0] == 'negative':
- negative+=1
- else:
- pozitive+=1
- cl.train(train_data[i][1], train_data[i][0])
- #break
- twit = train_data[index]
- klasifikacija = cl.classifyDocument(twit[1])
- print "Pozitivni: {}, Negativni: {}".format(pozitive, negative)
- print "Index: {}, Tocna Klasa: {}, Predvidena Klasa: {}, Twit: {}".format(index, tweet[0], klasifikacija, twit[1])
- ------------------------------------------------------------------------------------------------------
- #Пресметка на статистики и еден подвижен прозорец Problem 1 (2 / 2)
- За даденото податочно множество во листата X_data со предефинирана должина N од секоја
- колона треба да се пресметаат следниве статистики: минимум, максимум, средна вредност,
- стандардна девијација. Притоа од стандарден влез се чита поместувањето D и должината на
- подвижниот прозорец L. На излез треба да се испечати испроцесираното множество, така што
- во секоја линија ќе се испечати бројот на редицата со која завршува подвижниот прозорец
- и листа од вредности заокружени на 2 децимали со бараните статистики од секоја колона
- (прво 4 статистики за првата колона, па 4 статистики за втората колона итн...)
- Вкупниот број на редови е floor((N-L)/D + 1).
- НАПОМЕНА: Заради ефикасност на решението треба да ги пресметувате само статистиките кои се бараат.
- from __future__ import print_function
- import numpy as np
- import scipy.stats as sp
- X_data = [[119, 57, 51, 3, 1, 141],
- [105, 54, 62, 3, 1, 133],
- .........................]]
- .........
def percentiles_all(x, iqr=True, amplitude=True, percentiles_list=[5, 10, 25, 40, 50, 60, 75, 90, 95]):
    """Return (values, names) for the requested percentiles of `x`.

    Optionally appends the inter-quartile range (requires 25 and 75 in
    `percentiles_list`) and the 1-99 percentile amplitude (requires 1 and
    99).  Empty input yields zeros.  NOTE: the default list is shared
    between calls but is only read, never mutated.
    """
    names = ['p_' + str(p) for p in percentiles_list]
    # Derived statistics are only possible when their percentiles were requested.
    has_iqr = iqr and 25 in percentiles_list and 75 in percentiles_list
    has_amp = amplitude and 1 in percentiles_list and 99 in percentiles_list
    if has_iqr:
        names.append('iqr')
    if has_amp:
        names.append('perc_amp')
    if len(x) == 0:
        return [0 for _ in range(len(names))], names
    if len(percentiles_list) > 0 and all([0 < q < 100 for q in percentiles_list]):
        values = list(np.percentile(x, percentiles_list))
    else:
        values = []
    # fix: the iqr branch was unguarded ('if iqr:'), so iqr=True with 25/75
    # absent from percentiles_list raised ValueError in .index() even though
    # the matching name was (correctly) never appended.
    if has_iqr:
        q1 = percentiles_list.index(25)
        q3 = percentiles_list.index(75)
        values.append(values[q3] - values[q1])
    if has_amp:
        q1 = percentiles_list.index(1)
        q3 = percentiles_list.index(99)
        values.append(values[q3] - values[q1])
    return values, names
def stats_calculate_all(X_data):
    """Return ([min, max, mean, std], names) for the 1-D sequence `X_data`.

    Only the four statistics the task asks for are computed.  Empty input
    yields zeros of the same length, keeping row widths consistent.
    """
    # fix: names now match the values actually produced -- the original
    # declared 16 names for 4 values and returned 16 zeros on empty input.
    stats_all_names = ['min', 'max', 'mean', 'std']
    xnp = np.array(X_data)
    if len(X_data) == 0:
        return [0 for _ in stats_all_names], stats_all_names
    # (dead code removed: an `offset` was computed but never used, alongside
    # a block of commented-out statistics.)
    values = [
        float(np.min(xnp)),
        float(np.max(xnp)),
        float(np.mean(xnp)),
        float(np.std(xnp)),  # population standard deviation (ddof=0)
    ]
    return values, stats_all_names
if __name__ == "__main__":
    x = np.array(X_data)
    # Window shift D and window length L, read from stdin
    # (Python 2 input() eval()s, so plain integers are expected).
    shift = input()
    w_long = input()
    # fix: iterate up to len(X_data) inclusive so the window ending at the
    # last row is emitted too -- the task asks for floor((N-L)/D + 1) rows,
    # and range(L, N, D) drops the final window whenever (N-L) % D == 0.
    for i in range(w_long, len(X_data) + 1, shift):
        row = []
        for j in range(x.shape[1]):
            # Column-j slice of the window that ends (exclusively) at row i.
            x_winow_long = x[i - w_long:i, j]
            s1, stat_names = stats_calculate_all(x_winow_long)
            row += s1
        # Rounding happens only at output time; raw values stay unrounded.
        row3 = [round(r, 2) for r in row]
        print(i, row3)
- ---------------------------------------------------------------------------------------------------
- #Пресметка на статистики и два подвижни прозорци Problem 2 (1 / 2)
- За даденото податочно множество во листата X_data со предефинирана должина N од секоја колона
- треба да се пресмета средната вредност. Притоа од стандарден влез се чита должината на долгиот
- подвижен прозорец L1, и краткиот подвижен прозорец L2. Поместувањето е фиксно и е 1 ред.
- На излез треба да се испечати испроцесираното множество, така што во секоја линија ќе се
- испечати бројот на редицата со која завршува подвижниот прозорец и листа од вредности за
- секоја колона заокружени на 2 децимали: тековната вредност, средната вредност во долгиот
- подвижен прозорец, средна вредност во краткиот подвижен прозорец, и разлика од двете средни вредности.
- За M колони во секој ред треба да се испечатат листа со M x 4 елементи
- (тековна вредност, средна вредност долг подвижен прозорец, средна вредност краток
- подвижен прозорец, разлика од средните вредности). Заокружувањето се прави при печатењето на
- вредностите, но тие се чуваат без заокружување.
- НАПОМЕНА: Заради ефикасност на решението треба да ги пресметувате само статистиките кои се бараат.
def stats_calculate_all(x):
    """Return ([len, min, max, range, mean, std], names) for 1-D sequence `x`.

    :param x: the time-series values (list or 1-D array).
    :return: (values, names) -- parallel lists; empty input yields zeros.
    """
    # fix: names trimmed to the six statistics actually computed -- the
    # original listed 16 names and returned 16 zeros on empty input.
    stats_all_names = ['len', 'min', 'max', 'range', 'mean', 'std']
    xnp = np.array(x)
    n = len(x)
    if n == 0:
        return [0 for _ in stats_all_names], stats_all_names
    # (dead code removed: an `offset` was computed but never used.)
    vmin = float(np.min(xnp))
    vmax = float(np.max(xnp))
    vmean = float(np.mean(xnp))
    vstd = float(np.std(xnp))
    # Index 4 (mean) is what the moving-average callers read as s[4].
    values = [n, vmin, vmax, vmax - vmin, vmean, vstd]
    return values, stats_all_names
if __name__ == "__main__":
    x = np.array(X_data)
    # Long and short window lengths from stdin (Python 2 input() eval()s,
    # so plain integers are expected).  The shift is fixed at 1 row.
    w_long = input()
    w_short = input()
    shift = 1
    arr_len = len(x[0, :])  # number of columns
    # Start once both windows fit entirely inside the data.
    for i in range(max(w_short, w_long), len(X_data), shift):
        result = []
        for j in range(0, arr_len):
            # Long/short window slices for column j, ending (exclusively) at row i.
            x_winow_long = x[i - w_long:i, j]
            x_winow_short = x[i - w_short:i, j]
            s1, stat_names = stats_calculate_all(x_winow_long)
            s2, _ = stats_calculate_all(x_winow_short)
            long_MVA = s1[4]  # Moving Average long window
            short_MVA = s2[4]  # Moving Average short window
            # Per column: current value, long-window mean, short-window mean,
            # and the difference of the two means -- rounded only at output,
            # while the underlying values stay unrounded.
            result.extend([round(x[i, j], 2), round(long_MVA, 2), round(short_MVA, 2), round(long_MVA - short_MVA, 2)])
        print(i,result)
- -------------------------------------------------------------------------------------------------------
- # Vremenski prozorci - ispit januari 2017 - courses
- За даденото податочно множество во листата X_data со предефинирана должина N од секоја колона треба да се пресмета средната вредност, медијаната и стандардната девијација.
- Притоа од стандарден влез се чита должнината на долгиот подвижнен прозорец L1, и краткиот подвижнен прозорец L2.
- Поместувањето е фиксно и е 5 реда. На излез треба да се испечати процесираното множество, така што во секоја линија
- ќе се испечати бројот на редицата со која завршува подвижниот прозорец и листа од вредности за секоја колона заокружени
- на 2 децимали: тековната вредност, средната вредност, медијаната и стандардната девијација во долгиот подвижен прозорец,
- средната вредност, медијаната и стандардната девијација во краткиот подвижен прозорец. Заокружувањето се прави при печатењето на вредностите,
- но тие се чуваат без заокружување. Дополнително, доколку средната вредност од долгиот е поголема од средната вредност од краткиот прозорец,
- за секоја колона да се додаде вредност 1, а доколку средната вредност од краткиот е поголема од средната вредност од долгиот, да се додаде вредност -1.
- y_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
def percentiles_all(x, iqr=True, amplitude=True, percentiles_list=[5, 10, 25, 40, 50, 60, 75, 90, 95]):
    """Return (values, names) for the requested percentiles of `x`.

    Optionally appends the inter-quartile range (requires 25 and 75 in
    `percentiles_list`) and the 1-99 percentile amplitude (requires 1 and
    99).  Empty input yields zeros.  NOTE: the default list is shared
    between calls but is only read, never mutated.
    """
    names = ['p_' + str(p) for p in percentiles_list]
    # Derived statistics are only possible when their percentiles were requested.
    has_iqr = iqr and 25 in percentiles_list and 75 in percentiles_list
    has_amp = amplitude and 1 in percentiles_list and 99 in percentiles_list
    if has_iqr:
        names.append('iqr')
    if has_amp:
        names.append('perc_amp')
    if len(x) == 0:
        return [0 for _ in range(len(names))], names
    if len(percentiles_list) > 0 and all([0 < q < 100 for q in percentiles_list]):
        values = list(np.percentile(x, percentiles_list))
    else:
        values = []
    # fix: the iqr branch was unguarded ('if iqr:'), so iqr=True with 25/75
    # absent from percentiles_list raised ValueError in .index() even though
    # the matching name was (correctly) never appended.
    if has_iqr:
        q1 = percentiles_list.index(25)
        q3 = percentiles_list.index(75)
        values.append(values[q3] - values[q1])
    if has_amp:
        q1 = percentiles_list.index(1)
        q3 = percentiles_list.index(99)
        values.append(values[q3] - values[q1])
    return values, names
def stats_calculate_all(x):
    """Return [mean, median, std] of the 1-D sequence `x` (zeros when empty)."""
    xnp = np.array(x)
    if len(x) == 0:
        # fix: this branch referenced an undefined `stats_all_names`
        # (NameError) and returned a (values, names) tuple while the normal
        # path returns a bare list -- callers iterate the result directly.
        return [0.0, 0.0, 0.0]
    vmean = float(np.mean(xnp))
    vstd = float(np.std(xnp))  # population standard deviation (ddof=0)
    # (dead code removed: a signal-to-noise ratio was computed but never
    # returned or stored.)
    return [vmean, float(np.median(xnp)), vstd]
if __name__ == "__main__":
    x = np.array(X_data)
    shift = 5  # step between consecutive windows (fixed by the task)
    w_long = input()  # length (number of readings) of the long window
    w_short = input()  # length (number of readings) of the short window
    # Start once both windows fit entirely inside the data.
    for i in range(max(w_short, w_long), len(X_data), shift):
        lista=[]
        nova=[]  # NOTE(review): unused
        for j in range(x.shape[1]):
            # Long/short window slices for column j, ending (exclusively) at row i.
            x_winow_long = x[i - w_long:i, j]
            x_winow_short = x[i - w_short:i, j]
            s1 = stats_calculate_all(x_winow_long)   # [mean, median, std] of long window
            s2= stats_calculate_all(x_winow_short)   # [mean, median, std] of short window
            # Current value followed by both windows' statistics,
            # rounded to 2 decimals only for output.
            lista.append(round((x[i][j]),2))
            for d in s1:
                lista.append(round(d,2))
            for d in s2:
                lista.append(round(d,2))
            # +1 when the long-window mean >= the short-window mean, else -1.
            if(s1[0]<s2[0]):
                lista.append(-1)
            else:
                lista.append(1)
        print (i,lista)
- ------------------------------------------------------------------------------------------------------
- ------------------------------------------------------------------------------------------------------
- Sistemi za preporaka - 1
- Оцени на корисници и филмови Problem 1 (1 / 2)
- За даденото множество кое е претставено како речник чиј клуч е името на
- корисникот и вредност е речник чиј клуч е филмот, а вредност е оцената
- која корисникот ја дал за филмот, да се инвертира така што ќе добиете
- повторно речник од речници. Во новиот речник клуч е името на филмот,
- а вредност е речник чиј клуч е името на корисникот, а вредност е оцената
- која тој корисник ја дал за тековниот филм.
- Потоа за прочитано име на филм од стандарден влез да се испечати најмалата и најголемата
- оцена која е дадена за него.
- Sample input
- 'Catch Me If You Can'
- Sample output
- {'Lisa Rose': 3.0, 'Jack Matthews': 4.5, 'Michael Phillips': 2.5, 'Gary Coleman': 1.5, 'Michelle Nichols': 2.5}
- oceniPoKorisnici={
- 'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
- 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
- 'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
- 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
- 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
- 'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
- 'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
- 'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
- 'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
- 'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
- }
def invertirajOceni(oceni):
    """Invert a {user: {movie: rating}} mapping into {movie: {user: rating}}."""
    po_film = {}
    for korisnik, filmovi in oceni.items():
        for film, ocena in filmovi.items():
            po_film.setdefault(film, {})[korisnik] = ocena
    return po_film
if __name__ == "__main__":
    # Build the movie -> user ratings table, then print all ratings given
    # to the movie read from stdin.
    oceniPoFilmovi=invertirajOceni(oceniPoKorisnici)
    # Python 2 input() eval()s, so the judge supplies a quoted movie title.
    film=input()
    print oceniPoFilmovi[film]
- -----------------------------------------------------------------------------------------------------
- Sistemi za preporaka - 2
- Opened: 196 дена
- Функции за сличност Problem 2 (1 / 2)
- Да се напишат функции за пресметување на сличност базирани на Пеарсонова корелација и
- Евклидово растојание кои ќе враќаат торка од сличноста и бројот на заеднички елементи.
- За прочитани имиња на двајца корисници да се испечатата торките што ги враќаат двете функции.
- Sample input
- 'Jack Matthews'
- 'Gene Seymour'
- Sample output
- (0.905, 4)
- (0.667, 4)
- import math
- oceniPoKorisnici={
- 'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
- 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
- 'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
- 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
- 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
- 'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
- 'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
- 'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
- 'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
- 'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
- }
def sim_distance(oceni,person1,person2):
    """Euclidean-distance similarity between two users.

    Returns (similarity rounded to 3 dp, number of co-rated movies),
    or 0 when the users share no rated movies.
    """
    ratings1 = oceni[person1]
    ratings2 = oceni[person2]
    # Movies rated by both users.
    shared = [movie for movie in ratings1 if movie in ratings2]
    if not shared:
        return 0
    # Sum of squared rating differences over the shared movies.
    squared_diffs = sum((ratings1[movie] - ratings2[movie]) ** 2 for movie in shared)
    similarity = 1 / (1 + math.sqrt(squared_diffs))
    return (round(similarity, 3), len(shared))
def sim_pearson(oceni, p1, p2):
    """Pearson correlation between two users' common ratings.

    Returns (r rounded to 3 dp, number of co-rated movies); 0 when the users
    share no rated movies or the denominator vanishes.
    """
    r1, r2 = oceni[p1], oceni[p2]
    # Movies rated by both users; only their ratings enter the correlation.
    shared = [movie for movie in r1 if movie in r2]
    n = len(shared)
    if n == 0:
        return 0
    # Sums, squared sums and cross-products of the co-ratings.
    sum1 = sum(r1[m] for m in shared)
    sum2 = sum(r2[m] for m in shared)
    sum1Sq = sum(r1[m] ** 2 for m in shared)
    sum2Sq = sum(r2[m] ** 2 for m in shared)
    pSum = sum(r1[m] * r2[m] for m in shared)
    # Pearson's r = covariance / (product of standard deviations).
    num = pSum - (sum1 * sum2 / n)
    den = math.sqrt((sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n))
    if den == 0:
        return 0
    return (round(num / den, 3), n)
if __name__ == "__main__":
    # Two user names from stdin (Python 2 input() eval()s, so the judge
    # supplies quoted strings).
    korisnik1=input()
    korisnik2=input()
    # korisnik1='Mick LaSalle'
    # korisnik2='Lisa Rose'
    # Print the (similarity, common-count) tuple from each measure.
    print sim_pearson(oceniPoKorisnici, korisnik1, korisnik2)
    print sim_distance(oceniPoKorisnici, korisnik1, korisnik2)
- ------------------------------------------------------------------------------------------------------
- Sistemi za preporaka - 3
- Табела на слични корисници Problem 3 (1 / 2)
- Да се напише функција која ќе генерира табела на слични корисници претставена како речник од речници
- (клучеви се имињата на корисниците), така што за секој пар корисници ќе чува торка од сличност
- базирана на Пеарсонова корелација, сличност базирана на Евклидово растојание, и број на заеднички
- оцени (оцени дадени за исти филмови). Вредностите да бидат заокружени на 3 децимали. За прочитани
- имиња на двајца корисници да се испечати торката која се чува во генерираната табела.
- Sample input
- 'Larry'
- 'Gene Seymour'
- Sample output
- (0.327, -0.5, 3)
- from math import sqrt
- oceniPoKorisnici={
- 'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
- 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
- 'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
- 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
- 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
- 'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
- 'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
- 'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
- 'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
- 'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
- }
# Returns a distance-based similarity measure for person1 and person2.
def sim_distance(oceni,person1,person2):
    """Euclidean-distance similarity for person1/person2.

    Returns (similarity rounded to 3 dp, number of co-rated movies), or 0
    when there are no co-rated movies.
    """
    # Movies rated by both users (dict used as a set of keys).
    si={}
    for item in oceni[person1]:
        if item in oceni[person2]:
            si[item]=1
    if len(si)==0:
        return 0
    # Sum of squared rating differences over the shared movies.
    sum_of_squares=sum([pow(oceni[person1][item]-oceni[person2][item],2)
                        for item in oceni[person1] if item in oceni[person2]])
    return (round(1.0/(1+sqrt(sum_of_squares)),3),len(si))
    # fix: removed an unreachable `return (0, 0)` that followed the return above
def sim_pearson(oceni,person1,person2):
    """Pearson correlation of the two users' common ratings.

    Returns (r rounded to 3 dp, number of common movies); 0 if there are no
    common movies or the variance term is zero.
    """
    # Movies rated by both users.
    common = [m for m in oceni[person1] if m in oceni[person2]]
    n = len(common)
    if n == 0:
        return 0
    xs = [oceni[person1][m] for m in common]
    ys = [oceni[person2][m] for m in common]
    # Sums, squared sums and cross-products of the co-ratings.
    sum1, sum2 = sum(xs), sum(ys)
    sum1Sq = sum(v * v for v in xs)
    sum2Sq = sum(v * v for v in ys)
    pSum = sum(a * b for a, b in zip(xs, ys))
    # Pearson's r = covariance / (product of standard deviations).
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n))
    if den == 0:
        return 0
    return (round(num / den, 3), n)
def TabelaNaSlicniKorisnici(oceni):
    """Build {user: {other: (dist_sim, pearson_r, n_common)}} for all user pairs.

    Values come from sim_distance / sim_pearson; pairs with no usable
    Pearson similarity (sim_pearson returns 0) are skipped.
    """
    slicnosti = {}
    for user in oceni:
        for other in oceni:
            # fix: the original tested `item != oceni` (a user name against
            # the whole ratings dict, always True), so every user was also
            # paired with itself.
            if user == other:
                continue
            slicnosti.setdefault(user, {})
            pearson = sim_pearson(oceni, user, other)
            distance = sim_distance(oceni, user, other)
            if pearson != 0:
                # Tuple order matches the expected output:
                # (distance similarity, Pearson r, number of common ratings).
                slicnosti[user][other] = (distance[0], pearson[0], pearson[1])
    return slicnosti
if __name__ == "__main__":
    # Two user names from stdin (quoted strings; Python 2 input() eval()s).
    korisnik1=input()
    korisnik2=input()
    # korisnik1='Mick LaSalle'
    # korisnik2='Lisa Rose'
    # print oceniPoKorisnici
    # Precompute the full user-similarity table, then look up the pair.
    tabela=TabelaNaSlicniKorisnici(oceniPoKorisnici)
    # print tabela
    print tabela[korisnik1][korisnik2]
- ------------------------------------------------------------------------------------------------------
- Sistemi za preporaka
- # -*- coding: utf-8 -*-
- """
- Да се напише функција која во зависност од бројот на рангирани филмови на корисникот
- ќе одбира начинот на препорачување - дали item-based или user-based. Функцијата треба да прима аргумент
- име на корисникот и бројот n од стандарден влез. Ако бројот на рангирани филмови на корисникот е помал од n
- препорачува на со item-based начин, а ако е поголем или еднаков на n да препорачува на user-based начин. На излез да
- се печати одбраниот начин (user-based или item-based), и во вториот ред да се испечати листа од препорачани филмови која
- ги содржи само имињата сортирани во растечки (азбучен) редослед.
- """
# Training data: each user's movie ratings, as user -> {movie title: rating}.
oceniPoKorisnici={
'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
}
- import math
- # Vrakja merka za slicnost bazirana na rastojanieto za person1 i person2
def sim_distance(oceni, person1, person2):
    """Euclidean-distance based similarity between two users.

    Computed as 1 / (1 + sqrt(sum of squared rating differences)) over
    the movies rated by both users; returns 0 when they share none.
    """
    zaednicki = set(oceni[person1]) & set(oceni[person2])
    if not zaednicki:
        return 0
    suma = sum((oceni[person1][film] - oceni[person2][film]) ** 2
               for film in zaednicki)
    return 1 / (1 + math.sqrt(suma))
def sim_pearson(oceni, p1, p2):
    """Pearson correlation coefficient between two users' ratings.

    Only movies rated by both users are considered.  Returns 0 when the
    users share no movies or when the denominator is zero (e.g. constant
    ratings).
    """
    zaednicki = [film for film in oceni[p1] if film in oceni[p2]]
    n = len(zaednicki)
    if n == 0:
        return 0
    oceni1 = [oceni[p1][film] for film in zaednicki]
    oceni2 = [oceni[p2][film] for film in zaednicki]
    sum1, sum2 = sum(oceni1), sum(oceni2)
    sum1Sq = sum(o * o for o in oceni1)
    sum2Sq = sum(o * o for o in oceni2)
    pSum = sum(o1 * o2 for o1, o2 in zip(oceni1, oceni2))
    # Pearson r = covariance / (product of standard deviations)
    num = pSum - (sum1 * sum2 / n)
    den = math.sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    return num / den
def topMatches(oceni, person, n=5, similarity=sim_pearson):
    """Return the n most similar people to `person` as (score, name) pairs.

    Best match comes first; n=None returns the full sorted list.
    """
    scores = [(similarity(oceni, person, other), other)
              for other in oceni if other != person]
    # Descending order; names are unique so tie order is deterministic.
    scores.sort(reverse=True)
    return scores if n is None else scores[:n]
def getUserBasedRecomendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
    """User-based recommendations for `person`.

    Ratings of similar users are combined into a similarity-weighted
    average (rounded to 1 decimal) for every movie `person` has not
    rated.  The names of the top 3 movies are returned alphabetically.
    `min_zaednicki`, when set, skips users sharing fewer movies.
    """
    totals = {}
    simSums = {}
    for other in oceni:
        if other == person:
            continue  # never compare the user with himself
        zaednicki = set(oceni[person]) & set(oceni[other])
        # optionally require a minimum number of commonly rated movies
        if min_zaednicki and len(zaednicki) < min_zaednicki:
            continue
        sim = similarity(oceni, person, other)
        if sim <= 0:
            continue  # non-positive similarities are ignored
        for film in oceni[other]:
            if film in oceni[person]:
                continue  # only movies the user has not already seen
            totals[film] = totals.get(film, 0) + oceni[other][film] * sim
            simSums[film] = simSums.get(film, 0) + sim
    # normalised score per candidate movie, rounded to one decimal
    rankings = [(round(weighted / simSums[film], 1), film)
                for film, weighted in totals.items()]
    rankings.sort(reverse=True)
    najdobri = [film for _, film in rankings][0:3]
    najdobri.sort()
    return najdobri
def transformoceni(oceni):
    """Invert a person->movie ratings dict into movie->person form."""
    result = {}
    for person, ratings in oceni.items():
        for film, ocena in ratings.items():
            # swap the roles of person and movie
            result.setdefault(film, {})[person] = ocena
    return result
def getItemBasedRecomendations(critics, person1, n=3):
    """Item-based recommendations for `person1`.

    Every unseen movie is scored by the average of similarity * rating
    over the user's rated movies; the 3 best movie names are returned in
    alphabetical order.  (`n` is kept for interface compatibility; the
    result is always cut to 3 names, as in the original.)

    Fix: numpy was imported inside the function only to average short
    Python lists — replaced with sum()/len(); dead commented-out debug
    code removed.
    """
    oceni_po_film = transformoceni(critics)
    weights_per_item = {}
    for item, my_rating in critics[person1].items():
        for similarity, item2 in topMatches(oceni_po_film, item, n=None):
            # skip movies already seen and non-positive similarities
            if item2 in critics[person1] or similarity <= 0:
                continue
            weights_per_item.setdefault(item2, []).append(similarity * my_rating)
    # average weight per candidate movie
    averaged = [(sum(ws) / len(ws), film) for film, ws in weights_per_item.items()]
    averaged.sort(reverse=True)
    novi = [film for _, film in averaged][0:3]
    novi.sort()
    return novi
- if __name__ == '__main__':
- k = input()
- n = input()
- long = len(oceniPoKorisnici[k].keys())
- if long < n:
- print 'item-based\n', getItemBasedRecomendations(oceniPoKorisnici,k)
- else:
- print 'user-based\n', getUserBasedRecomendations(oceniPoKorisnici, k)
- ------------------------------------------------------------------------------------------------------
- Sistemi za preporaka
- """
- По изработка на задачите од претходната вежба веќе ќе имате две тренинг множества претставени во Python како речник од речници. Искористете ги за изработка на систем за препораки така што да може на секој од тест корисниците да им предложи по 3 филмови, еднаш користејќи item-based, а еднаш user-based препораки. При item-based пристапот се предлагаат фимови кои ги нема гледано корисникот кои се со позитивна сличност со некои од филмовите кои ги има гледано. На излез треба да се печатат две листи кои ги содржат само имињата на предложените филмови во растечки (азбучен) редослед. Првата листа е според user-based, а втората според item-based пристап.
- """
# Training data: each critic's movie ratings, as user -> {movie title: rating}.
critics={
'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
}
- from math import sqrt
def sim_distance(oceni, person1, person2):
    """Euclidean-distance similarity between person1 and person2.

    Returns 1/(1+sqrt(sum of squared differences)) over the commonly
    rated movies, or 0 when there are none.

    Fix: a second loop re-added every shared movie to `zaednicki` even
    though the set intersection already contained them — redundant dead
    work, removed along with commented-out debug prints.
    """
    filmovi1 = set(oceni[person1].keys())
    filmovi2 = set(oceni[person2].keys())
    zaednicki = filmovi1.intersection(filmovi2)
    if len(zaednicki) == 0:
        return 0
    suma = 0.0
    for film in zaednicki:
        suma += (oceni[person1][film] - oceni[person2][film]) ** 2
    return 1 / (1 + sqrt(suma))
def sim_pearson(oceni, p1, p2):
    """Pearson correlation between p1's and p2's ratings.

    Considers only movies rated by both; returns 0 for no overlap or a
    zero denominator.
    """
    zaednicki = set(oceni[p1]) & set(oceni[p2])
    n = len(zaednicki)
    if n == 0:
        return 0
    sum1 = sum2 = sum1Sq = sum2Sq = pSum = 0
    for film in zaednicki:
        o1 = oceni[p1][film]
        o2 = oceni[p2][film]
        sum1 += o1
        sum2 += o2
        sum1Sq += o1 ** 2
        sum2Sq += o2 ** 2
        pSum += o1 * o2
    # correlation = covariance / product of standard deviations
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    return num / den
def transformPrefs(oceni):
    """Invert a person->movie ratings dict into movie->person form."""
    result = {}
    for person, ratings in oceni.items():
        for film, ocena in ratings.items():
            # person and movie swap roles in the inverted dict
            result.setdefault(film, {})[person] = ocena
    return result
def topMatches(oceni, person, n=5, similarity=sim_pearson):
    """n most similar people to `person` as (score, name), best first.

    n=None returns the whole sorted list.
    """
    scores = []
    for other in oceni:
        if other == person:
            continue
        scores.append((similarity(oceni, person, other), other))
    # descending; names are unique so ordering is fully determined
    scores.sort(reverse=True)
    if n is None:
        return scores
    return scores[0:n]
def getRecommendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
    """User-based recommendations: top 3 unseen movie names, alphabetical.

    Scores are similarity-weighted averages of other users' ratings for
    movies `person` has not rated (or rated 0).

    Fix: the self-comparison guard now runs before the shared-movie
    bookkeeping, so the min_zaednicki skip message can no longer be
    printed for the user himself.
    """
    totals = {}
    simSums = {}
    for person2 in oceni.keys():
        if person2 == person:
            continue  # never compare the user with himself
        zaednicki = set(oceni[person].keys()) & set(oceni[person2].keys())
        # optionally require a minimum number of commonly rated movies
        if min_zaednicki and len(zaednicki) < min_zaednicki:
            print('So korisnikot', person2, 'imame samo', len(zaednicki), 'filmovi, pa go preskoknuvame')
            continue
        sim = similarity(oceni, person, person2)
        if sim <= 0:
            continue  # ignore non-positive similarities
        for item in oceni[person2]:
            # only movies the user has not rated (or rated 0)
            if item not in oceni[person] or oceni[person][item] == 0:
                totals.setdefault(item, 0)
                totals[item] += oceni[person2][item] * sim
                simSums.setdefault(item, 0)
                simSums[item] += sim
    # normalised score per candidate movie
    rankings = [(total / simSums[item], item) for item, total in totals.items()]
    rankings.sort(reverse=True)
    a = [item for _, item in rankings][:3]
    a.sort()
    return a
def getItemBasedRecomendations(oceni, korisnik, similarity=sim_pearson):
    """Item-based recommendations: 3 unseen movie names, alphabetical.

    Each unseen movie gets the best similarity it has to any movie the
    user has watched; movies with positive similarity are ranked.

    Fixes: dict.has_key() and list.sort() on dict.items() are
    Python-2-only — replaced with `in` and sorted(), identical behavior.

    NOTE(review): watched movies come from the module-level `critics`
    while similarities come from `oceni` (movie-keyed) — presumably
    callers pass transformPrefs(critics) as `oceni`; confirm.
    """
    filmovi = oceni.keys()
    gledani = [f for f in filmovi if f in critics[korisnik]]
    negledani = [f for f in filmovi if f not in critics[korisnik]]
    slicnosti = {}
    for film in negledani:
        for drug in gledani:
            sim = similarity(oceni, film, drug)
            slicnosti.setdefault(film, 0)
            # keep the best similarity seen for this unseen movie
            if slicnosti[film] < sim:
                slicnosti[film] = sim
    stvari = sorted(slicnosti.items(), key=lambda tup: tup[1], reverse=True)
    novi = [f for f, s in stvari if s > 0][0:3]
    novi.sort()
    return novi
def item_based(critics, person1, n=3):
    """Top-n unseen movie names (alphabetical) via item-based weights.

    Every unseen movie is scored by the average of similarity * rating
    over the user's watched movies.

    Fix: numpy was imported inside the function only for np.mean over
    short Python lists — replaced with sum()/len(); dead commented-out
    code removed.
    """
    oceni_po_film = transformPrefs(critics)
    similarity_per_item = {}
    for item in critics[person1].keys():
        my_rating = critics[person1][item]
        for similarity, item2 in topMatches(oceni_po_film, item, n=None):
            # skip already-watched movies and non-positive similarities
            if item2 in critics[person1] or similarity <= 0:
                continue
            similarity_per_item.setdefault(item2, []).append(similarity * my_rating)
    similarity_per_item_avg = [(sum(w) / len(w), film)
                               for film, w in similarity_per_item.items()]
    similarity_per_item_avg.sort(reverse=True)
    a = [film for _, film in similarity_per_item_avg][:n]
    a.sort()
    return a
if __name__ == "__main__":
    # Read the user name (Python 2 input() evaluates the text, so the
    # name must be typed as a quoted string).
    korisnik = input()
    # Print both recommendation flavours for the same user.
    print "user-based: " + str(getRecommendations(critics, korisnik))
    print "item-based: " + str(item_based(critics=critics, person1=korisnik))
- ------------------------------------------------------------------------------------------------------
- # -*- coding:utf-8 -*-
- """
- Да испрограмира функција за косинусна сличност која е дефинирана со следнава формула, каде A е листа со оцените на едниот корисник или филм, а B е листа со оцените на другиот корисник или филм:
- enter image description here
- Притоа треба да се избегне делење со нула и во тој случај да се смета дека сличноста е -1.
- Речник со оцени на корисници по филмови треба е веќе даден. Од стандардниот влез се вчитува име на еден филм. Да се испечати сличноста на прочитаниот филм со секој друг филм (освен самиот со себе) така што ќе се печати:
- Филм 2
- Косинусна сличност, Пирсонова сличност, Евклидова сличност
- Празна линија
- При печатењето филмовите треба да бидат подредени по азбучен редослед. Сите сличности треба да бидат заокружени на 2 децимали.
- """
- from math import sqrt
# Rating data: user -> {movie title: rating}; inverted to movie->user below.
oceniPoKorisnici={
'Lisa Rose': {'Catch Me If You Can': 3.0 , 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,'You, Me and Dupree': 3.5},
'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5,'Superman Returns': 3.5, 'The Night Listener': 4.0, 'Snitch': 2.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'You, Me and Dupree': 2.0},
'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
'Toby': {'Snakes on a Plane':4.5, 'Snitch': 5.0},
'Michelle Nichols': {'Just My Luck' : 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5, 'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5, 'You, Me and Dupree': 2.0},
'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
}
def sim_cos(oceni, p1, p2):
    """Cosine similarity between p1 and p2 over their common movies.

    Returns 0 with no common movies, -1 when the denominator would be
    zero (to avoid division by zero), otherwise the cosine rounded to
    2 decimals.
    """
    zaednicki = [film for film in oceni[p1] if film in oceni[p2]]
    if not zaednicki:
        return 0
    a = [oceni[p1][film] for film in zaednicki]
    b = [oceni[p2][film] for film in zaednicki]
    # cos = A.B / (|A| * |B|)
    num = sum(x * y for x, y in zip(a, b))
    den = sqrt(sum(x * x for x in a)) * sqrt(sum(y * y for y in b))
    if den == 0:
        return -1
    return round(num / den, 2)
def sim_distance(oceni, person1, person2):
    """Euclidean-distance similarity, rounded to 2 decimals.

    Returns 0 when the two people share no rated movies.
    """
    zaednicki = set(oceni[person1]) & set(oceni[person2])
    if not zaednicki:
        return 0
    suma = sum((oceni[person1][film] - oceni[person2][film]) ** 2
               for film in zaednicki)
    return round(1 / (1 + sqrt(suma)), 2)
def sim_pearson(oceni, p1, p2):
    """Pearson correlation over common movies, rounded to 2 decimals.

    Returns 0 when there is no overlap or the denominator is zero.
    """
    zaednicki = set(oceni[p1]) & set(oceni[p2])
    n = len(zaednicki)
    if n == 0:
        return 0
    a = [oceni[p1][film] for film in zaednicki]
    b = [oceni[p2][film] for film in zaednicki]
    sum1, sum2 = sum(a), sum(b)
    sum1Sq = sum(x * x for x in a)
    sum2Sq = sum(y * y for y in b)
    pSum = sum(x * y for x, y in zip(a, b))
    # covariance over the product of standard deviations
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    return round(num / den, 2)
def topMatches(oceni, person, n=5, similarity=sim_pearson):
    """n most similar people to `person` as (score, name), best first.

    n=None returns the full sorted list.
    """
    scores = [(similarity(oceni, person, other), other)
              for other in oceni if other != person]
    scores.sort(reverse=True)
    return scores if n is None else scores[0:n]
def transformoceni(oceni):
    """Invert a person->movie ratings dict into movie->person form."""
    result = {}
    for person, ratings in oceni.items():
        for film, ocena in ratings.items():
            # person and movie swap roles in the inverted dict
            result.setdefault(film, {})[person] = ocena
    return result
def getRecommendations(oceni, person, similarity=sim_pearson, min_zaednicki=None):
    """User-based scores as a (score, movie) list, best first.

    Scores are similarity-weighted averages (rounded to 1 decimal) of
    other users' ratings for movies `person` has not rated.

    Fix: leftover debug print() calls (per-pair similarities, per-item
    weights, blank lines and the min_zaednicki skip message) polluted
    stdout — the task's required output is only the similarity lines, so
    all diagnostic printing has been removed.
    """
    totals = {}
    simSums = {}
    for person2 in oceni:
        if person2 == person:
            continue  # never compare the user with himself
        zaednicki = set(oceni[person]) & set(oceni[person2])
        # optionally require a minimum number of commonly rated movies
        if min_zaednicki and len(zaednicki) < min_zaednicki:
            continue
        sim = similarity(oceni, person, person2)
        if sim <= 0:
            continue  # ignore non-positive similarities
        for item in oceni[person2]:
            # only movies the user has not already seen
            if item not in oceni[person]:
                totals.setdefault(item, 0)
                totals[item] += oceni[person2][item] * sim
                simSums.setdefault(item, 0)
                simSums[item] += sim
    # normalised, rounded score per candidate movie
    rankings = []
    for item, weighted_score in totals.items():
        rankings.append((round(weighted_score / simSums[item], 1), item))
    rankings.sort(reverse=True)
    return rankings
def item_based(critics, person1, n=3):
    """Top-n unseen movies as (average weight, movie) pairs, best first.

    Each unseen movie is scored by the average of similarity * rating
    over the user's watched movies.

    Fixes: numpy was imported inside the function just for np.mean over
    short Python lists (replaced with sum()/len()), and a debug print
    leaked to stdout (removed).
    """
    oceni_po_film = transformoceni(critics)
    similarity_per_item = {}
    for item in critics[person1].keys():
        my_rating = critics[person1][item]
        for similarity, item2 in topMatches(oceni_po_film, item, n=None):
            # skip already-watched movies and non-positive similarities
            if item2 in critics[person1] or similarity <= 0:
                continue
            similarity_per_item.setdefault(item2, []).append(similarity * my_rating)
    similarity_per_item_avg = [(sum(w) / len(w), film)
                               for film, w in similarity_per_item.items()]
    similarity_per_item_avg.sort(reverse=True)
    return similarity_per_item_avg[:n]
# NOTE(review): this re-defines transformoceni identically to the version
# defined earlier in this section; Python keeps the later definition, so
# behavior is unchanged, but one of the two copies should be deleted.
def transformoceni(oceni):
    """Invert a person->movie ratings dict into movie->person form."""
    result = {}
    for person in oceni.keys():
        for item in oceni[person]:
            result.setdefault(item, {})
            # Swap the roles of the person and the movie
            result[item][person] = oceni[person][item]
    return result
- if __name__ == '__main__':
- film = 'Catch Me If You Can'
- # film = input()
- movie_base=transformoceni(oceniPoKorisnici)
- for k in sorted(movie_base.keys()):
- if film == k:
- continue
- else:
- print(k)
- print sim_cos(movie_base,film,k), sim_pearson(movie_base,film,k), sim_distance(movie_base,film,k)
- print
- ------------------------------------------------------------------------------------------------------
- Sistemi za preporaka januari 2017
- За корисникот внесен на влез да се препорача филм. Да се користи Пирсонов коефициент на корелација како мерка.
- Ако корисникот го нема во базата да се препорача најгледаниот филм. Доколку корисникот има гледано повеќе од 5 филмови
- , да се препорача според филмовите, во спротивно да се препорача според корисниците кои се слични со него.
- from __future__ import print_function
- import json
- from math import sqrt
- # A dictionary of movie critics and their ratings of a small set of movies
# Structure: user -> {movie title: rating on a 1.0-5.0 scale}.
critics = {
    'Lisa Rose': {'Catch Me If You Can': 3.0, 'Snakes on a Plane': 3.5, 'Superman Returns': 3.5,
                  'You, Me and Dupree': 2.5, 'The Night Listener': 3.0, 'Snitch': 3.0},
    'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'The Night Listener': 3.0,
                     'You, Me and Dupree': 3.5},
    'Michael Phillips': {'Catch Me If You Can': 2.5, 'Lady in the Water': 2.5, 'Superman Returns': 3.5,
                         'The Night Listener': 4.0, 'Snitch': 2.0},
    'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 'The Night Listener': 4.5, 'Superman Returns': 4.0,
                     'You, Me and Dupree': 2.5},
    'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 'Just My Luck': 2.0, 'Superman Returns': 3.0,
                     'You, Me and Dupree': 2.0},
    'Jack Matthews': {'Catch Me If You Can': 4.5, 'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                      'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5, 'Snitch': 4.5},
    'Toby': {'Snakes on a Plane': 4.5, 'Snitch': 5.0},
    'Michelle Nichols': {'Just My Luck': 1.0, 'The Night Listener': 4.5, 'You, Me and Dupree': 3.5,
                         'Catch Me If You Can': 2.5, 'Snakes on a Plane': 3.0},
    'Gary Coleman': {'Lady in the Water': 1.0, 'Catch Me If You Can': 1.5, 'Superman Returns': 1.5,
                     'You, Me and Dupree': 2.0},
    'Larry': {'Lady in the Water': 3.0, 'Just My Luck': 3.5, 'Snitch': 1.5, 'The Night Listener': 3.5}
}
def sim_distance(oceni, person1, person2):
    """Euclidean-distance similarity; 0 when no commonly rated movies."""
    zaednicki = [item for item in oceni[person1] if item in oceni[person2]]
    if not zaednicki:
        return 0
    sum_of_squares = sum(pow(oceni[person1][item] - oceni[person2][item], 2)
                         for item in zaednicki)
    return 1 / (1 + sqrt(sum_of_squares))
def sim_pearson(oceni, person1, person2):
    """Pearson correlation over the movies both persons have rated.

    Returns 0 on empty overlap or a zero denominator.
    """
    si = [it for it in oceni[person1] if it in oceni[person2]]
    n = len(si)
    if n == 0:
        return 0
    a = [oceni[person1][it] for it in si]
    b = [oceni[person2][it] for it in si]
    sum1, sum2 = sum(a), sum(b)
    sum1Sq = sum(x ** 2 for x in a)
    sum2Sq = sum(y ** 2 for y in b)
    pSum = sum(x * y for x, y in zip(a, b))
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    return num / den
def transformPrefs(prefs):
    """Invert a person->item ratings dict into item->person form."""
    result = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            result.setdefault(item, {})[person] = rating
    return result
def topMatches(prefs, person, n=4, similarity=sim_pearson):
    """Return the n best (score, person) matches for `person`, best first.

    Note the default here is n=4 (not 5 as elsewhere in this file).
    """
    scores = []
    for other in prefs:
        if other != person:
            scores.append((similarity(prefs, person, other), other))
    # descending order; names are unique so ordering is deterministic
    scores.sort(reverse=True)
    return scores[0:n]
def getUserBasedRecommendations(oceni, korisnik, similarity=sim_pearson):
    """Top 3 (score, movie) pairs for movies `korisnik` has not rated.

    Scores are similarity-weighted averages of other users' ratings;
    users with non-positive similarity are ignored.
    """
    totals = {}
    simSums = {}
    for other in oceni:
        if other == korisnik:
            continue  # skip the user himself
        sim = similarity(oceni, korisnik, other)
        if sim <= 0:
            continue
        for item, ocena in oceni[other].items():
            # only movies the user has not rated (or rated 0)
            if item not in oceni[korisnik] or oceni[korisnik][item] == 0:
                totals[item] = totals.get(item, 0) + ocena * sim
                simSums[item] = simSums.get(item, 0) + sim
    rankings = [(total / simSums[item], item) for item, total in totals.items()]
    rankings.sort(reverse=True)
    return rankings[0:3]
def getItemBasedRecomendations(oceni, korisnik, similarity=sim_pearson):
    """Top 3 unseen movie names, alphabetical.

    Each unseen movie keeps the best similarity it achieved against any
    of the user's watched movies; the three best are returned sorted.
    """
    films = transformPrefs(oceni)
    similar = {}
    for gledan_film in oceni[korisnik]:
        for slicnost, slicen_film in topMatches(films, gledan_film):
            if slicen_film in oceni[korisnik]:
                continue  # already watched
            # remember only the best similarity per unseen movie
            if slicen_film not in similar or slicnost > similar[slicen_film]:
                similar[slicen_film] = slicnost
    # same sort sequence as before to keep tie ordering identical
    rankings = sorted(similar, key=similar.get)
    rankings.reverse()
    rankings = rankings[0:3]
    rankings.sort()
    return rankings
def transformoceni(oceni):
    """Invert a person->movie ratings dict into movie->person form."""
    result = {}
    for person, ratings in oceni.items():
        for item, ocena in ratings.items():
            result.setdefault(item, {})[person] = ocena
    return result
def item_based(critics, person1, n=3):
    """Top-n unseen movies as (average weight, movie) pairs, best first.

    Every unseen movie is scored by the average of similarity * rating
    over the user's watched movies.

    Fixes: the inner loop reused the name `item` for both the watched
    movie and the candidate movie (shadowing — the other copies of this
    function in the file use `item2`); numpy was imported inside just
    for np.mean on short lists, replaced with sum()/len().
    """
    oceni_po_film = transformoceni(critics)
    similarity_per_item = {}
    for item in critics[person1].keys():
        my_rating = critics[person1][item]
        for similarity, item2 in topMatches(oceni_po_film, item, n=None):
            # skip already-watched movies and non-positive similarities
            if item2 in critics[person1] or similarity <= 0:
                continue
            similarity_per_item.setdefault(item2, [])
            similarity_per_item[item2].append(similarity * my_rating)
    similarity_per_item_avg = [(sum(w) / len(w), film)
                               for film, w in similarity_per_item.items()]
    similarity_per_item_avg.sort(reverse=True)
    return similarity_per_item_avg[:n]
if __name__ == "__main__":
    # User name from stdin (Python 2 input() evaluates, so quote it).
    korisnik = input()
    # Most-watched movie = the movie rated by the largest number of users.
    inverse = transformoceni(critics)
    korisniciIFilmovi = sorted(inverse.items(),
                               key=lambda tup: len(tup[1]), reverse=True)
    najgledan = korisniciIFilmovi[0][0]
    if korisnik not in critics:
        # Unknown user: recommend the most-watched movie.
        print(najgledan)
        exit()
    # Fix: the original set imaPoveke via a nested loop that recounted the
    # user's own movies once per other critic — the condition is simply
    # "the user rated more than 5 movies".  Also, .items() was sorted with
    # the Python-2-only list.sort on a dict view; sorted() is equivalent.
    if len(critics[korisnik]) > 5:
        print(str(getItemBasedRecomendations(critics, korisnik)[0]))
    else:
        print(str(getUserBasedRecommendations(critics, korisnik)[0][1]))
- ------------------------------------------------------------------------------------------------------
Advertisement
Add Comment
Please, Sign In to add comment